From 4720e5cae556c12057f92ba98624fa74a2258796 Mon Sep 17 00:00:00 2001
From: Charlotte Soneson <charlottesoneson@gmail.com>
Date: Mon, 11 Apr 2022 20:18:38 +0200
Subject: [PATCH 1/2] Allow collapsing to AA in linkMultipleVariants()

---
 R/linkMultipleVariants.R                      |  42 +++-
 .../extdata/multipleVariableRegions_truth.rds | Bin 1385 -> 1661 bytes
 ...nerateExampleDataForLinkMultipleVariants.R |  43 ++++
 man/linkMultipleVariants.Rd                   |   9 +-
 tests/testthat/test_linkMultipleVariants.R    | 187 +++++++++++++++++-
 5 files changed, 274 insertions(+), 7 deletions(-)

diff --git a/R/linkMultipleVariants.R b/R/linkMultipleVariants.R
index 9f303fb..2aeb503 100644
--- a/R/linkMultipleVariants.R
+++ b/R/linkMultipleVariants.R
@@ -62,6 +62,11 @@
 #' @param ... Additional arguments providing arguments to \code{digestFastqs}
 #'     for the separate runs (processing each variable sequence in turn).
 #'     Each argument must be a named list of arguments to \code{digestFastqs}.
+#' @param collapseToAA Either \code{TRUE}, \code{FALSE}, or a character vector
+#'     indicating for which of the separate runs the sequences should be 
+#'     collapsed to the amino acid mutant name rather than the codon- or 
+#'     nucleotide-level name. \code{TRUE} is equivalent to providing the 
+#'     names of all lists provided in \code{...}. 
 #'
 #' @author Charlotte Soneson
 #'
@@ -75,13 +80,15 @@
 #' \item convSeparate - a list of conversion tables from the respective 
 #'     separate runs.
 #' \item outCombined - the \code{digestFastqs} output for the combined run.
+#' \item filtSeparate - a list of filtering tables for the separate runs.
 #' }
 #' 
 #' @importFrom dplyr select rename group_by summarize across matches mutate
 #' @importFrom tidyr separate separate_rows
 #' @importFrom rlang .data
 #'
-linkMultipleVariants <- function(combinedDigestParams = list(), ...) {
+linkMultipleVariants <- function(combinedDigestParams = list(), ..., 
+                                 collapseToAA = FALSE) {
 
     ## Process additional arguments
     paramsSeparate <- list(...)
@@ -111,6 +118,24 @@ linkMultipleVariants <- function(combinedDigestParams = list(), ...) {
                               as.list(defaults[!(names(defaults) %in%
                                                      names(combinedDigestParams))]))
 
+    ## Make sure collapseToAA is a vector containing the names of the 
+    ## runs where sequences should be collapsed to AAs
+    if (length(collapseToAA) == 1) {
+        if (is.logical(collapseToAA)) {
+            if (collapseToAA) {
+                collapseToAA <- names(paramsSeparate)
+            } else {
+                collapseToAA <- character(0)
+            }
+        } else {
+            .assertVector(x = collapseToAA, type = "character",
+                          validValues = names(paramsSeparate))
+        }
+    } else {
+        .assertVector(x = collapseToAA, type = "character",
+                      validValues = names(paramsSeparate))
+    }
+    
     ## --------------------------------------------------------------------- ##
     ## Checks
     ## --------------------------------------------------------------------- ##
@@ -227,7 +252,8 @@ linkMultipleVariants <- function(combinedDigestParams = list(), ...) {
     ## Conversion tables
     convSeparate <- lapply(outSeparate, function(out) {
         out$summaryTable %>%
-            dplyr::select(.data$mutantName, .data$sequence) %>%
+            dplyr::select(.data$mutantName, .data$mutantNameAA,
+                          .data$sequence) %>%
             tidyr::separate_rows(.data$sequence, sep = ",")
     })
 
@@ -235,9 +261,15 @@ linkMultipleVariants <- function(combinedDigestParams = list(), ...) {
     ## Replace naive sequences with corrected ones
     ## --------------------------------------------------------------------- ##
     for (i in names(paramsSeparate)) {
-        countCombined[[i]] <- 
-            convSeparate[[i]]$mutantName[match(countCombined[[i]],
-                                               convSeparate[[i]]$sequence)]
+        if (i %in% collapseToAA) {
+            countCombined[[i]] <- 
+                convSeparate[[i]]$mutantNameAA[match(countCombined[[i]],
+                                                     convSeparate[[i]]$sequence)]
+        } else {
+            countCombined[[i]] <- 
+                convSeparate[[i]]$mutantName[match(countCombined[[i]],
+                                                   convSeparate[[i]]$sequence)]
+        }
     }
 
     ## Filter out rows with NAs (the sequence was not retained)
diff --git a/inst/extdata/multipleVariableRegions_truth.rds b/inst/extdata/multipleVariableRegions_truth.rds
index fa1243cf3edc1771a16a1da7e070b33cdc0f80bd..a86cab19fc1feccbf8c2c5f7732f71b7968c1914 100644
GIT binary patch
literal 1661
zcmZwHeLNEg7zgmgm83|aG%2I&vqd>$%{n!eniR^CjY)*<(vI4iSQ1(+ZmDsSaT@Zn
z^<tKn);i`bY4aA%+%UwA*1S(_SNG?2&-e5E@%#M#d;WN=X!<Mu9o@+s3_*T)Q&Z<o
ze~G<Mts8m=eQpb?AvHXX0=Z?uH9qWsB^#bFHmb#xAg>0gx5u4%bB7QwT3`x^7-Bah
zoFlmW?qT*bc!j`f-KxaFb6hCEu*nl}Cw^Ka;H^qr7Lji<R7%}BSa4p$Imo_H*%xy>
z=&o-;39B%J>ERB_E(tBnusmMBmT2IUd~Exv+ZKl{3v}9(A`p6MH{1^b^EZg~BlI9E
zEp)!cNBKVDT@jlg1{Oc0-E>EMi;(|uMf^QO@%KE%-`g$z-bL|$O(Z#=uZRV)ZS&wv
z#lkDOnxlx}C@Kg|7qlJGO+@l7dF%u=i|kr947Gov5D=~h#%3yAUY2}>1)u1-{Cihb
z9A>$8^lE>6k6cDvx)K=_7?Z1g5h;5SzYQ)REc0?@l&0_=1*S)FTDHWQ?ISJK*ekD5
zKJ^`-C}i`~D49mNsD7+m7{hB#WQW%4P9ac5Z?+z|hZr^lg+DfEH$B?yU5sl)^a|<V
zv!~qCOYv@L0H<O<dteXl+fv*Uiry|hu-F=EZ2fzXjD_vIX8t}$kZ=#0e{pa0Y-)GU
zm`x)oxYfR~FyE+v;Z+wQNxjQYu=rt1Daa?qZns&vd*Il{vO?_JGid(6#bKYyungxd
z{@oL#T)4g?-mG(7*fYuN!fHx_C&p#>JLCD1a2?b@rPs<oOubAFwd{1<;O7mi@Iw|p
z4DoehJo6P=yO6!mG$!eOIr0940eXB_ySeZ4GkQM!QZPR9lVM8ikLec&Trf{>_eM%j
zf)f1=0g|28xSaMU>5rTTk9&ZpnI7ym^Gs3O%^MS!)}zwBY6tqQKU?9V6<6sI)Cxv6
zSI&ROQg3GRB#vuh7u1~Av>5X}+wT_ehImt0qI*|y;Zj#At+?m6IU{d2oD%TFikQ&$
zaY9YPZ_?uxEm<G@VCJ|H^w{G>R(S(1q|*9O`4tDrk$p!fskgXs^T#xxX-byL+)j8{
z9bDUzc{<>NW{D?(@vNQqChzDQ;wOU4M?`B#c>IiT=4cGcj$JcZpJI!Mf6Hn<xr>bg
z5l<lk`JN49lV#68LrynuD>yE-6nyZ#@5&9G!9rUl(8!3q&A{Zr8T1Su*?;~=Z4FTI
zVAUmTNQt+p)E9jgW(x*JHPxd_316TcM@^?JU*fqg&QVNG;H0LPP0-{5z>D+B-X~R3
z9NbkyahW51t@`2DWs9>E_TUa|?E_-;<QrV`{M8y)vrM?DCcFR7g|=<OmEdAr?8k8l
zs#fg~TSMw*XpH2N?qbg3r!b0Hs0iPrBcg5G=tl#GuIB~mwX}h;bl8w933nD#L!Irg
zjUtgBQ_S>>e^JTY>;{GCmt5O`7Ks-2<y|i9>kpYpl@qvj8I3yryJe%#<ut1*x;6RO
z{xSQ%=Z?^&r7mle5-<H>HSQZ_Vgu}4c)C^S8B#?Aq`TcM3GwRyCIYA@+0kfTRAoGk
zC1!Q#PIQ>Dn(;#Xcs0?V%8k&cCb*Q5>RWr;TNp)-J0wXe2Epk>-3B_>6b*2QJB!Vi
zAnDR@lV9Stct!m6Q}uvN8&qA;X~WEQTd-_;b)U1}v|71Op{qAPT}7=MBcD0{J}Rtm
z7a#=JB`kYbH{ASIR@HfBZyE5^tI94&#%1U|VZq;E<Oj9Q5y?5Oh{b0uEHCv(o9>aX
zjv5`i2J_0C`KVCjj6kOqtz`{AshmXyx9J`l%pk(oId3iQLD6p<O#qV5em_0i=J8%J
zH5tbkV}x$A8&y<K^*EjOFTGBt%=#cbvwuCV4HjG?5VRM9w9jmmVVnJ~k0fCLec>Od
z%8EF_^!`24#AO@jAk_ijQ}hIaiBi>1iY~|4GCy~`B3sHv4%jQRl(g~<G9X1n>yp-{
ztwTaKt5k2Azh3!HV@gbrgF#=8KQ*B9_G^Yw<bcv*3EG!MNy*~MjE1nu(mfBwfV}vs
zDD1F_J<`d`yDpFU^U<L@_k2Pg@ilOV(X_l(YqC0xt6yFdp!4Db^v7u<t2sm3(d1uO
J*1N7)@ehw5Wb^<4

literal 1385
zcmZX~dpr{e00(d-VslMGo(&m0G}6wI#FDkVvO4Bbv{b~+K4eL2T+fBfbC{daqVvki
zyoR+duaHLbsP;L+nAeb$$84_a&-45K_51vO;1m^wA5oO#2jPWwnjKmp)usg*nu^I-
z#r5W${AZ|hV25o3EeaqNdjczX4hzC5zW0_kf~xtA4}UY2xw+?936nET+%&+3!3ktC
zxwuW>3{z{F(!Z0%=mP`#bmVVGWOKU61;;g`E|Iuwm8-!6hMuRV@P}(@Crz`$%>rl_
z!!ol@Mw!LaMCPYAKhlrEjWr%<GL`!7Pr1Yz-%v;FFWTL!621ezU2j|JA4r$~Dl}XZ
z4$xCdy^TaXE<*pPK=@Yr@U3>jx7rKe3IyK@4F5mplAl^<7!kzI6)QCzv6Tp+ejo9&
z%^cXutE4M=TIhh<(yUZDrLA!=)+{eF;T-W1DDQW@`&L!DcRf*#x!{OI;YOVlPm~KO
zg>@YmOt5&<RTw8JWEG6mlE_sHRpV@l?5RXn7MURYW_UOems`(u{=Qd97S^S{Lvp-S
ziXQshH@H^vf=oXlVmr{hhriHoPXWp;->5z}_eAFjJfpRLja~*Yo<M!PJj&2T+H)3A
zK!LXAJ?~j9NKSI!Q2%>Z!~@%nb+fNIgH6v8R*ClUpssy~FnE7bs^5h<oluAN=Imue
zr6gq$`=LC8OV)(|1pTG07&{>PF>E<$s26t7FOWMrH-Tnf&X~7lKM~F!jnlS_uoF1>
zJ?Gr<0E)NM>|*_yKKwGl1Mg3@c#{(9P1Cogn|Ub>nOz*VyCv1&uktcViG_BbF|k^2
zIkvbX@Wgag5tVY9gsAN+H;TXz+_<uWN73N649AAI807H{vnX@w-<8U{a2gpkopYED
zbk-sy$g-PJN2&8~AmMU$TqH)8#w6q`Eo{5BNj<tR9f-B&q>uA}aXpFIaU}Wja9ekG
zfM9L()HE!aD>}w7;6+LsUqs%daMA-TIqG%$3&Dg$s{>(La+=etjUL7&*j>p-#`kCb
z)3gsBu;K&K!V@r&@a4w!dR@!C&=%B8CK-Jmq7_r`#bU?4_UFbms)e5o@}M-UqL>tX
ze*26i-q@EJjM7>%<&GruaL-OeJw~c>$Gic!?qck5?QW0~!Fco0brhFmG5d{>Gow>c
zf6CqYV}%^*9(Gx*4^_XKIy#6w0=(pEsGM);AhNj@K@}{eR>*?Pd`B#BI7AmR)&HLx
zw3?@`fp+&~uj#%`qh4cEaK4Qn(~B13G+>h)=EC;75^cN~IuWe=sluwWXmIslKyUS>
zd_wQwpA3=?q~eS!ue&uja3AvxDK{Q$)DQx3%0_lf=U~rGUQ2%GFnHOkhh>qnzdj+$
zo84JSK{-*MMVjt?rxbxEFpPMN8q^_WC*y+Rw!%qp?(9W<XRqu#12K%KX_j3@(}eYx
zfgZ^P+Kr%c6XubU&7qaCZ~R(6BA14rA3VG=*>bZW=FCLfg%`9_@<7$Bni+A0&^oKX
zKf|EYd5kZU$#`Sq&Yz)8)8^EX>Q$9J;?h;Bxbz7BmCZxGKcjs(DBjB`BFTD&2}nz;
zw@IgpT=Tv{W*1%LJE}wC=aZ02Y9j3F`N>X_ZF>LhTl6%#XN+x60DaejIbwGhYsx2Z
z-6g%fBQ$+MV~J%S$7%Nvt}u%+Zc@b*a?1qE;Oy!rji0al!tycU8InE6)dT^1u5#BL
n4c`zMO1H8F>h?_>S<P^>wAzQk$}KEO+EiTY&~UN0RZ#dB(4UvR

diff --git a/inst/scripts/generateExampleDataForLinkMultipleVariants.R b/inst/scripts/generateExampleDataForLinkMultipleVariants.R
index a38c9f5..c33b147 100644
--- a/inst/scripts/generateExampleDataForLinkMultipleVariants.R
+++ b/inst/scripts/generateExampleDataForLinkMultipleVariants.R
@@ -187,10 +187,53 @@ WTV3 <- c(V3 = V3)
 constFwd <- "ACGTGATTGATCCCGGTAAATACCGG"
 constRev <- "TAAATACCGGACCTGA"
 
+## Get AA mutant name as well
+V2varAA <- vapply(V2var, function(w) {
+    w <- strsplit(w, "\\.")[[1]]
+    if (w[3] != "WT") {
+        pos0 <- ceiling(as.numeric(w[2]) / 3) - 1
+        wt <- strsplit(WTV2[w[1]], "")[[1]]
+        wt_codon <- paste(wt[(3 * pos0 + 1):(3 * pos0 + 3)], collapse = "")
+        wt_aa <- mutscan:::translateString(wt_codon)
+        wt[as.numeric(w[2])] <- w[3]
+        mut_codon <- paste(wt[(3 * pos0 + 1):(3 * pos0 + 3)], collapse = "")
+        mut_aa <- mutscan:::translateString(mut_codon)
+        if (mut_aa == wt_aa) {
+            paste(c(w[1], 0, "WT"), collapse = ".")
+        } else {
+            paste(c(w[1], pos0 + 1, mut_aa), collapse = ".")
+        }
+    } else {
+        paste(w, collapse = ".")
+    }
+}, "")
+V3varAA <- vapply(V3var, function(w) {
+    w <- strsplit(w, "\\.")[[1]]
+    if (w[3] != "WT") {
+        pos0 <- ceiling(as.numeric(w[2]) / 3) - 1
+        wt <- strsplit(WTV3[w[1]], "")[[1]]
+        wt_codon <- paste(wt[(3 * pos0 + 1):(3 * pos0 + 3)], collapse = "")
+        wt_aa <- mutscan:::translateString(wt_codon)
+        wt[as.numeric(w[2])] <- w[3]
+        mut_codon <- paste(wt[(3 * pos0 + 1):(3 * pos0 + 3)], collapse = "")
+        mut_aa <- mutscan:::translateString(mut_codon)
+        if (mut_aa == wt_aa) {
+            paste(c(w[1], 0, "WT"), collapse = ".")
+        } else {
+            paste(c(w[1], pos0 + 1, mut_aa), collapse = ".")
+        }
+    } else {
+        paste(w, collapse = ".")
+    }
+}, "")
+
+
 truth <- data.frame(read = paste0("R", seq_len(nReads)),
                     trueBarcode = barcode,
                     trueV2 = V2var,
                     trueV3 = V3var,
+                    trueV2aa = V2varAA,
+                    trueV3aa = V3varAA,
                     obsBarcode = v1,
                     status = status)
 
diff --git a/man/linkMultipleVariants.Rd b/man/linkMultipleVariants.Rd
index cb60207..b9a8122 100644
--- a/man/linkMultipleVariants.Rd
+++ b/man/linkMultipleVariants.Rd
@@ -4,7 +4,7 @@
 \alias{linkMultipleVariants}
 \title{Process an experiment with multiple variable sequences}
 \usage{
-linkMultipleVariants(combinedDigestParams = list(), ...)
+linkMultipleVariants(combinedDigestParams = list(), ..., collapseToAA = FALSE)
 }
 \arguments{
 \item{combinedDigestParams}{A named list of arguments to
@@ -13,6 +13,12 @@ linkMultipleVariants(combinedDigestParams = list(), ...)
 \item{...}{Additional arguments providing arguments to \code{digestFastqs}
 for the separate runs (processing each variable sequence in turn).
 Each argument must be a named list of arguments to \code{digestFastqs}.}
+
+\item{collapseToAA}{Either \code{TRUE}, \code{FALSE}, or a character vector
+indicating for which of the separate runs the sequences should be 
+collapsed to the amino acid mutant name rather than the codon- or 
+nucleotide-level name. \code{TRUE} is equivalent to providing the 
+names of all lists provided in \code{...}.}
 }
 \value{
 A list with the following elements: 
@@ -23,6 +29,7 @@ A list with the following elements:
 \item convSeparate - a list of conversion tables from the respective 
     separate runs.
 \item outCombined - the \code{digestFastqs} output for the combined run.
+\item filtSeparate - a list of filtering tables for the separate runs.
 }
 }
 \description{
diff --git a/tests/testthat/test_linkMultipleVariants.R b/tests/testthat/test_linkMultipleVariants.R
index 933db0b..f476375 100644
--- a/tests/testthat/test_linkMultipleVariants.R
+++ b/tests/testthat/test_linkMultipleVariants.R
@@ -16,6 +16,10 @@ test_that("linkMultipleVariants works", {
     expect_error(linkMultipleVariants(combinedDigestParams = list(fastqForward = 1),
                                       secondArg = list(unknown = 1)),
                  "All values in 'namesparms' must be one of")
+    expect_error(linkMultipleVariants(combinedDigestParams = list(fastqForward = 1),
+                                      secondArg = list(fastqForward = 1),
+                                      collapseToAA = 1),
+                 "'collapseToAA' must be of class 'character'")
     
     expect_error(linkMultipleVariants(combinedDigestParams = list(fastqForward = 1),
                                       secondArg = list(elementsForward = "CC")),
@@ -86,7 +90,8 @@ test_that("linkMultipleVariants works", {
             wildTypeForward = truth$WTV3,
             nbrMutatedCodonsMaxForward = -1, nbrMutatedBasesMaxForward = 1, 
             forbiddenMutatedCodonsForward = "", collapseToWTForward = FALSE,
-            verbose = verbose, useTreeWTmatch = TRUE)
+            verbose = verbose, useTreeWTmatch = TRUE),
+        collapseToAA = FALSE
     )
     
     ## Filter out only reads with N/deletions in the variable sequence
@@ -124,6 +129,186 @@ test_that("linkMultipleVariants works", {
     expect_equal(res$filtSeparate$V3[, "nbrRetained"], 
                  sum(!grepl("del", truth$truth$status)))
     
+    ## ------------------------------------------------------------------------
+    ## Don't filter by mutations in constant sequence, collapse by AA
+    res <- linkMultipleVariants(
+        combinedDigestParams = list(
+            fastqForward = system.file("extdata", "multipleVariableRegions_R1.fastq.gz",
+                                       package = "mutscan"),
+            fastqReverse = system.file("extdata", "multipleVariableRegions_R2.fastq.gz",
+                                       package = "mutscan"),
+            elementsForward = "CVCVCV", elementLengthsForward = c(6, 24, 10, 30, 10, -1),
+            elementsReverse = "CVCV", elementLengthsReverse = c(6, 40, 10, -1),
+            mergeForwardReverse = TRUE, minOverlap = 20, 
+            maxOverlap = 30, maxFracMismatchOverlap = 0,
+            revComplForward = FALSE, revComplReverse = TRUE,
+            avePhredMinForward = 20, avePhredMinReverse = 20,
+            verbose = verbose),
+        barcode = list(
+            fastqForward = system.file("extdata", "multipleVariableRegions_R1.fastq.gz",
+                                       package = "mutscan"),
+            elementsForward = "CVCSCS", elementLengthsForward = c(6, 24, 10, 30, 10, -1),
+            avePhredMinForward = 20, variableCollapseMaxDist = 1, 
+            variableCollapseMinReads = 1, variableCollapseMinRatio = 1,
+            verbose = verbose),
+        V2 = list(
+            fastqForward = system.file("extdata", "multipleVariableRegions_R1.fastq.gz",
+                                       package = "mutscan"),
+            fastqReverse = system.file("extdata", "multipleVariableRegions_R2.fastq.gz",
+                                       package = "mutscan"),
+            elementsForward = "CSCVCS", elementLengthsForward = c(6, 24, 10, 30, 10, -1),
+            elementsReverse = "CSCV", elementLengthsReverse = c(6, 40, 10, -1),
+            mergeForwardReverse = TRUE, minOverlap = 10, 
+            maxOverlap = 20, maxFracMismatchOverlap = 0,
+            revComplForward = FALSE, revComplReverse = TRUE,
+            avePhredMinForward = 20, avePhredMinReverse = 20,
+            wildTypeForward = truth$WTV2,
+            nbrMutatedCodonsMaxForward = -1, nbrMutatedBasesMaxForward = 1, 
+            forbiddenMutatedCodonsForward = "", collapseToWTForward = FALSE,
+            verbose = verbose, useTreeWTmatch = TRUE),
+        V3 = list(
+            fastqForward = system.file("extdata", "multipleVariableRegions_R1.fastq.gz",
+                                       package = "mutscan"),
+            fastqReverse = system.file("extdata", "multipleVariableRegions_R2.fastq.gz",
+                                       package = "mutscan"),
+            elementsForward = "CSCSCV", elementLengthsForward = c(6, 24, 10, 30, 10, -1),
+            elementsReverse = "CVCS", elementLengthsReverse = c(6, 40, 10, -1),
+            mergeForwardReverse = TRUE, minOverlap = 5, 
+            maxOverlap = 15, maxFracMismatchOverlap = 0,
+            revComplForward = FALSE, revComplReverse = TRUE,
+            avePhredMinForward = 20, avePhredMinReverse = 20,
+            wildTypeForward = truth$WTV3,
+            nbrMutatedCodonsMaxForward = -1, nbrMutatedBasesMaxForward = 1, 
+            forbiddenMutatedCodonsForward = "", collapseToWTForward = FALSE,
+            verbose = verbose, useTreeWTmatch = TRUE),
+        collapseToAA = c("V2", "V3")
+    )
+    
+    ## Filter out only reads with N/deletions in the variable sequence
+    keep <- !grepl("N|del", truth$truth$status)
+    
+    ## Truth
+    correct <- truth$truth[keep, ] %>%
+        dplyr::group_by(trueBarcode, trueV2aa, trueV3aa) %>% dplyr::tally() %>%
+        dplyr::arrange(dplyr::desc(n), trueBarcode, trueV2aa, trueV3aa)
+    
+    ## Obs
+    obs <- res$countAggregated %>% dplyr::arrange(desc(nbrReads), barcode, V2, V3)
+    
+    expect_equal(sum(obs$nbrReads), sum(keep))
+    expect_equal(correct$trueBarcode, obs$barcode)
+    expect_equal(correct$trueV2aa, obs$V2)
+    expect_equal(correct$trueV3aa, obs$V3)
+    expect_equal(correct$n, obs$nbrReads)
+    expect_equal(res$outCombined$filterSummary[, "f4_nbrNoValidOverlap"], 
+                 sum(grepl("del", truth$truth$status)))  ## del
+    expect_equal(res$outCombined$filterSummary[, "f6_nbrTooManyNinVar"], 
+                 sum(grepl("N", truth$truth$status)))   ## N
+    expect_equal(res$outCombined$filterSummary[, "nbrRetained"], 
+                 sum(!grepl("N|del", truth$truth$status)))
+    expect_equal(res$filtSeparate$barcode[, "f6_nbrTooManyNinVar"], 
+                 sum(grepl("V1N", truth$truth$status)))
+    expect_equal(res$filtSeparate$barcode[, "nbrRetained"], 
+                 sum(!grepl("V1N", truth$truth$status)))
+    expect_equal(res$filtSeparate$V2[, "f4_nbrNoValidOverlap"], 
+                 sum(grepl("del", truth$truth$status)))
+    expect_equal(res$filtSeparate$V2[, "nbrRetained"], 
+                 sum(!grepl("del", truth$truth$status)))
+    expect_equal(res$filtSeparate$V3[, "f4_nbrNoValidOverlap"], 
+                 sum(grepl("del", truth$truth$status)))
+    expect_equal(res$filtSeparate$V3[, "nbrRetained"], 
+                 sum(!grepl("del", truth$truth$status)))
+    
+    ## ------------------------------------------------------------------------
+    ## Don't filter by mutations in constant sequence, collapse by AA (only V2)
+    res <- linkMultipleVariants(
+        combinedDigestParams = list(
+            fastqForward = system.file("extdata", "multipleVariableRegions_R1.fastq.gz",
+                                       package = "mutscan"),
+            fastqReverse = system.file("extdata", "multipleVariableRegions_R2.fastq.gz",
+                                       package = "mutscan"),
+            elementsForward = "CVCVCV", elementLengthsForward = c(6, 24, 10, 30, 10, -1),
+            elementsReverse = "CVCV", elementLengthsReverse = c(6, 40, 10, -1),
+            mergeForwardReverse = TRUE, minOverlap = 20, 
+            maxOverlap = 30, maxFracMismatchOverlap = 0,
+            revComplForward = FALSE, revComplReverse = TRUE,
+            avePhredMinForward = 20, avePhredMinReverse = 20,
+            verbose = verbose),
+        barcode = list(
+            fastqForward = system.file("extdata", "multipleVariableRegions_R1.fastq.gz",
+                                       package = "mutscan"),
+            elementsForward = "CVCSCS", elementLengthsForward = c(6, 24, 10, 30, 10, -1),
+            avePhredMinForward = 20, variableCollapseMaxDist = 1, 
+            variableCollapseMinReads = 1, variableCollapseMinRatio = 1,
+            verbose = verbose),
+        V2 = list(
+            fastqForward = system.file("extdata", "multipleVariableRegions_R1.fastq.gz",
+                                       package = "mutscan"),
+            fastqReverse = system.file("extdata", "multipleVariableRegions_R2.fastq.gz",
+                                       package = "mutscan"),
+            elementsForward = "CSCVCS", elementLengthsForward = c(6, 24, 10, 30, 10, -1),
+            elementsReverse = "CSCV", elementLengthsReverse = c(6, 40, 10, -1),
+            mergeForwardReverse = TRUE, minOverlap = 10, 
+            maxOverlap = 20, maxFracMismatchOverlap = 0,
+            revComplForward = FALSE, revComplReverse = TRUE,
+            avePhredMinForward = 20, avePhredMinReverse = 20,
+            wildTypeForward = truth$WTV2,
+            nbrMutatedCodonsMaxForward = -1, nbrMutatedBasesMaxForward = 1, 
+            forbiddenMutatedCodonsForward = "", collapseToWTForward = FALSE,
+            verbose = verbose, useTreeWTmatch = TRUE),
+        V3 = list(
+            fastqForward = system.file("extdata", "multipleVariableRegions_R1.fastq.gz",
+                                       package = "mutscan"),
+            fastqReverse = system.file("extdata", "multipleVariableRegions_R2.fastq.gz",
+                                       package = "mutscan"),
+            elementsForward = "CSCSCV", elementLengthsForward = c(6, 24, 10, 30, 10, -1),
+            elementsReverse = "CVCS", elementLengthsReverse = c(6, 40, 10, -1),
+            mergeForwardReverse = TRUE, minOverlap = 5, 
+            maxOverlap = 15, maxFracMismatchOverlap = 0,
+            revComplForward = FALSE, revComplReverse = TRUE,
+            avePhredMinForward = 20, avePhredMinReverse = 20,
+            wildTypeForward = truth$WTV3,
+            nbrMutatedCodonsMaxForward = -1, nbrMutatedBasesMaxForward = 1, 
+            forbiddenMutatedCodonsForward = "", collapseToWTForward = FALSE,
+            verbose = verbose, useTreeWTmatch = TRUE),
+        collapseToAA = "V2"
+    )
+    
+    ## Filter out only reads with N/deletions in the variable sequence
+    keep <- !grepl("N|del", truth$truth$status)
+    
+    ## Truth
+    correct <- truth$truth[keep, ] %>%
+        dplyr::group_by(trueBarcode, trueV2aa, trueV3) %>% dplyr::tally() %>%
+        dplyr::arrange(dplyr::desc(n), trueBarcode, trueV2aa, trueV3)
+    
+    ## Obs
+    obs <- res$countAggregated %>% dplyr::arrange(desc(nbrReads), barcode, V2, V3)
+    
+    expect_equal(sum(obs$nbrReads), sum(keep))
+    expect_equal(correct$trueBarcode, obs$barcode)
+    expect_equal(correct$trueV2aa, obs$V2)
+    expect_equal(correct$trueV3, obs$V3)
+    expect_equal(correct$n, obs$nbrReads)
+    expect_equal(res$outCombined$filterSummary[, "f4_nbrNoValidOverlap"], 
+                 sum(grepl("del", truth$truth$status)))  ## del
+    expect_equal(res$outCombined$filterSummary[, "f6_nbrTooManyNinVar"], 
+                 sum(grepl("N", truth$truth$status)))   ## N
+    expect_equal(res$outCombined$filterSummary[, "nbrRetained"], 
+                 sum(!grepl("N|del", truth$truth$status)))
+    expect_equal(res$filtSeparate$barcode[, "f6_nbrTooManyNinVar"], 
+                 sum(grepl("V1N", truth$truth$status)))
+    expect_equal(res$filtSeparate$barcode[, "nbrRetained"], 
+                 sum(!grepl("V1N", truth$truth$status)))
+    expect_equal(res$filtSeparate$V2[, "f4_nbrNoValidOverlap"], 
+                 sum(grepl("del", truth$truth$status)))
+    expect_equal(res$filtSeparate$V2[, "nbrRetained"], 
+                 sum(!grepl("del", truth$truth$status)))
+    expect_equal(res$filtSeparate$V3[, "f4_nbrNoValidOverlap"], 
+                 sum(grepl("del", truth$truth$status)))
+    expect_equal(res$filtSeparate$V3[, "nbrRetained"], 
+                 sum(!grepl("del", truth$truth$status)))
+    
     ## ------------------------------------------------------------------------
     ## Filter out mutations in constant sequences
     res <- linkMultipleVariants(

From 36f78d8f4e47064e61025159bdf354b417a835e2 Mon Sep 17 00:00:00 2001
From: Charlotte Soneson <charlottesoneson@gmail.com>
Date: Mon, 11 Apr 2022 20:29:57 +0200
Subject: [PATCH 2/2] Add associateBarcodes() function

---
 NAMESPACE                               |   2 +
 R/associateBarcodes.R                   | 114 ++++++++++++++++
 man/associateBarcodes.Rd                |  40 ++++++
 tests/testthat/test-associateBarcodes.R | 168 ++++++++++++++++++++++++
 4 files changed, 324 insertions(+)
 create mode 100644 R/associateBarcodes.R
 create mode 100644 man/associateBarcodes.Rd
 create mode 100644 tests/testthat/test-associateBarcodes.R

diff --git a/NAMESPACE b/NAMESPACE
index 22df1e7..b35770a 100755
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -1,5 +1,6 @@
 # Generated by roxygen2: do not edit by hand
 
+export(associateBarcodes)
 export(calcNearestStringDist)
 export(calculateFitnessScore)
 export(calculateRelativeFC)
@@ -76,6 +77,7 @@ importFrom(ggplot2,geom_point)
 importFrom(ggplot2,geom_text)
 importFrom(ggplot2,ggplot)
 importFrom(ggplot2,labs)
+importFrom(ggplot2,scale_color_manual)
 importFrom(ggplot2,scale_fill_continuous)
 importFrom(ggplot2,scale_fill_discrete)
 importFrom(ggplot2,scale_x_continuous)
diff --git a/R/associateBarcodes.R b/R/associateBarcodes.R
new file mode 100644
index 0000000..e4e81d6
--- /dev/null
+++ b/R/associateBarcodes.R
@@ -0,0 +1,114 @@
+#' Associate barcode(s) to variants
+#' 
+#' @param countTable A data frame containing barcode sequences, variant 
+#'     sequences and corresponding read counts for each barcode-variant pair. 
+#' @param barcodeCol,variantCol,countCol Character scalars indicating the 
+#'     names of the columns of \code{countTable} containing barcode 
+#'     sequences, variant IDs and read counts, respectively. 
+#' @param minCount Numeric scalar. Pairs with fewer reads will be flagged 
+#'     for exclusion.
+#' @param minFracOfBarcodeCount Numeric scalar. Pairs where the read count 
+#'     is lower than \code{minFracOfBarcodeCount} * total barcode count will 
+#'     be flagged for exclusion. 
+#' 
+#' @export
+#' @author Charlotte Soneson
+#' 
+#' @return A list with two elements: \code{counts} (a data frame with counts)
+#' and \code{plots} (a list of ggplot objects).
+#' 
+#' @importFrom ggplot2 scale_color_manual
+#' @importFrom dplyr %>% group_by mutate ungroup filter arrange desc
+#'     summarize
+#' @importFrom rlang .data
+#' @importFrom ggplot2 ggplot aes geom_histogram theme_bw labs
+#'     scale_color_manual scale_x_log10 scale_y_log10 geom_point 
+#'     
+associateBarcodes <- function(countTable, barcodeCol, variantCol, 
+                              countCol = "nbrReads", minCount = 10, 
+                              minFracOfBarcodeCount = 0.5) {
+    .assertScalar(x = barcodeCol, type = "character", 
+                  validValues = colnames(countTable))
+    .assertVector(x = countTable[[barcodeCol]], type = "character")
+    .assertScalar(x = variantCol, type = "character", 
+                  validValues = colnames(countTable))
+    .assertVector(x = countTable[[variantCol]], type = "character")
+    .assertScalar(x = countCol, type = "character", 
+                  validValues = colnames(countTable))
+    .assertVector(x = countTable[[countCol]], type = "numeric")
+    .assertScalar(x = minCount, type = "numeric", rngIncl = c(0, Inf))
+    .assertScalar(x = minFracOfBarcodeCount, type = "numeric", rngIncl = c(0, 1))
+    
+    ## Initialize plot list
+    plots <- list()
+    
+    ## Add columns with total counts for barcode/variant, with and without 
+    ## filtering
+    countTable <- countTable %>%
+        ## first aggregate all rows with the same barcode/variant pair, 
+        ## in case that wasn't already done
+        dplyr::group_by(.data[[barcodeCol]], .data[[variantCol]]) %>%
+        dplyr::summarize("{countCol}" := sum(.data[[countCol]])) %>%
+        dplyr::ungroup() %>%
+        ## flag pairs with < minCount reads
+        dplyr::mutate(keep = (.data[[countCol]] >= minCount)) %>%
+        ## add total barcode counts
+        dplyr::group_by(.data[[barcodeCol]]) %>%
+        dplyr::mutate(totalBarcodeCountAll = sum(.data[[countCol]])) %>%
+        dplyr::mutate(totalBarcodeCountFilt = sum(.data[[countCol]][.data$keep])) %>%
+        dplyr::ungroup() %>%
+        ## add total variant counts
+        dplyr::group_by(.data[[variantCol]]) %>%
+        dplyr::mutate(totalVariantCountAll = sum(.data[[countCol]])) %>%
+        dplyr::mutate(totalVariantCountFilt = sum(.data[[countCol]][.data$keep])) %>%
+        dplyr::ungroup()
+    plots[["totalBcCountAll"]] <- 
+        ggplot(countTable %>% dplyr::filter(!duplicated(.data[[barcodeCol]])), 
+               aes(x = .data$totalBarcodeCountAll)) + 
+        geom_histogram(bins = 100, fill = "lightgrey") + 
+        theme_bw() + scale_x_log10() + 
+        labs(title = "Total barcode count (all)", x = "Total barcode count")
+    plots[["totalBcCountFiltered"]] <- 
+        ggplot(countTable %>% dplyr::filter(!duplicated(.data[[barcodeCol]])), 
+               aes(x = .data$totalBarcodeCountFilt)) + 
+        geom_histogram(bins = 100, fill = "lightgrey") + 
+        theme_bw() + scale_x_log10() + 
+        labs(title = "Total barcode count (filtered)", x = "Total barcode count")
+    
+    ## Calculate relative frequency for each pair as the ratio between the 
+    ## count for the pair and the total count for the barcode or variant, 
+    ## respectively
+    countTable <- countTable %>%
+        dplyr::mutate(fracOfTotalBarcodeCountFilt = 
+                          .data[[countCol]]/.data$totalBarcodeCountFilt,
+                      fracOfTotalVariantCountFilt = 
+                          .data[[countCol]]/.data$totalVariantCountFilt)
+    
+    ## Order by decreasing frequency and keep only the top hit for each 
+    ## barcode
+    countTable <- countTable %>%
+        dplyr::arrange(dplyr::desc(.data[[countCol]])) %>%
+        dplyr::mutate(keep = .data$keep & !duplicated(.data[[barcodeCol]]))
+    
+    ## Flag pairs where the count/total barcode count is too low
+    countTable <- countTable %>%
+        dplyr::mutate(keep = .data$keep & 
+                          (.data$fracOfTotalBarcodeCountFilt >= 
+                               minFracOfBarcodeCount))
+    
+    ## Plot count for pair vs total count for barcode
+    plots[["pairVsBarcodeCount"]] <- 
+        ggplot(countTable, aes(x = .data$totalBarcodeCountFilt, 
+                               y = .data[[countCol]], 
+                               color = .data$keep)) + 
+        geom_point(size = 0.5) + theme_bw() + 
+        scale_color_manual(values = c(`TRUE` = "black", `FALSE` = "red"),
+                           name = "Keep") + 
+        scale_x_log10() + scale_y_log10() + 
+        labs(x = "Total barcode count", y = "Barcode/variant pair count")
+    
+    return(list(counts = countTable, plots = plots))
+}
+
+
+
diff --git a/man/associateBarcodes.Rd b/man/associateBarcodes.Rd
new file mode 100644
index 0000000..d198dc7
--- /dev/null
+++ b/man/associateBarcodes.Rd
@@ -0,0 +1,40 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/associateBarcodes.R
+\name{associateBarcodes}
+\alias{associateBarcodes}
+\title{Associate barcode(s) to variants}
+\usage{
+associateBarcodes(
+  countTable,
+  barcodeCol,
+  variantCol,
+  countCol = "nbrReads",
+  minCount = 10,
+  minFracOfBarcodeCount = 0.5
+)
+}
+\arguments{
+\item{countTable}{A data frame containing barcode sequences, variant 
+sequences and corresponding read counts for each barcode-variant pair.}
+
+\item{barcodeCol, variantCol, countCol}{Character scalars indicating the 
+names of the columns of \code{countTable} containing barcode 
+sequences, variant IDs and read counts, respectively.}
+
+\item{minCount}{Numeric scalar. Pairs with fewer reads will be flagged 
+for exclusion.}
+
+\item{minFracOfBarcodeCount}{Numeric scalar. Pairs where the read count 
+is lower than \code{minFracOfBarcodeCount} * total barcode count will 
+be flagged for exclusion.}
+}
+\value{
+A list with two elements: \code{counts} (a data frame with counts)
+and \code{plots} (a list of ggplot objects).
+}
+\description{
+Associate barcode(s) to variants
+}
+\author{
+Charlotte Soneson
+}
diff --git a/tests/testthat/test-associateBarcodes.R b/tests/testthat/test-associateBarcodes.R
new file mode 100644
index 0000000..e8c6140
--- /dev/null
+++ b/tests/testthat/test-associateBarcodes.R
@@ -0,0 +1,168 @@
+test_that("associateBarcodes works", {
+    ## Read truth (for WT sequences)
+    truth <- readRDS(system.file("extdata", "multipleVariableRegions_truth.rds",
+                                 package = "mutscan"))
+    verbose <- FALSE
+    
+    ## Don't filter by mutations in constant sequence
+    res <- linkMultipleVariants(
+        combinedDigestParams = list(
+            fastqForward = system.file("extdata", "multipleVariableRegions_R1.fastq.gz",
+                                       package = "mutscan"),
+            fastqReverse = system.file("extdata", "multipleVariableRegions_R2.fastq.gz",
+                                       package = "mutscan"),
+            elementsForward = "CVCVCV", elementLengthsForward = c(6, 24, 10, 30, 10, -1),
+            elementsReverse = "CVCV", elementLengthsReverse = c(6, 40, 10, -1),
+            mergeForwardReverse = TRUE, minOverlap = 20, 
+            maxOverlap = 30, maxFracMismatchOverlap = 0,
+            revComplForward = FALSE, revComplReverse = TRUE,
+            avePhredMinForward = 20, avePhredMinReverse = 20,
+            verbose = verbose),
+        barcode = list(
+            fastqForward = system.file("extdata", "multipleVariableRegions_R1.fastq.gz",
+                                       package = "mutscan"),
+            elementsForward = "CVCSCS", elementLengthsForward = c(6, 24, 10, 30, 10, -1),
+            avePhredMinForward = 20, variableCollapseMaxDist = 1, 
+            variableCollapseMinReads = 1, variableCollapseMinRatio = 1,
+            verbose = verbose),
+        V2 = list(
+            fastqForward = system.file("extdata", "multipleVariableRegions_R1.fastq.gz",
+                                       package = "mutscan"),
+            fastqReverse = system.file("extdata", "multipleVariableRegions_R2.fastq.gz",
+                                       package = "mutscan"),
+            elementsForward = "CSCVCS", elementLengthsForward = c(6, 24, 10, 30, 10, -1),
+            elementsReverse = "CSCV", elementLengthsReverse = c(6, 40, 10, -1),
+            mergeForwardReverse = TRUE, minOverlap = 10, 
+            maxOverlap = 20, maxFracMismatchOverlap = 0,
+            revComplForward = FALSE, revComplReverse = TRUE,
+            avePhredMinForward = 20, avePhredMinReverse = 20,
+            wildTypeForward = truth$WTV2,
+            nbrMutatedCodonsMaxForward = -1, nbrMutatedBasesMaxForward = 1, 
+            forbiddenMutatedCodonsForward = "", collapseToWTForward = FALSE,
+            verbose = verbose, useTreeWTmatch = TRUE),
+        V3 = list(
+            fastqForward = system.file("extdata", "multipleVariableRegions_R1.fastq.gz",
+                                       package = "mutscan"),
+            fastqReverse = system.file("extdata", "multipleVariableRegions_R2.fastq.gz",
+                                       package = "mutscan"),
+            elementsForward = "CSCSCV", elementLengthsForward = c(6, 24, 10, 30, 10, -1),
+            elementsReverse = "CVCS", elementLengthsReverse = c(6, 40, 10, -1),
+            mergeForwardReverse = TRUE, minOverlap = 5, 
+            maxOverlap = 15, maxFracMismatchOverlap = 0,
+            revComplForward = FALSE, revComplReverse = TRUE,
+            avePhredMinForward = 20, avePhredMinReverse = 20,
+            wildTypeForward = truth$WTV3,
+            nbrMutatedCodonsMaxForward = -1, nbrMutatedBasesMaxForward = 1, 
+            forbiddenMutatedCodonsForward = "", collapseToWTForward = FALSE,
+            verbose = verbose, useTreeWTmatch = TRUE),
+        collapseToAA = FALSE
+    )
+    
+    ## Fails with the wrong input
+    ## ------------------------------------------------------------------------
+    expect_error(associateBarcodes(res$countAggregated, barcodeCol = 1,
+                                   variantCol = "V2", countCol = "nbrReads", minCount = 2, 
+                                   minFracOfBarcodeCount = 0.25),
+                 "'barcodeCol' must be of class 'character'")
+    expect_error(associateBarcodes(res$countAggregated, barcodeCol = c("barcode", "V2"),
+                                   variantCol = "V2", countCol = "nbrReads", minCount = 2, 
+                                   minFracOfBarcodeCount = 0.25),
+                 "'barcodeCol' must have length 1")
+    expect_error(associateBarcodes(res$countAggregated, barcodeCol = "missing",
+                                   variantCol = "V2", countCol = "nbrReads", minCount = 2, 
+                                   minFracOfBarcodeCount = 0.25),
+                 "All values in 'barcodeCol' must be one of")
+    expect_error(associateBarcodes(res$countAggregated, barcodeCol = "nbrReads",
+                                   variantCol = "V2", countCol = "nbrReads", minCount = 2, 
+                                   minFracOfBarcodeCount = 0.25),
+                 "'[[countTablebarcodeCol' must be of class 'character'", fixed = TRUE)
+    
+    expect_error(associateBarcodes(res$countAggregated, barcodeCol = "barcode",
+                                   variantCol = 1, countCol = "nbrReads", minCount = 2, 
+                                   minFracOfBarcodeCount = 0.25),
+                 "'variantCol' must be of class 'character'")
+    expect_error(associateBarcodes(res$countAggregated, barcodeCol = "barcode",
+                                   variantCol = c("V2", "barcode"), 
+                                   countCol = "nbrReads", minCount = 2, 
+                                   minFracOfBarcodeCount = 0.25),
+                 "'variantCol' must have length 1")
+    expect_error(associateBarcodes(res$countAggregated, barcodeCol = "barcode",
+                                   variantCol = "missing", countCol = "nbrReads", minCount = 2, 
+                                   minFracOfBarcodeCount = 0.25),
+                 "All values in 'variantCol' must be one of")
+    expect_error(associateBarcodes(res$countAggregated, barcodeCol = "barcode",
+                                   variantCol = "nbrReads", countCol = "nbrReads", minCount = 2, 
+                                   minFracOfBarcodeCount = 0.25),
+                 "'[[countTablevariantCol' must be of class 'character'", fixed = TRUE)
+    
+    expect_error(associateBarcodes(res$countAggregated, barcodeCol = "barcode",
+                                   variantCol = "V2", countCol = 1, minCount = 2, 
+                                   minFracOfBarcodeCount = 0.25),
+                 "'countCol' must be of class 'character'")
+    expect_error(associateBarcodes(res$countAggregated, barcodeCol = "barcode",
+                                   variantCol = "V2", countCol = c("barcode", "V2"), minCount = 2, 
+                                   minFracOfBarcodeCount = 0.25),
+                 "'countCol' must have length 1")
+    expect_error(associateBarcodes(res$countAggregated, barcodeCol = "barcode",
+                                   variantCol = "V2", countCol = "missing", minCount = 2, 
+                                   minFracOfBarcodeCount = 0.25),
+                 "All values in 'countCol' must be one of")
+    expect_error(associateBarcodes(res$countAggregated, barcodeCol = "barcode",
+                                   variantCol = "V2", countCol = "barcode", minCount = 2, 
+                                   minFracOfBarcodeCount = 0.25),
+                 "'[[countTablecountCol' must be of class 'numeric'", fixed = TRUE)
+    
+    expect_error(associateBarcodes(res$countAggregated, barcodeCol = "barcode",
+                                   variantCol = "V2", countCol = "nbrReads", minCount = "2", 
+                                   minFracOfBarcodeCount = 0.25),
+                 "'minCount' must be of class 'numeric'")
+    expect_error(associateBarcodes(res$countAggregated, barcodeCol = "barcode",
+                                   variantCol = "V2", countCol = "nbrReads", minCount = c(1, 2), 
+                                   minFracOfBarcodeCount = 0.25),
+                 "'minCount' must have length 1")
+    
+    expect_error(associateBarcodes(res$countAggregated, barcodeCol = "barcode",
+                                   variantCol = "V2", countCol = "nbrReads", minCount = 2, 
+                                   minFracOfBarcodeCount = "0.25"),
+                 "'minFracOfBarcodeCount' must be of class 'numeric'")
+    expect_error(associateBarcodes(res$countAggregated, barcodeCol = "barcode",
+                                   variantCol = "V2", countCol = "nbrReads", minCount = 2, 
+                                   minFracOfBarcodeCount = c(0.1, 0.25)),
+                 "'minFracOfBarcodeCount' must have length 1")
+    expect_error(associateBarcodes(res$countAggregated, barcodeCol = "barcode",
+                                   variantCol = "V2", countCol = "nbrReads", minCount = 2, 
+                                   minFracOfBarcodeCount = 2.5),
+                 "'minFracOfBarcodeCount' must be within [0,1]", fixed = TRUE)
+
+    
+    ## Works with the correct input
+    ## ------------------------------------------------------------------------
+    ## Run barcode association
+    out <- associateBarcodes(res$countAggregated, barcodeCol = "barcode",
+                             variantCol = "V2", minCount = 2, 
+                             minFracOfBarcodeCount = 0.25)
+    
+    ## Calculate expected values manually
+    correct <- truth$truth[!grepl("N|del", truth$truth$status), ] %>%
+        ## get frequency for each barcode/variant pair
+        dplyr::group_by(trueBarcode, trueV2) %>%
+        dplyr::tally() %>% 
+        ## filter rare pairs
+        dplyr::filter(n >= 2) %>%
+        ## filter pairs with too low fraction of total bc counts
+        dplyr::group_by(trueBarcode) %>% 
+        dplyr::mutate(relFrac = n/sum(n)) %>%
+        dplyr::filter(relFrac >= 0.25) %>%
+        ## keep top variant for each barcode %>%
+        dplyr::arrange(dplyr::desc(n)) %>%
+        dplyr::filter(!duplicated(trueBarcode))
+        
+    expect_equal(sum(out$counts$keep), nrow(correct))
+    expect_true(all(paste0(out$counts$barcode[out$counts$keep], 
+                           out$counts$V2[out$counts$keep]) %in% 
+                        paste0(correct$trueBarcode, correct$trueV2)))
+    tmp <- out$counts[match(paste0(correct$trueBarcode, correct$trueV2),
+                            paste0(out$counts$barcode, out$counts$V2)), ]
+    expect_equal(tmp$nbrReads, correct$n)
+    expect_equal(tmp$fracOfTotalBarcodeCountFilt, correct$relFrac)
+})