diff --git a/bin/needlestack_parallel.r b/bin/needlestack_parallel.r
new file mode 100644
index 0000000..14c7c53
--- /dev/null
+++ b/bin/needlestack_parallel.r
@@ -0,0 +1,923 @@
+#! /usr/bin/env Rscript
+
+# needlestack: a multi-sample somatic variant caller Copyright (C) 2017
+# IARC/WHO This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by the Free
+# Software Foundation, either version 3 of the License, or (at your option) any
+# later version. This program is distributed in the hope that it will be
+# useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
+# Public License for more details. You should have received a copy of the GNU
+# General Public License along with this program. If not, see
+# .
+library(parallel)
+
+options(warn = 1)
+print(Sys.time())
+
+############################## ARGUMENTS SECTION ##############################
+args <- commandArgs(TRUE)
+parseArgs <- function(x) strsplit(sub("^--", "", x), "=")
+argsL <- as.list(as.character(as.data.frame(do.call("rbind", parseArgs(args)))$V2))
+names(argsL) <- as.data.frame(do.call("rbind", parseArgs(args)))$V1
+args <- argsL
+rm(argsL)
+
+if ("--help" %in% args | is.null(args$out_vcf_file) | is.null(args$out_qvals_file) |
+ is.null(args$fasta_ref)) {
+ cat("
+ The R Script arguments_section.R
+ Mandatory arguments:
+ --input_file=path - path to input file file (combined read counts for all samples in names file)
+ --names_file=path - path to names file (tsv, no header, path,sample_name)
+ --out_vcf_file=file_name - name of output vcf
+ --out_qvals_file=file_name - name of output qvals file
+ --fasta_ref=path - path of fasta ref
+ --help - print this text
+ Optional arguments:
+ --num_cores - number of cores to use, default=1
+ --source_path=path - path to source files (glm_rob_nb.r)
+ --samtools=path - path of samtools, default=samtools
+ --SB_type=SOR, RVSB or FS - strand bias measure, default=SOR
+ --SB_threshold_SNV=value - strand bias threshold for SNV, default=100
+ --SB_threshold_indel=value - strand bias threshold for indel, default=100
+ --min_coverage=value - minimum coverage in at least one sample to consider a site, default=50
+ --min_reads=value - minimum number of non-ref reads in at least one sample to consider a site, default=5
+ --min_af=value - minimum allelic fraction in at least one sample to consider a site, default=0
+ --GQ_threshold=value - phred scale qvalue threshold for variants, default=50
+ --output_all_SNVs=boolean - output all SNVs, even when no variant is detected, default=FALSE
+ --ref_genome=string - reference genome for alignments plot, examples : Hsapiens.UCSC.hg19, Hsapiens.UCSC.hg18, Mmusculus.UCSC.mm10...
+ --extra_rob=boolean - perform an extra-robust regression, default=FALSE
+ --min_af_extra_rob=value - minimum allelic fraction to exclude a sample at a position for extra-robust regression, default=0.2
+ --min_prop_extra_rob=value - minimum proportion of samples having an allelic fraction to be excluded from extra-robust regression, default=0.1
+ --max_prop_extra_rob=value - maximum proportion of samples having an allelic fraction to be excluded from extra-robust regression, default=0.1
+ --afmin_power=value - minimum allelic fraction in mean for somatic mutations, default=0.01
+ --sigma=value - sigma parameter for negative binomial modeling germline mutations, default=0.1
+ WARNING : by default samtools has to be in your path
+ Example:
+ needlestack_mt.r --input_file=read_counts.tsv --names_file=names.txt --out_qvals_file=test_qvals.vcf --out_vcf_file=test.vcf --fasta_ref=path_to_ref_fasta \n\n")
+
+ q(save = "no")
+}
+if (is.null(args$num_cores)) {
+ args$num_cores <- 1
+} else {
+ args$num_cores <- as.numeric(args$num_cores)
+}
+if (is.null(args$samtools)) {
+ args$samtools <- "samtools"
+}
+if (is.null(args$SB_type)) {
+ args$SB_type <- "SOR"
+}
+if (is.null(args$SB_threshold_SNV)) {
+ args$SB_threshold_SNV <- 100
+} else {
+ args$SB_threshold_SNV <- as.numeric(args$SB_threshold_SNV)
+}
+if (is.null(args$SB_threshold_indel)) {
+ args$SB_threshold_indel <- 100
+} else {
+ args$SB_threshold_indel <- as.numeric(args$SB_threshold_indel)
+}
+if (is.null(args$min_coverage)) {
+ args$min_coverage <- 30
+} else {
+ args$min_coverage <- as.numeric(args$min_coverage)
+}
+if (is.null(args$min_reads)) {
+ args$min_reads <- 3
+} else {
+ args$min_reads <- as.numeric(args$min_reads)
+}
+if (is.null(args$min_af)) {
+ args$min_af <- 0
+} else {
+ args$min_af <- as.numeric(args$min_af)
+}
+if (is.null(args$GQ_threshold)) {
+ args$GQ_threshold <- 50
+} else {
+ args$GQ_threshold <- as.numeric(args$GQ_threshold)
+}
+if (is.null(args$output_all_SNVs)) {
+ args$output_all_SNVs <- FALSE
+} else {
+ args$output_all_SNVs <- as.logical(args$output_all_SNVs)
+}
+
+
+
+if (is.null(args$extra_rob)) {
+ args$extra_rob <- FALSE
+} else {
+ args$extra_rob <- as.logical(args$extra_rob)
+}
+if (is.null(args$min_af_extra_rob)) {
+ args$min_af_extra_rob <- 0.2
+} else {
+ args$min_af_extra_rob <- as.numeric(args$min_af_extra_rob)
+}
+if (is.null(args$min_prop_extra_rob)) {
+ args$min_prop_extra_rob <- 0.1
+} else {
+ args$min_prop_extra_rob <- as.numeric(args$min_prop_extra_rob)
+}
+if (is.null(args$max_prop_extra_rob)) {
+ args$max_prop_extra_rob <- 0.8
+} else {
+ args$max_prop_extra_rob <- as.numeric(args$max_prop_extra_rob)
+}
+if (is.null(args$afmin_power)) {
+ args$afmin_power <- -1
+} else {
+ args$afmin_power <- as.numeric(args$afmin_power)
+}
+if (is.null(args$sigma)) {
+ args$sigma <- 0.1
+} else {
+ args$sigma <- as.numeric(args$sigma)
+}
+num_cores <- args$num_cores
+samtools <- args$samtools
+input_file <- args$input_file
+names_file <- args$names_file
+out_vcf_file <- args$out_vcf_file
+out_qvals_file <- args$out_qvals_file
+fasta_ref <- args$fasta_ref
+GQ_threshold <- args$GQ_threshold
+min_coverage <- args$min_coverage
+min_reads <- args$min_reads
+min_af <- args$min_af
+print(min_af)
+# http://gatkforums.broadinstitute.org/discussion/5533/strandoddsratio-computation
+# filter out SOR > 4 for SNVs and > 10 for indels filter out RVSB > 0.85 (maybe
+# less stringent for SNVs)
+SB_type <- args$SB_type
+SB_threshold_SNV <- args$SB_threshold_SNV
+SB_threshold_indel <- args$SB_threshold_indel
+output_all_SNVs <- args$output_all_SNVs
+
+
+ref_genome <- args$ref_genome
+
+
+extra_rob <- args$extra_rob
+min_af_extra_rob <- args$min_af_extra_rob
+min_prop_extra_rob <- args$min_prop_extra_rob
+max_prop_extra_rob <- args$max_prop_extra_rob
+sigma <- args$sigma
+afmin_power <- args$afmin_power
+
+
+
+
+# source R files
+getScriptPath <- function() {
+ cmd.args <- commandArgs()
+ m <- regexpr("(?<=^--file=).+", cmd.args, perl = TRUE)
+ script.dir <- dirname(regmatches(cmd.args, m))
+ if (length(script.dir) == 0)
+ stop("can't determine script dir: please call the script with Rscript")
+ if (length(script.dir) > 1)
+ stop("can't determine script dir: more than one '--file' argument detected")
+ return(script.dir)
+}
+
+if (is.null(args$source_path)) {
+ source_path <- paste0(getScriptPath(), "/")
+} else {
+ source_path <- args$source_path
+}
+
+source(paste(source_path, "glm_rob_nb.r", sep = ""))
+
+
+
+############################################################
+
+options(scipen = 100)
+
+indiv_run <- read.table(names_file, stringsAsFactors = F, colClasses = "character")
+# check if samples names exist in names_file file
+if (ncol(indiv_run) != 2) {
+ cat("Error : names_file contains only one column : samples names can't be obtained from the bam files : use --use_file_name option")
+ q(save = "no")
+}
+indiv_run[, 2] <- make.unique(indiv_run[, 2], sep = "_")
+
+nindiv <- nrow(indiv_run)
+
+id_samples <- seq(1, nindiv) # samples IDs (integers)
+
+base_cols = c("CHROM", "POS", "REF", "ALT", "TYPE", "DISPERSION", "ERROR", "REF_INV")
+output_cols = c(base_cols, indiv_run[, 2])
+
+
+
+# columns numbers in the input file
+depth <- (1:nindiv) * 11 - 10 + 3
+A_cols <- (1:nindiv) * 11 - 9 + 3
+T_cols <- (1:nindiv) * 11 - 8 + 3
+C_cols <- (1:nindiv) * 11 - 7 + 3
+G_cols <- (1:nindiv) * 11 - 6 + 3
+a_cols <- (1:nindiv) * 11 - 5 + 3
+t_cols <- (1:nindiv) * 11 - 4 + 3
+c_cols <- (1:nindiv) * 11 - 3 + 3
+g_cols <- (1:nindiv) * 11 - 2 + 3
+insertion_cols <- (1:nindiv) * 11 - 1 + 3
+deletion_cols <- (1:nindiv) * 11 - 0 + 3
+
+
+############################## FUNCTION SECTION ##############################
+non_ref_bases <- function(ref) {
+ setdiff(c("A", "T", "C", "G"), ref)
+}
+
+SOR <- function(RO_forward, AO_forward, RO_reverse, AO_reverse) {
+ X00 <- RO_forward
+ X10 <- AO_forward
+ X01 <- RO_reverse
+ X11 <- AO_reverse
+ refRatio <- pmax(X00, X01)/pmin(X00, X01)
+ altRatio <- pmax(X10, X11)/pmin(X10, X11)
+ R <- (X00/X01) * (X11/X10)
+ log((R + 1/R)/(refRatio/altRatio))
+}
+
+common_annot <- function(linepos, Vp, Vm) {
+ Rp <<- as.numeric(linepos[eval(as.name(paste(linepos[3], "_cols", sep = "")))])
+ Rm <<- as.numeric(linepos[eval(as.name(paste(tolower(linepos[3]), "_cols", sep = "")))])
+ Cp <<- Vp + Rp
+ Cm <<- Vm + Rm
+ tmp_rvsbs <- pmax(Vp * Cm, Vm * Cp)/(Vp * Cm + Vm * Cp)
+ tmp_rvsbs[which(is.na(tmp_rvsbs))] <- -1
+ rvsbs <<- tmp_rvsbs
+ all_rvsb <<- max(as.numeric(sum(Vp)) * sum(Cm), as.numeric(sum(Vm)) * sum(Cp))/(as.numeric(sum(Vp)) *
+ sum(Cm) + as.numeric(sum(Vm)) * sum(Cp))
+ if (is.na(all_rvsb))
+ all_rvsb <<- (-1)
+ tmp_sors <<- SOR(Rp, Vp, Rm, Vm)
+ tmp_sors[which(is.na(tmp_sors))] <- -1
+ tmp_sors[which(is.infinite(tmp_sors))] <- 99
+ sors <<- tmp_sors
+ all_sor <<- SOR(sum(Rp), sum(Vp), sum(Rm), sum(Vm))
+ if (is.na(all_sor))
+ all_sor <<- (-1)
+ if (is.infinite(all_sor))
+ all_sor <<- 99
+ FisherStrand <<- -10 * log10(unlist(lapply(1:nindiv, function(indiv) fisher.test(matrix(c(Vp[indiv],
+ Vm[indiv], Rp[indiv], Rm[indiv]), nrow = 2))$p.value))) # col1=alt(V), col2=ref(R)
+ FisherStrand[which(FisherStrand > 1000)] <- 1000
+ FisherStrand_all <<- -10 * log10(fisher.test(matrix(c(sum(Vp), sum(Vm), sum(Rp),
+ sum(Rm)), nrow = 2))$p.value)
+ if (FisherStrand_all > 1000)
+ FisherStrand_all <- 1000
+ all_RO <<- sum(Rp + Rm)
+}
+
+## functions to compute the Qvalue of a function with a given AF change sigma
+## value to change the departure from binomial distribution with parameter 0.5
+toQvalueN <- function(x, rob_nb_res, sigma) {
+ y <- qnbinom(0.01, size = 1/sigma, mu = 0.5 * x)
+ unlist(-10 * log10(p.adjust((dnbinom(c(rob_nb_res$ma_count, y), size = 1/rob_nb_res$coef[[1]],
+ mu = rob_nb_res$coef[[2]] * c(rob_nb_res$coverage, x)) + pnbinom(c(rob_nb_res$ma_count,
+ y), size = 1/rob_nb_res$coef[[1]], mu = rob_nb_res$coef[[2]] * c(rob_nb_res$coverage,
+ x), lower.tail = F)))[length(rob_nb_res$coverage) + 1]))
+}
+
+toQvalueT <- function(x, rob_nb_res, afmin_power) {
+ # change afmin_power to
+ y <- qbinom(0.01, x, afmin_power, lower.tail = T)
+ unlist(-10 * log10(p.adjust((dnbinom(c(rob_nb_res$ma_count, y), size = 1/rob_nb_res$coef[[1]],
+ mu = rob_nb_res$coef[[2]] * c(rob_nb_res$coverage, x)) + pnbinom(c(rob_nb_res$ma_count,
+ y), size = 1/rob_nb_res$coef[[1]], mu = rob_nb_res$coef[[2]] * c(rob_nb_res$coverage,
+ x), lower.tail = F)))[length(rob_nb_res$coverage) + 1]))
+}
+
+###############################################################################################
+
+
+############################## WRITEOUT SECTION ##############################
+write_out <- function(...) {
+ cat(paste(..., sep = ""), "\n", file = out_vcf_file, sep = "", append = T)
+}
+
+write_vcf_header <- function(samples) {
+
+ write_out("##fileformat=VCFv4.1")
+ write_out("##fileDate=", format(Sys.Date(), "%Y%m%d"))
+ write_out("##source=needlestack v1.1")
+ write_out("##reference=", fasta_ref)
+ write_out("##contig=")
+ write_out("##contig=")
+ write_out("##phasing=none")
+ write_out("##filter=\"QVAL > ", GQ_threshold, " & ", SB_type, "_SNV < ", SB_threshold_SNV,
+ " & ", SB_type, "_INDEL < ", SB_threshold_indel, " & min(AO) >= ", min_reads,
+ " & min(AF) >= ", min_af, " & min(DP) >= ", min_coverage, "\"")
+ write_out("##INFO=")
+ write_out("##INFO=")
+ write_out("##INFO=")
+ write_out("##INFO=")
+ write_out("##INFO=")
+ write_out("##INFO=")
+ write_out("##INFO=")
+ write_out("##INFO=")
+ write_out("##INFO=")
+ write_out("##INFO=")
+ write_out("##INFO=")
+ write_out("##INFO=")
+ write_out("##INFO=")
+ write_out("##INFO=")
+ write_out("##INFO=")
+ write_out("##INFO=")
+ write_out("##INFO=")
+
+ write_out("##FORMAT=")
+ write_out("##FORMAT=")
+ write_out("##FORMAT=")
+ write_out("##FORMAT=")
+ write_out("##FORMAT=")
+ write_out("##FORMAT=")
+ write_out("##FORMAT=")
+ write_out("##FORMAT=")
+ write_out("##FORMAT=")
+ write_out("##FORMAT=")
+ write_out("##FORMAT=")
+ write_out("##FORMAT=")
+ write_out("##FORMAT=")
+ write_out("##FORMAT=")
+
+ write_out("#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t", paste(samples[,
+ 2], collapse = "\t"))
+
+}
+
+model_for_snv <- function(linepos) {
+ qvals = matrix(nrow = 0, ncol = length(output_cols))
+ dimnames(qvals) = list(NULL, output_cols)
+ vcf_records = list()
+
+ # SNV for each base different from the reference base on the
+
+ for (alt in non_ref_bases(linepos[3])) {
+
+ ref <- linepos[3]
+
+ # count the number of alternative bases (nonreference)
+
+ Vp <- as.numeric(linepos[eval(as.name(paste(alt, "_cols", sep = "")))])
+ Vm <- as.numeric(linepos[eval(as.name(paste(tolower(alt), "_cols", sep = "")))])
+ ma_count <- Vp + Vm
+ DP <- as.numeric(linepos[eval(depth)])
+
+ if (sum((ma_count/DP) > 0.8, na.rm = T) > 0.5 * length(ma_count)) {
+ # here we need to reverse alt and ref for the regression (use
+ # ma_count of the ref)
+ alt_inv <- linepos[3]
+ Vp_inv <- as.numeric(linepos[eval(as.name(paste(alt_inv, "_cols", sep = "")))])
+ Vm_inv <- as.numeric(linepos[eval(as.name(paste(tolower(alt_inv), "_cols",
+ sep = "")))])
+ ma_count_inv <- Vp_inv + Vm_inv
+ ref_inv <- TRUE
+
+ reg_res <- glmrob.nb(x = DP, y = ma_count_inv, min_coverage = min_coverage,
+ min_reads = min_reads, min_af = min_af, extra_rob = extra_rob, min_af_extra_rob = min_af_extra_rob,
+ min_prop_extra_rob = min_prop_extra_rob, max_prop_extra_rob = max_prop_extra_rob)
+ } else {
+ ref_inv <- FALSE
+ reg_res <- glmrob.nb(x = DP, y = ma_count, min_coverage = min_coverage,
+ min_reads = min_reads, min_af = min_af, extra_rob = extra_rob, min_af_extra_rob = min_af_extra_rob,
+ min_prop_extra_rob = min_prop_extra_rob, max_prop_extra_rob = max_prop_extra_rob)
+ }
+
+ # compute Qval for minAF
+ qval_minAF <- rep(0, nindiv)
+ somatic_status <- rep(".", nindiv)
+
+ if (afmin_power == -1) {
+ # no minimum frequency supplied -> use a negative binomial
+ # distribution to check the power
+ qval_minAF <- sapply(1:nindiv, function(ii) toQvalueN(DP[ii], reg_res,
+ sigma))
+ } else {
+ # minimum frequency supplied -> use a binomial distribution
+ # centered around afmin_power to check the power
+ qval_minAF <- sapply(1:nindiv, function(ii) toQvalueT(DP[ii], reg_res,
+ afmin_power))
+ }
+ output = c(linepos[1:3], alt, "SNV", reg_res$coef["sigma"], reg_res$coef["slope"],
+ ref_inv, reg_res$GQ)
+ names(output) = output_cols
+ qvals = rbind(qvals, output)
+
+
+ if (output_all_SNVs | (!is.na(reg_res$coef["slope"]) & sum(reg_res$GQ >=
+ GQ_threshold, na.rm = TRUE) > 0)) {
+ all_AO <- sum(ma_count)
+ all_DP <- sum(as.numeric(linepos[eval(depth)]))
+ common_annot(linepos, Vp, Vm)
+ all_RO <- sum(Rp + Rm)
+ if (SB_type == "SOR") {
+ sbs <- sors
+ } else {
+ if (SB_type == "RVSB") {
+ sbs <- rvsbs
+ } else {
+ sbs <- FisherStrand
+ }
+ }
+ if (output_all_SNVs | (sum(sbs <= SB_threshold_SNV & reg_res$GQ >= GQ_threshold,
+ na.rm = TRUE) > 0)) {
+ # to get context around the base
+ con <- pipe(paste(samtools, " faidx ", fasta_ref, " ", linepos[1],
+ ":", as.numeric(linepos[2]) - 3, "-", as.numeric(linepos[2]) -
+ 1, " | tail -n1", sep = ""))
+ before <- readLines(con)
+ close(con)
+ con <- pipe(paste(samtools, " faidx ", fasta_ref, " ", linepos[1],
+ ":", as.numeric(linepos[2]) + 1, "-", as.numeric(linepos[2]) +
+ 3, " | tail -n1", sep = ""))
+ after <- readLines(con)
+ close(con)
+
+
+ vcf_record = paste0(linepos[1], "\t", linepos[2], "\t", ".", "\t",
+ linepos[3], "\t", alt, "\t", max(reg_res$GQ), "\t", ".")
+ # INFO field
+
+ vcf_info = paste0("TYPE=snv;NS=", sum(as.numeric(linepos[eval(depth)]) >
+ 0), ";AF=", sum(reg_res$GQ >= GQ_threshold)/sum(as.numeric(linepos[eval(depth)]) >
+ 0), ";DP=", all_DP, ";RO=", all_RO, ";AO=", all_AO, ";SRF=", sum(Rp),
+ ";SRR=", sum(Rm), ";SAF=", sum(Vp), ";SAR=", sum(Vm), ";SOR=",
+ all_sor, ";RVSB=", all_rvsb, ";FS=", FisherStrand_all, ";ERR=",
+ reg_res$coef["slope"], ";SIG=", reg_res$coef["sigma"], ";CONT=",
+ paste(before, after, sep = "x"), ifelse(reg_res$extra_rob & !(ref_inv),
+ ";WARN=EXTRA_ROBUST_GL", ""), ifelse(reg_res$extra_rob & ref_inv,
+ ";WARN=EXTRA_ROBUST_GL/INV_REF", ""), ifelse(ref_inv & !(reg_res$extra_rob),
+ ";WARN=INV_REF", ""))
+ # FORMAT field
+
+ vcf_format = paste0("GT:", ifelse(ref_inv, "QVAL_INV", "QVAL"), ":DP:RO:AO:AF:SB:SOR:RVSB:FS:QVAL_minAF:STATUS")
+
+ vcf_record = paste(vcf_record, vcf_info, vcf_format, sep = "\t")
+ # all samples
+ if (ref_inv) {
+ genotype <- rep("1/1", l = nindiv)
+ } else {
+ genotype <- rep("0/0", l = nindiv)
+ }
+ heterozygotes <- which(reg_res$GQ >= GQ_threshold & sbs <= SB_threshold_SNV &
+ reg_res$ma_count/reg_res$coverage < 0.75)
+ genotype[heterozygotes] <- "0/1"
+ homozygotes <- which(reg_res$GQ >= GQ_threshold & sbs <= SB_threshold_SNV &
+ reg_res$ma_count/reg_res$coverage >= 0.75)
+ if (ref_inv) {
+ genotype[homozygotes] <- "0/0"
+ } else {
+ genotype[homozygotes] <- "1/1"
+ }
+ # no tumor variant but low power -> './.'
+ genotype[(reg_res$GQ < GQ_threshold) & (qval_minAF < GQ_threshold)] <- "./."
+ for (cur_sample in 1:nindiv) {
+
+ geno = paste0(genotype[cur_sample], ":", reg_res$GQ[cur_sample],
+ ":", DP[cur_sample], ":", (Rp + Rm)[cur_sample], ":", ma_count[cur_sample],
+ ":", (ma_count/DP)[cur_sample], ":", Rp[cur_sample], ",", Rm[cur_sample],
+ ",", Vp[cur_sample], ",", Vm[cur_sample], ":", sors[cur_sample],
+ ":", rvsbs[cur_sample], ":", FisherStrand[cur_sample], ":", qval_minAF[cur_sample],
+ ":", somatic_status[cur_sample])
+ vcf_record = paste(vcf_record, geno, sep = "\t")
+
+ }
+ vcf_record = paste0(vcf_record, "\n")
+ vcf_records = append(vcf_records, vcf_record)
+
+
+ }
+ }
+ }
+
+
+ return(list(qvals = qvals, vcf_records = vcf_records))
+}
+model_for_deletion <- function(linepos) {
+ # DEL
+ ldel <- linepos[deletion_cols]
+
+ names(ldel) <- id_samples
+ all_del <- ldel[!(is.na(ldel))]
+
+ qvals = matrix(nrow = 0, ncol = length(output_cols))
+ dimnames(qvals) = list(NULL, output_cols)
+ vcf_records = list()
+ ref = linepos[3]
+ if (length(all_del) > 0) {
+ all_del <- as.data.frame(strsplit(unlist(strsplit(paste(all_del), split = "|",
+ fixed = T)), split = ":", fixed = T), stringsAsFactors = F, )
+ uniq_del <- unique(toupper(as.character(all_del[2, ])))
+
+
+ coverage_all_del <- sum(as.numeric(all_del[1, ]))
+
+ for (cur_del in uniq_del) {
+
+ Vp <- rep(0, nindiv)
+ names(Vp) <- id_samples
+ Vm <- Vp
+ all_cur_del <- strsplit(grep(cur_del, ldel, ignore.case = T, value = T),
+ split = "|", fixed = T)
+ all_cur_del_p <- lapply(all_cur_del, function(x) {
+ grep(paste(":", cur_del, "$", sep = ""), x, value = T)
+ })
+ all_cur_del_m <- lapply(all_cur_del, function(x) {
+ grep(paste(":", tolower(cur_del), "$", sep = ""), x, value = T)
+ })
+ ma_p_cur_del <- unlist(lapply(all_cur_del_p, function(x) {
+ as.numeric(gsub(paste("([0-9]+):", cur_del, "$", sep = ""), "\\1",
+ x, ignore.case = T))
+ }))
+ ma_m_cur_del <- unlist(lapply(all_cur_del_m, function(x) {
+ as.numeric(gsub(paste("([0-9]+):", cur_del, "$", sep = ""), "\\1",
+ x, ignore.case = T))
+ }))
+ Vp[names(ma_p_cur_del)] <- ma_p_cur_del
+ Vm[names(ma_m_cur_del)] <- ma_m_cur_del
+ ma_count <- Vp + Vm
+ DP <- as.numeric(linepos[eval(depth)])
+ if (sum((ma_count/DP) > 0.8, na.rm = T) > 0.5 * length(ma_count)) {
+ # here we need to reverse alt and ref for the regression (use
+ # ma_count of the ref)
+ ref <- linepos[3]
+ cur_del_inv <- linepos[3]
+ Vp_inv <- as.numeric(linepos[eval(as.name(paste(cur_del_inv, "_cols",
+ sep = "")))])
+ Vm_inv <- as.numeric(linepos[eval(as.name(paste(tolower(cur_del_inv),
+ "_cols", sep = "")))])
+ ma_count_inv <- Vp_inv + Vm_inv
+ ref_inv <- TRUE
+ reg_res <- glmrob.nb(x = DP, y = ma_count_inv, min_coverage = min_coverage,
+ min_reads = min_reads, min_af = min_af, extra_rob = extra_rob,
+ min_af_extra_rob = min_af_extra_rob, min_prop_extra_rob = min_prop_extra_rob,
+ max_prop_extra_rob = max_prop_extra_rob)
+
+ } else {
+ ref_inv <- FALSE
+ reg_res <- glmrob.nb(x = DP, y = ma_count, min_coverage = min_coverage,
+ min_reads = min_reads, min_af = min_af, extra_rob = extra_rob,
+ min_af_extra_rob = min_af_extra_rob, min_prop_extra_rob = min_prop_extra_rob,
+ max_prop_extra_rob = max_prop_extra_rob)
+
+ }
+ output = c(linepos[1:3], cur_del, "DEL", reg_res$coef["sigma"], reg_res$coef["slope"],
+ ref_inv, reg_res$GQ)
+
+ names(output) = output_cols
+ qvals = rbind(qvals, output)
+ # compute Qval20pc
+ qval_minAF <- rep(0, nindiv)
+ somatic_status <- rep(".", nindiv)
+
+ if (afmin_power == -1) {
+ # no minimum frequency supplied -> use a negative binomial
+ # distribution to check the power
+ qval_minAF <- sapply(1:nindiv, function(ii) toQvalueN(DP[ii], reg_res,
+ sigma))
+ } else {
+ # minimum frequency supplied -> use a binomial distribution
+ # centered around afmin_power to check the power
+ qval_minAF <- sapply(1:nindiv, function(ii) toQvalueT(DP[ii], reg_res,
+ afmin_power))
+ }
+
+
+ if (!is.na(reg_res$coef["slope"]) & sum(reg_res$GQ >= GQ_threshold, na.rm = TRUE) >
+ 0) {
+ all_AO <- sum(ma_count)
+ all_DP <- sum(as.numeric(linepos[eval(depth)])) + sum(ma_count)
+ common_annot(linepos, Vp, Vm)
+ all_RO <- sum(Rp + Rm)
+ if (SB_type == "SOR") {
+ sbs <- sors
+ } else {
+ if (SB_type == "RVSB") {
+ sbs <- rvsbs
+ } else {
+ sbs <- FisherStrand
+ }
+ }
+ if (sum(sbs <= SB_threshold_indel & reg_res$GQ >= GQ_threshold, na.rm = TRUE) >
+ 0) {
+ con <- pipe(paste(samtools, " faidx ", fasta_ref, " ", linepos[1],
+ ":", as.numeric(linepos[2]) + 1 - 3, "-", as.numeric(linepos[2]) +
+ 1 - 1, " | tail -n1", sep = ""))
+ before <- toupper(readLines(con))
+ close(con)
+ con <- pipe(paste(samtools, " faidx ", fasta_ref, " ", linepos[1],
+ ":", as.numeric(linepos[2]) + 1 + nchar(cur_del), "-", as.numeric(linepos[2]) +
+ 1 + 3 + nchar(cur_del) - 1, " | tail -n1", sep = ""))
+ after <- toupper(readLines(con))
+ close(con)
+ prev_bp <- substr(before, 3, 3)
+ next_bp <- substr(after, 1, 1)
+
+ vcf_record = paste0(linepos[1], "\t", as.numeric(linepos[2]), "\t",
+ ".", "\t", paste(prev_bp, cur_del, sep = ""), "\t", prev_bp,
+ "\t", max(reg_res$GQ), "\t", ".")
+
+ # INFO field
+
+ vcf_info = paste0("TYPE=del;NS=", sum(as.numeric(linepos[eval(depth)]) >
+ 0), ";AF=", sum(reg_res$GQ >= GQ_threshold)/sum(as.numeric(linepos[eval(depth)]) >
+ 0), ";DP=", all_DP, ";RO=", all_RO, ";AO=", all_AO, ";SRF=",
+ sum(Rp), ";SRR=", sum(Rm), ";SAF=", sum(Vp), ";SAR=", sum(Vm),
+ ";SOR=", all_sor, ";RVSB=", all_rvsb, ";FS=", FisherStrand_all,
+ ";ERR=", reg_res$coef["slope"], ";SIG=", reg_res$coef["sigma"],
+ ";CONT=", paste(before, after, sep = "x"), ifelse(reg_res$extra_rob &
+ !(ref_inv), ";WARN=EXTRA_ROBUST_GL", ""), ifelse(reg_res$extra_rob &
+ ref_inv, ";WARN=EXTRA_ROBUST_GL/INV_REF", ""), ifelse(ref_inv &
+ !(reg_res$extra_rob), ";WARN=INV_REF", ""))
+
+ # FORMAT field
+
+ vcf_format = paste0("GT:", ifelse(ref_inv, "QVAL_INV", "QVAL"),
+ ":DP:RO:AO:AF:SB:SOR:RVSB:FS:QVAL_minAF:STATUS")
+ vcf_record = paste(vcf_record, vcf_info, vcf_format, sep = "\t")
+ # all samples
+ if (ref_inv) {
+ genotype <- rep("1/1", l = nindiv)
+ } else {
+ genotype <- rep("0/0", l = nindiv)
+ }
+ heterozygotes <- which(reg_res$GQ >= GQ_threshold & sbs <= SB_threshold_SNV &
+ reg_res$ma_count/reg_res$coverage < 0.75)
+ genotype[heterozygotes] <- "0/1"
+ homozygotes <- which(reg_res$GQ >= GQ_threshold & sbs <= SB_threshold_SNV &
+ reg_res$ma_count/reg_res$coverage >= 0.75)
+ if (ref_inv) {
+ genotype[homozygotes] <- "0/0"
+ } else {
+ genotype[homozygotes] <- "1/1"
+ }
+ # no tumor variant but low power -> './.'
+ genotype[(reg_res$GQ < GQ_threshold) & (qval_minAF < GQ_threshold)] <- "./."
+
+ for (cur_sample in 1:nindiv) {
+
+ geno = paste0(genotype[cur_sample], ":", reg_res$GQ[cur_sample],
+ ":", DP[cur_sample], ":", (Rp + Rm)[cur_sample], ":", ma_count[cur_sample],
+ ":", (ma_count/DP)[cur_sample], ":", Rp[cur_sample], ",", Rm[cur_sample],
+ ",", Vp[cur_sample], ",", Vm[cur_sample], ":", sors[cur_sample],
+ ":", rvsbs[cur_sample], ":", FisherStrand[cur_sample], ":",
+ qval_minAF[cur_sample], ":", somatic_status[cur_sample])
+ vcf_record = paste(vcf_record, geno, sep = "\t")
+
+ }
+
+ vcf_record = paste0(vcf_record, "\n")
+ vcf_records = append(vcf_records, vcf_record)
+
+
+ if (!ref_inv & nchar(cur_del) > 10)
+ cur_del <- paste(substr(cur_del, 1, 3 + match(cur_del, uniq_del)),
+ substr(cur_del, nchar(cur_del) - (3 + match(cur_del, uniq_del)),
+ nchar(cur_del)), sep = "x")
+ if (ref_inv & nchar(ref) > 10)
+ ref <- paste(substr(ref, 1, 3 + match(ref, uniq_del)), substr(ref,
+ nchar(ref) - (3 + match(ref, uniq_del)), nchar(ref)), sep = "x")
+
+
+ }
+ }
+ }
+ }
+ return(list(qvals = qvals, vcf_records = vcf_records))
+}
+model_for_insertion <- function(linepos) {
+ # INS
+ lins <- linepos[insertion_cols]
+ names(lins) <- id_samples
+ all_ins <- lins[!(is.na(lins))]
+ qvals = matrix(nrow = 0, ncol = length(output_cols))
+ dimnames(qvals) = list(NULL, output_cols)
+ vcf_records = list()
+ ref = linepos[3]
+ if (length(all_ins) > 0) {
+ all_ins <- as.data.frame(strsplit(unlist(strsplit(paste(all_ins), split = "|",
+ fixed = T)), split = ":", fixed = T), stringsAsFactors = F, )
+ uniq_ins <- unique(toupper(as.character(all_ins[2, ])))
+ coverage_all_ins <- sum(as.numeric(all_ins[1, ]))
+ for (cur_ins in uniq_ins) {
+
+ Vp <- rep(0, nindiv)
+ names(Vp) <- id_samples
+ Vm <- Vp
+ all_cur_ins <- strsplit(grep(cur_ins, lins, ignore.case = T, value = T),
+ split = "|", fixed = T)
+ all_cur_ins_p <- lapply(all_cur_ins, function(x) {
+ grep(paste(":", cur_ins, "$", sep = ""), x, value = T)
+ })
+ all_cur_ins_m <- lapply(all_cur_ins, function(x) {
+ grep(paste(":", tolower(cur_ins), "$", sep = ""), x, value = T)
+ })
+ ma_p_cur_ins <- unlist(lapply(all_cur_ins_p, function(x) {
+ as.numeric(gsub(paste("([0-9]+):", cur_ins, "$", sep = ""), "\\1",
+ x, ignore.case = T))
+ }))
+ ma_m_cur_ins <- unlist(lapply(all_cur_ins_m, function(x) {
+ as.numeric(gsub(paste("([0-9]+):", cur_ins, "$", sep = ""), "\\1",
+ x, ignore.case = T))
+ }))
+ Vp[names(ma_p_cur_ins)] <- ma_p_cur_ins
+ Vm[names(ma_m_cur_ins)] <- ma_m_cur_ins
+ ma_count <- Vp + Vm
+ DP <- as.numeric(linepos[eval(depth)])
+ if (sum((ma_count/DP) > 0.8, na.rm = T) > 0.5 * length(ma_count)) {
+ # here we need to reverse alt and ref for the regression (use
+ # ma_count of the ref)
+ ref <- linepos[3]
+ cur_ins_inv <- linepos[3]
+ Vp_inv <- as.numeric(linepos[eval(as.name(paste(cur_ins_inv, "_cols",
+ sep = "")))])
+ Vm_inv <- as.numeric(linepos[eval(as.name(paste(tolower(cur_ins_inv),
+ "_cols", sep = "")))])
+ ma_count_inv <- Vp_inv + Vm_inv
+ ref_inv <- TRUE
+ reg_res <- glmrob.nb(x = DP, y = ma_count_inv, min_coverage = min_coverage,
+ min_reads = min_reads, min_af = min_af, extra_rob = extra_rob,
+ min_af_extra_rob = min_af_extra_rob, min_prop_extra_rob = min_prop_extra_rob,
+ max_prop_extra_rob = max_prop_extra_rob)
+
+ } else {
+ ref_inv <- FALSE
+ reg_res <- glmrob.nb(x = DP, y = ma_count, min_coverage = min_coverage,
+ min_reads = min_reads, min_af = min_af, extra_rob = extra_rob,
+ min_af_extra_rob = min_af_extra_rob, min_prop_extra_rob = min_prop_extra_rob,
+ max_prop_extra_rob = max_prop_extra_rob)
+
+ }
+ output = c(linepos[1:3], cur_ins, "INS", reg_res$coef["sigma"], reg_res$coef["slope"],
+ ref_inv, reg_res$GQ)
+ names(output) = output_cols
+ qvals = rbind(qvals, output)
+
+ # compute Qval20pc
+ qval_minAF <- rep(0, nindiv)
+ somatic_status <- rep(".", nindiv)
+
+
+ if (afmin_power == -1) {
+ # no minimum frequency supplied -> use a negative binomial
+ # distribution to check the power
+ qval_minAF <- sapply(1:nindiv, function(ii) toQvalueN(DP[ii], reg_res,
+ sigma))
+ } else {
+ # minimum frequency supplied -> use a binomial distribution
+ # centered around afmin_power to check the power
+ qval_minAF <- sapply(1:nindiv, function(ii) toQvalueT(DP[ii], reg_res,
+ afmin_power))
+ }
+
+
+ if (!is.na(reg_res$coef["slope"]) & sum(reg_res$GQ >= GQ_threshold, na.rm = TRUE) >
+ 0) {
+ all_AO <- sum(ma_count)
+ all_DP <- sum(as.numeric(linepos[eval(depth)])) + sum(ma_count)
+ common_annot(linepos, Vp, Vm)
+ all_RO <- sum(Rp + Rm)
+ if (SB_type == "SOR") {
+ sbs <- sors
+ } else {
+ if (SB_type == "RVSB") {
+ sbs <- rvsbs
+ } else {
+ sbs <- FisherStrand
+ }
+ }
+ if (sum(sbs <= SB_threshold_indel & reg_res$GQ >= GQ_threshold, na.rm = TRUE) >
+ 0) {
+ con <- pipe(paste(samtools, " faidx ", fasta_ref, " ", linepos[1],
+ ":", as.numeric(linepos[2]) - 2, "-", as.numeric(linepos[2]),
+ " | tail -n1", sep = ""))
+ before <- toupper(readLines(con))
+ close(con)
+ con <- pipe(paste(samtools, " faidx ", fasta_ref, " ", linepos[1],
+ ":", as.numeric(linepos[2]) + 1, "-", as.numeric(linepos[2]) +
+ 3, " | tail -n1", sep = ""))
+ after <- toupper(readLines(con))
+ close(con)
+ prev_bp <- substr(before, 3, 3)
+
+ vcf_record = paste0(linepos[1], "\t", linepos[2], "\t", ".", "\t",
+ prev_bp, "\t", paste(prev_bp, cur_ins, sep = ""), "\t", max(reg_res$GQ),
+ "\t", ".")
+
+ # INFO field
+
+ vcf_info = paste0("TYPE=ins;NS=", sum(as.numeric(linepos[eval(depth)]) >
+ 0), ";AF=", sum(reg_res$GQ >= GQ_threshold)/sum(as.numeric(linepos[eval(depth)]) >
+ 0), ";DP=", all_DP, ";RO=", all_RO, ";AO=", all_AO, ";SRF=",
+ sum(Rp), ";SRR=", sum(Rm), ";SAF=", sum(Vp), ";SAR=", sum(Vm),
+ ";SOR=", all_sor, ";RVSB=", all_rvsb, ";FS=", FisherStrand_all,
+ ";ERR=", reg_res$coef["slope"], ";SIG=", reg_res$coef["sigma"],
+ ";CONT=", paste(before, after, sep = "x"), ifelse(reg_res$extra_rob &
+ !(ref_inv), ";WARN=EXTRA_ROBUST_GL", ""), ifelse(reg_res$extra_rob &
+ ref_inv, ";WARN=EXTRA_ROBUST_GL/INV_REF", ""), ifelse(ref_inv &
+ !(reg_res$extra_rob), ";WARN=INV_REF", ""))
+ # FORMAT field
+
+ vcf_format = paste0("GT:", ifelse(ref_inv, "QVAL_INV", "QVAL"),
+ ":DP:RO:AO:AF:SB:SOR:RVSB:FS:QVAL_minAF:STATUS")
+ # all samples
+ if (ref_inv) {
+ genotype <- rep("1/1", l = nindiv)
+ } else {
+ genotype <- rep("0/0", l = nindiv)
+ }
+ heterozygotes <- which(reg_res$GQ >= GQ_threshold & sbs <= SB_threshold_SNV &
+ reg_res$ma_count/reg_res$coverage < 0.75)
+ genotype[heterozygotes] <- "0/1"
+ homozygotes <- which(reg_res$GQ >= GQ_threshold & sbs <= SB_threshold_SNV &
+ reg_res$ma_count/reg_res$coverage >= 0.75)
+ if (ref_inv) {
+ genotype[homozygotes] <- "0/0"
+ } else {
+ genotype[homozygotes] <- "1/1"
+ }
+ # no tumor variant but low power -> './.'
+ genotype[(reg_res$GQ < GQ_threshold) & (qval_minAF < GQ_threshold)] <- "./."
+ vcf_record = paste(vcf_record, vcf_info, vcf_format, sep = "\t")
+ for (cur_sample in 1:nindiv) {
+
+ geno = paste0(genotype[cur_sample], ":", reg_res$GQ[cur_sample],
+ ":", DP[cur_sample], ":", (Rp + Rm)[cur_sample], ":", ma_count[cur_sample],
+ ":", (ma_count/DP)[cur_sample], ":", Rp[cur_sample], ",", Rm[cur_sample],
+ ",", Vp[cur_sample], ",", Vm[cur_sample], ":", sors[cur_sample],
+ ":", rvsbs[cur_sample], ":", FisherStrand[cur_sample], ":",
+ qval_minAF[cur_sample], ":", somatic_status[cur_sample])
+ vcf_record = paste(vcf_record, geno, sep = "\t")
+ }
+
+
+ vcf_record = paste0(vcf_record, "\n")
+ vcf_records = append(vcf_records, vcf_record)
+
+ if (!ref_inv & nchar(cur_ins) > 10)
+ cur_ins <- paste(substr(cur_ins, 1, 3 + match(cur_ins, uniq_ins)),
+ substr(cur_ins, nchar(cur_ins) - (3 + match(cur_ins, uniq_ins)),
+ nchar(cur_ins)), sep = "x")
+ if (ref_inv & nchar(ref) > 10)
+ ref <- paste(substr(ref, 1, 3 + match(ref, uniq_ins)), substr(ref,
+ nchar(ref) - (3 + match(ref, uniq_ins)), nchar(ref)), sep = "x")
+ }
+ }
+ }
+ }
+ return(list(qvals = qvals, vcf_records = vcf_records))
+}
+model_for_position <- function(line) {
+
+ linepos <- unlist(strsplit(line, "\t"))
+ qvals = matrix(, nrow = 0, ncol = length(output_cols))
+ dimnames(qvals) <- list(NULL, output_cols)
+ vcf_records = list()
+ if (is.element(linepos[3], c("A", "T", "C", "G"))) {
+
+ msnv = model_for_snv(linepos)
+ mdel = model_for_deletion(linepos)
+ mins = model_for_insertion(linepos)
+
+ qvals = rbind(qvals, msnv$qvals, mdel$qvals, mins$qvals)
+ vcf_records = c(vcf_records, msnv$vcf_records, mdel$vcf_records, mins$vcf_records)
+
+ }
+ return(list(qvals = qvals, vcf_records = vcf_records))
+
+
+}
+
+if (file.exists(out_vcf_file)) file.remove(out_vcf_file)
+
+write_vcf_header(indiv_run)
+
+
+
+###############################################################################################
+
+############################## REGRESSION SECTION
+############################## ##############################
+
+# Reading input file (output of mpileup2readcounts)
+f <- file(input_file, "r")
+lines = readLines(f)
+qvals_per_position = mclapply(lines, model_for_position, mc.cores = num_cores)
+qvals = do.call(rbind, lapply(qvals_per_position, function(x) x$qvals))
+write.table(qvals, out_qvals_file, sep = "\t", row.names = F, quote = F)
+vcf_records = unlist(lapply(qvals_per_position, function(x) x$vcf_records))
+con <- file(out_vcf_file, "a")
+writeLines(vcf_records, sep = "", con)
+close(con)
+
+
+print(Sys.time())
+###############################################################################################