From 3da1317e84445c7a184b3ff53f4bf6b33933c6cf Mon Sep 17 00:00:00 2001 From: Veena Rajaraman Date: Wed, 4 Dec 2024 14:24:49 -0800 Subject: [PATCH] adding parallelized version --- bin/needlestack_parallel.r | 923 +++++++++++++++++++++++++++++++++++++ 1 file changed, 923 insertions(+) create mode 100644 bin/needlestack_parallel.r diff --git a/bin/needlestack_parallel.r b/bin/needlestack_parallel.r new file mode 100644 index 0000000..14c7c53 --- /dev/null +++ b/bin/needlestack_parallel.r @@ -0,0 +1,923 @@ +#! /usr/bin/env Rscript + +# needlestack: a multi-sample somatic variant caller Copyright (C) 2017 +# IARC/WHO This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by the Free +# Software Foundation, either version 3 of the License, or (at your option) any +# later version. This program is distributed in the hope that it will be +# useful, but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General +# Public License for more details. You should have received a copy of the GNU +# General Public License along with this program. If not, see +# . +library(parallel) + +options(warn = 1) +print(Sys.time()) + +############################## ARGUMENTS SECTION ############################## +args <- commandArgs(TRUE) +parseArgs <- function(x) strsplit(sub("^--", "", x), "=") +argsL <- as.list(as.character(as.data.frame(do.call("rbind", parseArgs(args)))$V2)) +names(argsL) <- as.data.frame(do.call("rbind", parseArgs(args)))$V1 +args <- argsL +rm(argsL) + +if ("--help" %in% args | is.null(args$out_vcf_file) | is.null(args$out_qvals_file) | + is.null(args$fasta_ref)) { + cat(" + The R Script arguments_section.R + Mandatory arguments: + --input_file=path - path to input file file (combined read counts for all samples in names file) + --names_file=path - path to names file (tsv, no header, path,sample_name) + --out_vcf_file=file_name - name of output vcf + --out_qvals_file=file_name - name of output qvals file + --fasta_ref=path - path of fasta ref + --help - print this text + Optional arguments: + --num_cores - number of cores to use, default=1 + --source_path=path - path to source files (glm_rob_nb.r) + --samtools=path - path of samtools, default=samtools + --SB_type=SOR, RVSB or FS - strand bias measure, default=SOR + --SB_threshold_SNV=value - strand bias threshold for SNV, default=100 + --SB_threshold_indel=value - strand bias threshold for indel, default=100 + --min_coverage=value - minimum coverage in at least one sample to consider a site, default=50 + --min_reads=value - minimum number of non-ref reads in at least one sample to consider a site, default=5 + --min_af=value - minimum allelic fraction in at least one sample to consider a site, default=0 + --GQ_threshold=value - phred scale qvalue threshold for variants, default=50 + --output_all_SNVs=boolean - output all SNVs, even when no variant is detected, default=FALSE + --ref_genome=string - reference genome for alignments plot, examples : Hsapiens.UCSC.hg19, Hsapiens.UCSC.hg18, Mmusculus.UCSC.mm10... + --extra_rob=boolean - perform an extra-robust regression, default=FALSE + --min_af_extra_rob=value - minimum allelic fraction to exclude a sample at a position for extra-robust regression, default=0.2 + --min_prop_extra_rob=value - minimum proportion of samples having an allelic fraction to be excluded from extra-robust regression, default=0.1 + --max_prop_extra_rob=value - maximum proportion of samples having an allelic fraction to be excluded from extra-robust regression, default=0.1 + --afmin_power=value - minimum allelic fraction in mean for somatic mutations, default=0.01 + --sigma=value - sigma parameter for negative binomial modeling germline mutations, default=0.1 + WARNING : by default samtools has to be in your path + Example: + needlestack_mt.r --input_file=read_counts.tsv --names_file=names.txt --out_qvals_file=test_qvals.vcf --out_vcf_file=test.vcf --fasta_ref=path_to_ref_fasta \n\n") + + q(save = "no") +} +if (is.null(args$num_cores)) { + args$num_cores <- 1 +} else { + args$num_cores <- as.numeric(args$num_cores) +} +if (is.null(args$samtools)) { + args$samtools <- "samtools" +} +if (is.null(args$SB_type)) { + args$SB_type <- "SOR" +} +if (is.null(args$SB_threshold_SNV)) { + args$SB_threshold_SNV <- 100 +} else { + args$SB_threshold_SNV <- as.numeric(args$SB_threshold_SNV) +} +if (is.null(args$SB_threshold_indel)) { + args$SB_threshold_indel <- 100 +} else { + args$SB_threshold_indel <- as.numeric(args$SB_threshold_indel) +} +if (is.null(args$min_coverage)) { + args$min_coverage <- 30 +} else { + args$min_coverage <- as.numeric(args$min_coverage) +} +if (is.null(args$min_reads)) { + args$min_reads <- 3 +} else { + args$min_reads <- as.numeric(args$min_reads) +} +if (is.null(args$min_af)) { + args$min_af <- 0 +} else { + args$min_af <- as.numeric(args$min_af) +} +if (is.null(args$GQ_threshold)) { + args$GQ_threshold <- 50 +} else { + args$GQ_threshold <- as.numeric(args$GQ_threshold) +} +if (is.null(args$output_all_SNVs)) { + args$output_all_SNVs <- FALSE +} else { + args$output_all_SNVs <- as.logical(args$output_all_SNVs) +} + + + +if (is.null(args$extra_rob)) { + args$extra_rob <- FALSE +} else { + args$extra_rob <- as.logical(args$extra_rob) +} +if (is.null(args$min_af_extra_rob)) { + args$min_af_extra_rob <- 0.2 +} else { + args$min_af_extra_rob <- as.numeric(args$min_af_extra_rob) +} +if (is.null(args$min_prop_extra_rob)) { + args$min_prop_extra_rob <- 0.1 +} else { + args$min_prop_extra_rob <- as.numeric(args$min_prop_extra_rob) +} +if (is.null(args$max_prop_extra_rob)) { + args$max_prop_extra_rob <- 0.8 +} else { + args$max_prop_extra_rob <- as.numeric(args$max_prop_extra_rob) +} +if (is.null(args$afmin_power)) { + args$afmin_power <- -1 +} else { + args$afmin_power <- as.numeric(args$afmin_power) +} +if (is.null(args$sigma)) { + args$sigma <- 0.1 +} else { + args$sigma <- as.numeric(args$sigma) +} +num_cores <- args$num_cores +samtools <- args$samtools +input_file <- args$input_file +names_file <- args$names_file +out_vcf_file <- args$out_vcf_file +out_qvals_file <- args$out_qvals_file +fasta_ref <- args$fasta_ref +GQ_threshold <- args$GQ_threshold +min_coverage <- args$min_coverage +min_reads <- args$min_reads +min_af <- args$min_af +print(min_af) +# http://gatkforums.broadinstitute.org/discussion/5533/strandoddsratio-computation +# filter out SOR > 4 for SNVs and > 10 for indels filter out RVSB > 0.85 (maybe +# less stringent for SNVs) +SB_type <- args$SB_type +SB_threshold_SNV <- args$SB_threshold_SNV +SB_threshold_indel <- args$SB_threshold_indel +output_all_SNVs <- args$output_all_SNVs + + +ref_genome <- args$ref_genome + + +extra_rob <- args$extra_rob +min_af_extra_rob <- args$min_af_extra_rob +min_prop_extra_rob <- args$min_prop_extra_rob +max_prop_extra_rob <- args$max_prop_extra_rob +sigma <- args$sigma +afmin_power <- args$afmin_power + + + + +# source R files +getScriptPath <- function() { + cmd.args <- commandArgs() + m <- regexpr("(?<=^--file=).+", cmd.args, perl = TRUE) + script.dir <- dirname(regmatches(cmd.args, m)) + if (length(script.dir) == 0) + stop("can't determine script dir: please call the script with Rscript") + if (length(script.dir) > 1) + stop("can't determine script dir: more than one '--file' argument detected") + return(script.dir) +} + +if (is.null(args$source_path)) { + source_path <- paste0(getScriptPath(), "/") +} else { + source_path <- args$source_path +} + +source(paste(source_path, "glm_rob_nb.r", sep = "")) + + + +############################################################ + +options(scipen = 100) + +indiv_run <- read.table(names_file, stringsAsFactors = F, colClasses = "character") +# check if samples names exist in names_file file +if (ncol(indiv_run) != 2) { + cat("Error : names_file contains only one column : samples names can't be obtained from the bam files : use --use_file_name option") + q(save = "no") +} +indiv_run[, 2] <- make.unique(indiv_run[, 2], sep = "_") + +nindiv <- nrow(indiv_run) + +id_samples <- seq(1, nindiv) # samples IDs (integers) + +base_cols = c("CHROM", "POS", "REF", "ALT", "TYPE", "DISPERSION", "ERROR", "REF_INV") +output_cols = c(base_cols, indiv_run[, 2]) + + + +# columns numbers in the input file +depth <- (1:nindiv) * 11 - 10 + 3 +A_cols <- (1:nindiv) * 11 - 9 + 3 +T_cols <- (1:nindiv) * 11 - 8 + 3 +C_cols <- (1:nindiv) * 11 - 7 + 3 +G_cols <- (1:nindiv) * 11 - 6 + 3 +a_cols <- (1:nindiv) * 11 - 5 + 3 +t_cols <- (1:nindiv) * 11 - 4 + 3 +c_cols <- (1:nindiv) * 11 - 3 + 3 +g_cols <- (1:nindiv) * 11 - 2 + 3 +insertion_cols <- (1:nindiv) * 11 - 1 + 3 +deletion_cols <- (1:nindiv) * 11 - 0 + 3 + + +############################## FUNCTION SECTION ############################## +non_ref_bases <- function(ref) { + setdiff(c("A", "T", "C", "G"), ref) +} + +SOR <- function(RO_forward, AO_forward, RO_reverse, AO_reverse) { + X00 <- RO_forward + X10 <- AO_forward + X01 <- RO_reverse + X11 <- AO_reverse + refRatio <- pmax(X00, X01)/pmin(X00, X01) + altRatio <- pmax(X10, X11)/pmin(X10, X11) + R <- (X00/X01) * (X11/X10) + log((R + 1/R)/(refRatio/altRatio)) +} + +common_annot <- function(linepos, Vp, Vm) { + Rp <<- as.numeric(linepos[eval(as.name(paste(linepos[3], "_cols", sep = "")))]) + Rm <<- as.numeric(linepos[eval(as.name(paste(tolower(linepos[3]), "_cols", sep = "")))]) + Cp <<- Vp + Rp + Cm <<- Vm + Rm + tmp_rvsbs <- pmax(Vp * Cm, Vm * Cp)/(Vp * Cm + Vm * Cp) + tmp_rvsbs[which(is.na(tmp_rvsbs))] <- -1 + rvsbs <<- tmp_rvsbs + all_rvsb <<- max(as.numeric(sum(Vp)) * sum(Cm), as.numeric(sum(Vm)) * sum(Cp))/(as.numeric(sum(Vp)) * + sum(Cm) + as.numeric(sum(Vm)) * sum(Cp)) + if (is.na(all_rvsb)) + all_rvsb <<- (-1) + tmp_sors <<- SOR(Rp, Vp, Rm, Vm) + tmp_sors[which(is.na(tmp_sors))] <- -1 + tmp_sors[which(is.infinite(tmp_sors))] <- 99 + sors <<- tmp_sors + all_sor <<- SOR(sum(Rp), sum(Vp), sum(Rm), sum(Vm)) + if (is.na(all_sor)) + all_sor <<- (-1) + if (is.infinite(all_sor)) + all_sor <<- 99 + FisherStrand <<- -10 * log10(unlist(lapply(1:nindiv, function(indiv) fisher.test(matrix(c(Vp[indiv], + Vm[indiv], Rp[indiv], Rm[indiv]), nrow = 2))$p.value))) # col1=alt(V), col2=ref(R) + FisherStrand[which(FisherStrand > 1000)] <- 1000 + FisherStrand_all <<- -10 * log10(fisher.test(matrix(c(sum(Vp), sum(Vm), sum(Rp), + sum(Rm)), nrow = 2))$p.value) + if (FisherStrand_all > 1000) + FisherStrand_all <- 1000 + all_RO <<- sum(Rp + Rm) +} + +## functions to compute the Qvalue of a function with a given AF change sigma +## value to change the departure from binomial distribution with parameter 0.5 +toQvalueN <- function(x, rob_nb_res, sigma) { + y <- qnbinom(0.01, size = 1/sigma, mu = 0.5 * x) + unlist(-10 * log10(p.adjust((dnbinom(c(rob_nb_res$ma_count, y), size = 1/rob_nb_res$coef[[1]], + mu = rob_nb_res$coef[[2]] * c(rob_nb_res$coverage, x)) + pnbinom(c(rob_nb_res$ma_count, + y), size = 1/rob_nb_res$coef[[1]], mu = rob_nb_res$coef[[2]] * c(rob_nb_res$coverage, + x), lower.tail = F)))[length(rob_nb_res$coverage) + 1])) +} + +toQvalueT <- function(x, rob_nb_res, afmin_power) { + # change afmin_power to + y <- qbinom(0.01, x, afmin_power, lower.tail = T) + unlist(-10 * log10(p.adjust((dnbinom(c(rob_nb_res$ma_count, y), size = 1/rob_nb_res$coef[[1]], + mu = rob_nb_res$coef[[2]] * c(rob_nb_res$coverage, x)) + pnbinom(c(rob_nb_res$ma_count, + y), size = 1/rob_nb_res$coef[[1]], mu = rob_nb_res$coef[[2]] * c(rob_nb_res$coverage, + x), lower.tail = F)))[length(rob_nb_res$coverage) + 1])) +} + +############################################################################################### + + +############################## WRITEOUT SECTION ############################## +write_out <- function(...) { + cat(paste(..., sep = ""), "\n", file = out_vcf_file, sep = "", append = T) +} + +write_vcf_header <- function(samples) { + + write_out("##fileformat=VCFv4.1") + write_out("##fileDate=", format(Sys.Date(), "%Y%m%d")) + write_out("##source=needlestack v1.1") + write_out("##reference=", fasta_ref) + write_out("##contig=") + write_out("##contig=") + write_out("##phasing=none") + write_out("##filter=\"QVAL > ", GQ_threshold, " & ", SB_type, "_SNV < ", SB_threshold_SNV, + " & ", SB_type, "_INDEL < ", SB_threshold_indel, " & min(AO) >= ", min_reads, + " & min(AF) >= ", min_af, " & min(DP) >= ", min_coverage, "\"") + write_out("##INFO=") + write_out("##INFO=") + write_out("##INFO=") + write_out("##INFO=") + write_out("##INFO=") + write_out("##INFO=") + write_out("##INFO=") + write_out("##INFO=") + write_out("##INFO=") + write_out("##INFO=") + write_out("##INFO=") + write_out("##INFO=") + write_out("##INFO=") + write_out("##INFO=") + write_out("##INFO=") + write_out("##INFO=") + write_out("##INFO=") + + write_out("##FORMAT=") + write_out("##FORMAT=") + write_out("##FORMAT=") + write_out("##FORMAT=") + write_out("##FORMAT=") + write_out("##FORMAT=") + write_out("##FORMAT=") + write_out("##FORMAT=") + write_out("##FORMAT=") + write_out("##FORMAT=") + write_out("##FORMAT=") + write_out("##FORMAT=") + write_out("##FORMAT=") + write_out("##FORMAT=") + + write_out("#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t", paste(samples[, + 2], collapse = "\t")) + +} + +model_for_snv <- function(linepos) { + qvals = matrix(nrow = 0, ncol = length(output_cols)) + dimnames(qvals) = list(NULL, output_cols) + vcf_records = list() + + # SNV for each base different from the reference base on the + + for (alt in non_ref_bases(linepos[3])) { + + ref <- linepos[3] + + # count the number of alternative bases (nonreference) + + Vp <- as.numeric(linepos[eval(as.name(paste(alt, "_cols", sep = "")))]) + Vm <- as.numeric(linepos[eval(as.name(paste(tolower(alt), "_cols", sep = "")))]) + ma_count <- Vp + Vm + DP <- as.numeric(linepos[eval(depth)]) + + if (sum((ma_count/DP) > 0.8, na.rm = T) > 0.5 * length(ma_count)) { + # here we need to reverse alt and ref for the regression (use + # ma_count of the ref) + alt_inv <- linepos[3] + Vp_inv <- as.numeric(linepos[eval(as.name(paste(alt_inv, "_cols", sep = "")))]) + Vm_inv <- as.numeric(linepos[eval(as.name(paste(tolower(alt_inv), "_cols", + sep = "")))]) + ma_count_inv <- Vp_inv + Vm_inv + ref_inv <- TRUE + + reg_res <- glmrob.nb(x = DP, y = ma_count_inv, min_coverage = min_coverage, + min_reads = min_reads, min_af = min_af, extra_rob = extra_rob, min_af_extra_rob = min_af_extra_rob, + min_prop_extra_rob = min_prop_extra_rob, max_prop_extra_rob = max_prop_extra_rob) + } else { + ref_inv <- FALSE + reg_res <- glmrob.nb(x = DP, y = ma_count, min_coverage = min_coverage, + min_reads = min_reads, min_af = min_af, extra_rob = extra_rob, min_af_extra_rob = min_af_extra_rob, + min_prop_extra_rob = min_prop_extra_rob, max_prop_extra_rob = max_prop_extra_rob) + } + + # compute Qval for minAF + qval_minAF <- rep(0, nindiv) + somatic_status <- rep(".", nindiv) + + if (afmin_power == -1) { + # no minimum frequency supplied -> use a negative binomial + # distribution to check the power + qval_minAF <- sapply(1:nindiv, function(ii) toQvalueN(DP[ii], reg_res, + sigma)) + } else { + # minimum frequency supplied -> use a binomial distribution + # centered around afmin_power to check the power + qval_minAF <- sapply(1:nindiv, function(ii) toQvalueT(DP[ii], reg_res, + afmin_power)) + } + output = c(linepos[1:3], alt, "SNV", reg_res$coef["sigma"], reg_res$coef["slope"], + ref_inv, reg_res$GQ) + names(output) = output_cols + qvals = rbind(qvals, output) + + + if (output_all_SNVs | (!is.na(reg_res$coef["slope"]) & sum(reg_res$GQ >= + GQ_threshold, na.rm = TRUE) > 0)) { + all_AO <- sum(ma_count) + all_DP <- sum(as.numeric(linepos[eval(depth)])) + common_annot(linepos, Vp, Vm) + all_RO <- sum(Rp + Rm) + if (SB_type == "SOR") { + sbs <- sors + } else { + if (SB_type == "RVSB") { + sbs <- rvsbs + } else { + sbs <- FisherStrand + } + } + if (output_all_SNVs | (sum(sbs <= SB_threshold_SNV & reg_res$GQ >= GQ_threshold, + na.rm = TRUE) > 0)) { + # to get context around the base + con <- pipe(paste(samtools, " faidx ", fasta_ref, " ", linepos[1], + ":", as.numeric(linepos[2]) - 3, "-", as.numeric(linepos[2]) - + 1, " | tail -n1", sep = "")) + before <- readLines(con) + close(con) + con <- pipe(paste(samtools, " faidx ", fasta_ref, " ", linepos[1], + ":", as.numeric(linepos[2]) + 1, "-", as.numeric(linepos[2]) + + 3, " | tail -n1", sep = "")) + after <- readLines(con) + close(con) + + + vcf_record = paste0(linepos[1], "\t", linepos[2], "\t", ".", "\t", + linepos[3], "\t", alt, "\t", max(reg_res$GQ), "\t", ".") + # INFO field + + vcf_info = paste0("TYPE=snv;NS=", sum(as.numeric(linepos[eval(depth)]) > + 0), ";AF=", sum(reg_res$GQ >= GQ_threshold)/sum(as.numeric(linepos[eval(depth)]) > + 0), ";DP=", all_DP, ";RO=", all_RO, ";AO=", all_AO, ";SRF=", sum(Rp), + ";SRR=", sum(Rm), ";SAF=", sum(Vp), ";SAR=", sum(Vm), ";SOR=", + all_sor, ";RVSB=", all_rvsb, ";FS=", FisherStrand_all, ";ERR=", + reg_res$coef["slope"], ";SIG=", reg_res$coef["sigma"], ";CONT=", + paste(before, after, sep = "x"), ifelse(reg_res$extra_rob & !(ref_inv), + ";WARN=EXTRA_ROBUST_GL", ""), ifelse(reg_res$extra_rob & ref_inv, + ";WARN=EXTRA_ROBUST_GL/INV_REF", ""), ifelse(ref_inv & !(reg_res$extra_rob), + ";WARN=INV_REF", "")) + # FORMAT field + + vcf_format = paste0("GT:", ifelse(ref_inv, "QVAL_INV", "QVAL"), ":DP:RO:AO:AF:SB:SOR:RVSB:FS:QVAL_minAF:STATUS") + + vcf_record = paste(vcf_record, vcf_info, vcf_format, sep = "\t") + # all samples + if (ref_inv) { + genotype <- rep("1/1", l = nindiv) + } else { + genotype <- rep("0/0", l = nindiv) + } + heterozygotes <- which(reg_res$GQ >= GQ_threshold & sbs <= SB_threshold_SNV & + reg_res$ma_count/reg_res$coverage < 0.75) + genotype[heterozygotes] <- "0/1" + homozygotes <- which(reg_res$GQ >= GQ_threshold & sbs <= SB_threshold_SNV & + reg_res$ma_count/reg_res$coverage >= 0.75) + if (ref_inv) { + genotype[homozygotes] <- "0/0" + } else { + genotype[homozygotes] <- "1/1" + } + # no tumor variant but low power -> './.' + genotype[(reg_res$GQ < GQ_threshold) & (qval_minAF < GQ_threshold)] <- "./." + for (cur_sample in 1:nindiv) { + + geno = paste0(genotype[cur_sample], ":", reg_res$GQ[cur_sample], + ":", DP[cur_sample], ":", (Rp + Rm)[cur_sample], ":", ma_count[cur_sample], + ":", (ma_count/DP)[cur_sample], ":", Rp[cur_sample], ",", Rm[cur_sample], + ",", Vp[cur_sample], ",", Vm[cur_sample], ":", sors[cur_sample], + ":", rvsbs[cur_sample], ":", FisherStrand[cur_sample], ":", qval_minAF[cur_sample], + ":", somatic_status[cur_sample]) + vcf_record = paste(vcf_record, geno, sep = "\t") + + } + vcf_record = paste0(vcf_record, "\n") + vcf_records = append(vcf_records, vcf_record) + + + } + } + } + + + return(list(qvals = qvals, vcf_records = vcf_records)) +} +model_for_deletion <- function(linepos) { + # DEL + ldel <- linepos[deletion_cols] + + names(ldel) <- id_samples + all_del <- ldel[!(is.na(ldel))] + + qvals = matrix(nrow = 0, ncol = length(output_cols)) + dimnames(qvals) = list(NULL, output_cols) + vcf_records = list() + ref = linepos[3] + if (length(all_del) > 0) { + all_del <- as.data.frame(strsplit(unlist(strsplit(paste(all_del), split = "|", + fixed = T)), split = ":", fixed = T), stringsAsFactors = F, ) + uniq_del <- unique(toupper(as.character(all_del[2, ]))) + + + coverage_all_del <- sum(as.numeric(all_del[1, ])) + + for (cur_del in uniq_del) { + + Vp <- rep(0, nindiv) + names(Vp) <- id_samples + Vm <- Vp + all_cur_del <- strsplit(grep(cur_del, ldel, ignore.case = T, value = T), + split = "|", fixed = T) + all_cur_del_p <- lapply(all_cur_del, function(x) { + grep(paste(":", cur_del, "$", sep = ""), x, value = T) + }) + all_cur_del_m <- lapply(all_cur_del, function(x) { + grep(paste(":", tolower(cur_del), "$", sep = ""), x, value = T) + }) + ma_p_cur_del <- unlist(lapply(all_cur_del_p, function(x) { + as.numeric(gsub(paste("([0-9]+):", cur_del, "$", sep = ""), "\\1", + x, ignore.case = T)) + })) + ma_m_cur_del <- unlist(lapply(all_cur_del_m, function(x) { + as.numeric(gsub(paste("([0-9]+):", cur_del, "$", sep = ""), "\\1", + x, ignore.case = T)) + })) + Vp[names(ma_p_cur_del)] <- ma_p_cur_del + Vm[names(ma_m_cur_del)] <- ma_m_cur_del + ma_count <- Vp + Vm + DP <- as.numeric(linepos[eval(depth)]) + if (sum((ma_count/DP) > 0.8, na.rm = T) > 0.5 * length(ma_count)) { + # here we need to reverse alt and ref for the regression (use + # ma_count of the ref) + ref <- linepos[3] + cur_del_inv <- linepos[3] + Vp_inv <- as.numeric(linepos[eval(as.name(paste(cur_del_inv, "_cols", + sep = "")))]) + Vm_inv <- as.numeric(linepos[eval(as.name(paste(tolower(cur_del_inv), + "_cols", sep = "")))]) + ma_count_inv <- Vp_inv + Vm_inv + ref_inv <- TRUE + reg_res <- glmrob.nb(x = DP, y = ma_count_inv, min_coverage = min_coverage, + min_reads = min_reads, min_af = min_af, extra_rob = extra_rob, + min_af_extra_rob = min_af_extra_rob, min_prop_extra_rob = min_prop_extra_rob, + max_prop_extra_rob = max_prop_extra_rob) + + } else { + ref_inv <- FALSE + reg_res <- glmrob.nb(x = DP, y = ma_count, min_coverage = min_coverage, + min_reads = min_reads, min_af = min_af, extra_rob = extra_rob, + min_af_extra_rob = min_af_extra_rob, min_prop_extra_rob = min_prop_extra_rob, + max_prop_extra_rob = max_prop_extra_rob) + + } + output = c(linepos[1:3], cur_del, "DEL", reg_res$coef["sigma"], reg_res$coef["slope"], + ref_inv, reg_res$GQ) + + names(output) = output_cols + qvals = rbind(qvals, output) + # compute Qval20pc + qval_minAF <- rep(0, nindiv) + somatic_status <- rep(".", nindiv) + + if (afmin_power == -1) { + # no minimum frequency supplied -> use a negative binomial + # distribution to check the power + qval_minAF <- sapply(1:nindiv, function(ii) toQvalueN(DP[ii], reg_res, + sigma)) + } else { + # minimum frequency supplied -> use a binomial distribution + # centered around afmin_power to check the power + qval_minAF <- sapply(1:nindiv, function(ii) toQvalueT(DP[ii], reg_res, + afmin_power)) + } + + + if (!is.na(reg_res$coef["slope"]) & sum(reg_res$GQ >= GQ_threshold, na.rm = TRUE) > + 0) { + all_AO <- sum(ma_count) + all_DP <- sum(as.numeric(linepos[eval(depth)])) + sum(ma_count) + common_annot(linepos, Vp, Vm) + all_RO <- sum(Rp + Rm) + if (SB_type == "SOR") { + sbs <- sors + } else { + if (SB_type == "RVSB") { + sbs <- rvsbs + } else { + sbs <- FisherStrand + } + } + if (sum(sbs <= SB_threshold_indel & reg_res$GQ >= GQ_threshold, na.rm = TRUE) > + 0) { + con <- pipe(paste(samtools, " faidx ", fasta_ref, " ", linepos[1], + ":", as.numeric(linepos[2]) + 1 - 3, "-", as.numeric(linepos[2]) + + 1 - 1, " | tail -n1", sep = "")) + before <- toupper(readLines(con)) + close(con) + con <- pipe(paste(samtools, " faidx ", fasta_ref, " ", linepos[1], + ":", as.numeric(linepos[2]) + 1 + nchar(cur_del), "-", as.numeric(linepos[2]) + + 1 + 3 + nchar(cur_del) - 1, " | tail -n1", sep = "")) + after <- toupper(readLines(con)) + close(con) + prev_bp <- substr(before, 3, 3) + next_bp <- substr(after, 1, 1) + + vcf_record = paste0(linepos[1], "\t", as.numeric(linepos[2]), "\t", + ".", "\t", paste(prev_bp, cur_del, sep = ""), "\t", prev_bp, + "\t", max(reg_res$GQ), "\t", ".") + + # INFO field + + vcf_info = paste0("TYPE=del;NS=", sum(as.numeric(linepos[eval(depth)]) > + 0), ";AF=", sum(reg_res$GQ >= GQ_threshold)/sum(as.numeric(linepos[eval(depth)]) > + 0), ";DP=", all_DP, ";RO=", all_RO, ";AO=", all_AO, ";SRF=", + sum(Rp), ";SRR=", sum(Rm), ";SAF=", sum(Vp), ";SAR=", sum(Vm), + ";SOR=", all_sor, ";RVSB=", all_rvsb, ";FS=", FisherStrand_all, + ";ERR=", reg_res$coef["slope"], ";SIG=", reg_res$coef["sigma"], + ";CONT=", paste(before, after, sep = "x"), ifelse(reg_res$extra_rob & + !(ref_inv), ";WARN=EXTRA_ROBUST_GL", ""), ifelse(reg_res$extra_rob & + ref_inv, ";WARN=EXTRA_ROBUST_GL/INV_REF", ""), ifelse(ref_inv & + !(reg_res$extra_rob), ";WARN=INV_REF", "")) + + # FORMAT field + + vcf_format = paste0("GT:", ifelse(ref_inv, "QVAL_INV", "QVAL"), + ":DP:RO:AO:AF:SB:SOR:RVSB:FS:QVAL_minAF:STATUS") + vcf_record = paste(vcf_record, vcf_info, vcf_format, sep = "\t") + # all samples + if (ref_inv) { + genotype <- rep("1/1", l = nindiv) + } else { + genotype <- rep("0/0", l = nindiv) + } + heterozygotes <- which(reg_res$GQ >= GQ_threshold & sbs <= SB_threshold_SNV & + reg_res$ma_count/reg_res$coverage < 0.75) + genotype[heterozygotes] <- "0/1" + homozygotes <- which(reg_res$GQ >= GQ_threshold & sbs <= SB_threshold_SNV & + reg_res$ma_count/reg_res$coverage >= 0.75) + if (ref_inv) { + genotype[homozygotes] <- "0/0" + } else { + genotype[homozygotes] <- "1/1" + } + # no tumor variant but low power -> './.' + genotype[(reg_res$GQ < GQ_threshold) & (qval_minAF < GQ_threshold)] <- "./." + + for (cur_sample in 1:nindiv) { + + geno = paste0(genotype[cur_sample], ":", reg_res$GQ[cur_sample], + ":", DP[cur_sample], ":", (Rp + Rm)[cur_sample], ":", ma_count[cur_sample], + ":", (ma_count/DP)[cur_sample], ":", Rp[cur_sample], ",", Rm[cur_sample], + ",", Vp[cur_sample], ",", Vm[cur_sample], ":", sors[cur_sample], + ":", rvsbs[cur_sample], ":", FisherStrand[cur_sample], ":", + qval_minAF[cur_sample], ":", somatic_status[cur_sample]) + vcf_record = paste(vcf_record, geno, sep = "\t") + + } + + vcf_record = paste0(vcf_record, "\n") + vcf_records = append(vcf_records, vcf_record) + + + if (!ref_inv & nchar(cur_del) > 10) + cur_del <- paste(substr(cur_del, 1, 3 + match(cur_del, uniq_del)), + substr(cur_del, nchar(cur_del) - (3 + match(cur_del, uniq_del)), + nchar(cur_del)), sep = "x") + if (ref_inv & nchar(ref) > 10) + ref <- paste(substr(ref, 1, 3 + match(ref, uniq_del)), substr(ref, + nchar(ref) - (3 + match(ref, uniq_del)), nchar(ref)), sep = "x") + + + } + } + } + } + return(list(qvals = qvals, vcf_records = vcf_records)) +} +model_for_insertion <- function(linepos) { + # INS + lins <- linepos[insertion_cols] + names(lins) <- id_samples + all_ins <- lins[!(is.na(lins))] + qvals = matrix(nrow = 0, ncol = length(output_cols)) + dimnames(qvals) = list(NULL, output_cols) + vcf_records = list() + ref = linepos[3] + if (length(all_ins) > 0) { + all_ins <- as.data.frame(strsplit(unlist(strsplit(paste(all_ins), split = "|", + fixed = T)), split = ":", fixed = T), stringsAsFactors = F, ) + uniq_ins <- unique(toupper(as.character(all_ins[2, ]))) + coverage_all_ins <- sum(as.numeric(all_ins[1, ])) + for (cur_ins in uniq_ins) { + + Vp <- rep(0, nindiv) + names(Vp) <- id_samples + Vm <- Vp + all_cur_ins <- strsplit(grep(cur_ins, lins, ignore.case = T, value = T), + split = "|", fixed = T) + all_cur_ins_p <- lapply(all_cur_ins, function(x) { + grep(paste(":", cur_ins, "$", sep = ""), x, value = T) + }) + all_cur_ins_m <- lapply(all_cur_ins, function(x) { + grep(paste(":", tolower(cur_ins), "$", sep = ""), x, value = T) + }) + ma_p_cur_ins <- unlist(lapply(all_cur_ins_p, function(x) { + as.numeric(gsub(paste("([0-9]+):", cur_ins, "$", sep = ""), "\\1", + x, ignore.case = T)) + })) + ma_m_cur_ins <- unlist(lapply(all_cur_ins_m, function(x) { + as.numeric(gsub(paste("([0-9]+):", cur_ins, "$", sep = ""), "\\1", + x, ignore.case = T)) + })) + Vp[names(ma_p_cur_ins)] <- ma_p_cur_ins + Vm[names(ma_m_cur_ins)] <- ma_m_cur_ins + ma_count <- Vp + Vm + DP <- as.numeric(linepos[eval(depth)]) + if (sum((ma_count/DP) > 0.8, na.rm = T) > 0.5 * length(ma_count)) { + # here we need to reverse alt and ref for the regression (use + # ma_count of the ref) + ref <- linepos[3] + cur_ins_inv <- linepos[3] + Vp_inv <- as.numeric(linepos[eval(as.name(paste(cur_ins_inv, "_cols", + sep = "")))]) + Vm_inv <- as.numeric(linepos[eval(as.name(paste(tolower(cur_ins_inv), + "_cols", sep = "")))]) + ma_count_inv <- Vp_inv + Vm_inv + ref_inv <- TRUE + reg_res <- glmrob.nb(x = DP, y = ma_count_inv, min_coverage = min_coverage, + min_reads = min_reads, min_af = min_af, extra_rob = extra_rob, + min_af_extra_rob = min_af_extra_rob, min_prop_extra_rob = min_prop_extra_rob, + max_prop_extra_rob = max_prop_extra_rob) + + } else { + ref_inv <- FALSE + reg_res <- glmrob.nb(x = DP, y = ma_count, min_coverage = min_coverage, + min_reads = min_reads, min_af = min_af, extra_rob = extra_rob, + min_af_extra_rob = min_af_extra_rob, min_prop_extra_rob = min_prop_extra_rob, + max_prop_extra_rob = max_prop_extra_rob) + + } + output = c(linepos[1:3], cur_ins, "INS", reg_res$coef["sigma"], reg_res$coef["slope"], + ref_inv, reg_res$GQ) + names(output) = output_cols + qvals = rbind(qvals, output) + + # compute Qval20pc + qval_minAF <- rep(0, nindiv) + somatic_status <- rep(".", nindiv) + + + if (afmin_power == -1) { + # no minimum frequency supplied -> use a negative binomial + # distribution to check the power + qval_minAF <- sapply(1:nindiv, function(ii) toQvalueN(DP[ii], reg_res, + sigma)) + } else { + # minimum frequency supplied -> use a binomial distribution + # centered around afmin_power to check the power + qval_minAF <- sapply(1:nindiv, function(ii) toQvalueT(DP[ii], reg_res, + afmin_power)) + } + + + if (!is.na(reg_res$coef["slope"]) & sum(reg_res$GQ >= GQ_threshold, na.rm = TRUE) > + 0) { + all_AO <- sum(ma_count) + all_DP <- sum(as.numeric(linepos[eval(depth)])) + sum(ma_count) + common_annot(linepos, Vp, Vm) + all_RO <- sum(Rp + Rm) + if (SB_type == "SOR") { + sbs <- sors + } else { + if (SB_type == "RVSB") { + sbs <- rvsbs + } else { + sbs <- FisherStrand + } + } + if (sum(sbs <= SB_threshold_indel & reg_res$GQ >= GQ_threshold, na.rm = TRUE) > + 0) { + con <- pipe(paste(samtools, " faidx ", fasta_ref, " ", linepos[1], + ":", as.numeric(linepos[2]) - 2, "-", as.numeric(linepos[2]), + " | tail -n1", sep = "")) + before <- toupper(readLines(con)) + close(con) + con <- pipe(paste(samtools, " faidx ", fasta_ref, " ", linepos[1], + ":", as.numeric(linepos[2]) + 1, "-", as.numeric(linepos[2]) + + 3, " | tail -n1", sep = "")) + after <- toupper(readLines(con)) + close(con) + prev_bp <- substr(before, 3, 3) + + vcf_record = paste0(linepos[1], "\t", linepos[2], "\t", ".", "\t", + prev_bp, "\t", paste(prev_bp, cur_ins, sep = ""), "\t", max(reg_res$GQ), + "\t", ".") + + # INFO field + + vcf_info = paste0("TYPE=ins;NS=", sum(as.numeric(linepos[eval(depth)]) > + 0), ";AF=", sum(reg_res$GQ >= GQ_threshold)/sum(as.numeric(linepos[eval(depth)]) > + 0), ";DP=", all_DP, ";RO=", all_RO, ";AO=", all_AO, ";SRF=", + sum(Rp), ";SRR=", sum(Rm), ";SAF=", sum(Vp), ";SAR=", sum(Vm), + ";SOR=", all_sor, ";RVSB=", all_rvsb, ";FS=", FisherStrand_all, + ";ERR=", reg_res$coef["slope"], ";SIG=", reg_res$coef["sigma"], + ";CONT=", paste(before, after, sep = "x"), ifelse(reg_res$extra_rob & + !(ref_inv), ";WARN=EXTRA_ROBUST_GL", ""), ifelse(reg_res$extra_rob & + ref_inv, ";WARN=EXTRA_ROBUST_GL/INV_REF", ""), ifelse(ref_inv & + !(reg_res$extra_rob), ";WARN=INV_REF", "")) + # FORMAT field + + vcf_format = paste0("GT:", ifelse(ref_inv, "QVAL_INV", "QVAL"), + ":DP:RO:AO:AF:SB:SOR:RVSB:FS:QVAL_minAF:STATUS") + # all samples + if (ref_inv) { + genotype <- rep("1/1", l = nindiv) + } else { + genotype <- rep("0/0", l = nindiv) + } + heterozygotes <- which(reg_res$GQ >= GQ_threshold & sbs <= SB_threshold_SNV & + reg_res$ma_count/reg_res$coverage < 0.75) + genotype[heterozygotes] <- "0/1" + homozygotes <- which(reg_res$GQ >= GQ_threshold & sbs <= SB_threshold_SNV & + reg_res$ma_count/reg_res$coverage >= 0.75) + if (ref_inv) { + genotype[homozygotes] <- "0/0" + } else { + genotype[homozygotes] <- "1/1" + } + # no tumor variant but low power -> './.' + genotype[(reg_res$GQ < GQ_threshold) & (qval_minAF < GQ_threshold)] <- "./." + vcf_record = paste(vcf_record, vcf_info, vcf_format, sep = "\t") + for (cur_sample in 1:nindiv) { + + geno = paste0(genotype[cur_sample], ":", reg_res$GQ[cur_sample], + ":", DP[cur_sample], ":", (Rp + Rm)[cur_sample], ":", ma_count[cur_sample], + ":", (ma_count/DP)[cur_sample], ":", Rp[cur_sample], ",", Rm[cur_sample], + ",", Vp[cur_sample], ",", Vm[cur_sample], ":", sors[cur_sample], + ":", rvsbs[cur_sample], ":", FisherStrand[cur_sample], ":", + qval_minAF[cur_sample], ":", somatic_status[cur_sample]) + vcf_record = paste(vcf_record, geno, sep = "\t") + } + + + vcf_record = paste0(vcf_record, "\n") + vcf_records = append(vcf_records, vcf_record) + + if (!ref_inv & nchar(cur_ins) > 10) + cur_ins <- paste(substr(cur_ins, 1, 3 + match(cur_ins, uniq_ins)), + substr(cur_ins, nchar(cur_ins) - (3 + match(cur_ins, uniq_ins)), + nchar(cur_ins)), sep = "x") + if (ref_inv & nchar(ref) > 10) + ref <- paste(substr(ref, 1, 3 + match(ref, uniq_ins)), substr(ref, + nchar(ref) - (3 + match(ref, uniq_ins)), nchar(ref)), sep = "x") + } + } + } + } + return(list(qvals = qvals, vcf_records = vcf_records)) +} +model_for_position <- function(line) { + + linepos <- unlist(strsplit(line, "\t")) + qvals = matrix(, nrow = 0, ncol = length(output_cols)) + dimnames(qvals) <- list(NULL, output_cols) + vcf_records = list() + if (is.element(linepos[3], c("A", "T", "C", "G"))) { + + msnv = model_for_snv(linepos) + mdel = model_for_deletion(linepos) + mins = model_for_insertion(linepos) + + qvals = rbind(qvals, msnv$qvals, mdel$qvals, mins$qvals) + vcf_records = c(vcf_records, msnv$vcf_records, mdel$vcf_records, mins$vcf_records) + + } + return(list(qvals = qvals, vcf_records = vcf_records)) + + +} + +if (file.exists(out_vcf_file)) file.remove(out_vcf_file) + +write_vcf_header(indiv_run) + + + +############################################################################################### + +############################## REGRESSION SECTION +############################## ############################## + +# Reading input file (output of mpileup2readcounts) +f <- file(input_file, "r") +lines = readLines(f) +qvals_per_position = mclapply(lines, model_for_position, mc.cores = num_cores) +qvals = do.call(rbind, lapply(qvals_per_position, function(x) x$qvals)) +write.table(qvals, out_qvals_file, sep = "\t", row.names = F, quote = F) +vcf_records = unlist(lapply(qvals_per_position, function(x) x$vcf_records)) +con <- file(out_vcf_file, "a") +writeLines(vcf_records, sep = "", con) +close(con) + + +print(Sys.time()) +###############################################################################################