From ea559a77a1f0e8a02c7e38b20fb0a126467e9741 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Wiktor=20Ku=C5=9Bmirek?= Date: Mon, 19 Mar 2018 18:20:20 +0100 Subject: [PATCH 001/114] dockers for single R packages - init --- Docker/target-qc/Dockerfile | 18 +++++++++++++ build.sh | 51 +++++++++++++++++++++++++++++++++++++ 2 files changed, 69 insertions(+) create mode 100644 Docker/target-qc/Dockerfile create mode 100755 build.sh diff --git a/Docker/target-qc/Dockerfile b/Docker/target-qc/Dockerfile new file mode 100644 index 0000000..1dbc44e --- /dev/null +++ b/Docker/target-qc/Dockerfile @@ -0,0 +1,18 @@ +FROM biodatageeks/bdg-spark +MAINTAINER biodatageeks + + +RUN apt-get update && apt-get install --yes \ + wget \ + && rm -rf /var/lib/apt/lists/* + +RUN wget https://repo1.maven.org/maven2/org/bdgenomics/adam/adam-distribution-spark2_2.11/0.23.0/adam-distribution-spark2_2.11-0.23.0-bin.tar.gz +RUN tar -zxvf adam-distribution-spark2_2.11-0.23.0-bin.tar.gz +RUN mv adam-distribution-spark2_2.11-0.23.0 adam && mv adam /tmp && rm -f adam-distribution-spark2_2.11-0.23.0-bin.tar.gz +ENV PATH="/tmp/adam/bin:${PATH}" + + + + + + diff --git a/build.sh b/build.sh new file mode 100755 index 0000000..cae7f57 --- /dev/null +++ b/build.sh @@ -0,0 +1,51 @@ +#!/bin/bash -x + +BUILD_MODE=$1 +#only build images modified in the last 10h (10*3600s) +MAX_COMMIT_TS_DIFF=36000 + +bump_version () { + incl=0.01 + version="0.00" + if [ "$(curl -L -s "https://registry.hub.docker.com/v2/repositories/${image}/tags" | jq -r ".detail")" == "Object not found" ]; then + version="0.01" + else + version=`curl -L -s "https://registry.hub.docker.com/v2/repositories/${image}/tags" | jq -r '.results[0].name '` + version=`echo $version + $incl | bc| awk '{printf "%.2f\n", $0}'` + fi + echo $version +} + + +find Docker -name "Dockerfile" | sed 's/\/Dockerfile//' | while read dir; +do + + image=`echo $dir| sed 's/^Docker/biodatageeks/'` + version=`if [ ! -e $dir/version ]; then bump_version $image; else tail -1 $dir/version; fi` + if [ -e $dir/version ]; then + ver=`tail -1 $dir/version`; + if [[ $OSTYPE != "darwin17" ]]; then + sed -i "s/{{COMPONENT_VERSION}}/${ver}/g" $dir/Dockerfile ; + else + sed -i '' "s/{{COMPONENT_VERSION}}/${ver}/g" $dir/Dockerfile ; + fi + fi + echo "Building image ${image}..." + diffTs=`echo "$(date +%s) - $(git log -n 1 --pretty=format:%at ${dir})" | bc` + if [ $diffTs -lt $MAX_COMMIT_TS_DIFF ]; then + cd $dir + docker build -t $image:$version . + docker build -t $image:latest . + if [[ ${BUILD_MODE} != "local" ]]; then + docker push docker.io/$image:latest + docker push docker.io/$image:$version + fi + ##revert COMPONENT_VERSION variable + if [ -e version ]; then ver=`tail -1 version`; sed -i '' "s/${ver}/{{COMPONENT_VERSION}}/g" Dockerfile ; fi + #keep only last 3 versions of an image locally (2+3 in tail part) + docker images $image | tail -n +5 | sed 's/ \{1,\}/:/g' | cut -f1,2 -d':' | xargs -i docker rmi {} + + cd ../.. + fi + +done From 723d22248e261164a60bc62325af1d12401e519c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Wiktor=20Ku=C5=9Bmirek?= Date: Mon, 19 Mar 2018 18:36:11 +0100 Subject: [PATCH 002/114] building docker images before running tests - only for speed up dev process --- Docker/reference-sample-set-selector/Dockerfile | 7 +++++++ 1 file changed, 7 insertions(+) create mode 100644 Docker/reference-sample-set-selector/Dockerfile diff --git a/Docker/reference-sample-set-selector/Dockerfile b/Docker/reference-sample-set-selector/Dockerfile new file mode 100644 index 0000000..5ff1007 --- /dev/null +++ b/Docker/reference-sample-set-selector/Dockerfile @@ -0,0 +1,7 @@ +FROM biodatageeks/bdg-spark +MAINTAINER biodatageeks + + + + + From 357e36ce5299bc04fa1dae9edf2eaf7e56bae9ee Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Wiktor=20Ku=C5=9Bmirek?= Date: Mon, 19 Mar 2018 18:39:36 +0100 Subject: [PATCH 003/114] bugfix in Jenkinsfile --- Docker/target-qc/Dockerfile | 11 ----------- Jenkinsfile | 15 +++++++++++++++ build.sh | 14 +++++++------- 3 files changed, 22 insertions(+), 18 deletions(-) diff --git a/Docker/target-qc/Dockerfile b/Docker/target-qc/Dockerfile index 1dbc44e..5ff1007 100644 --- a/Docker/target-qc/Dockerfile +++ b/Docker/target-qc/Dockerfile @@ -2,17 +2,6 @@ FROM biodatageeks/bdg-spark MAINTAINER biodatageeks -RUN apt-get update && apt-get install --yes \ - wget \ - && rm -rf /var/lib/apt/lists/* - -RUN wget https://repo1.maven.org/maven2/org/bdgenomics/adam/adam-distribution-spark2_2.11/0.23.0/adam-distribution-spark2_2.11-0.23.0-bin.tar.gz -RUN tar -zxvf adam-distribution-spark2_2.11-0.23.0-bin.tar.gz -RUN mv adam-distribution-spark2_2.11-0.23.0 adam && mv adam /tmp && rm -f adam-distribution-spark2_2.11-0.23.0-bin.tar.gz -ENV PATH="/tmp/adam/bin:${PATH}" - - - diff --git a/Jenkinsfile b/Jenkinsfile index 4d46cc4..4b999d4 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -2,6 +2,13 @@ pipeline { agent any stages { + stage('Building Docker images') { + steps { + echo 'Building Docker images....' + sh './build.sh' + } + } + stage('Test R code') { steps { echo 'Testing R code....' @@ -52,6 +59,14 @@ pipeline { } } + + stage('Building Docker images') { + steps { + echo 'Building Docker images....' + sh './build.sh' + } + } + stage('Publish to Nexus snapshots and copying assembly fat jar to the edge server') { when { branch 'master' diff --git a/build.sh b/build.sh index cae7f57..73ef363 100755 --- a/build.sh +++ b/build.sh @@ -36,16 +36,16 @@ do cd $dir docker build -t $image:$version . docker build -t $image:latest . - if [[ ${BUILD_MODE} != "local" ]]; then - docker push docker.io/$image:latest - docker push docker.io/$image:$version - fi +# if [[ ${BUILD_MODE} != "local" ]]; then +# docker push docker.io/$image:latest +# docker push docker.io/$image:$version +# fi ##revert COMPONENT_VERSION variable - if [ -e version ]; then ver=`tail -1 version`; sed -i '' "s/${ver}/{{COMPONENT_VERSION}}/g" Dockerfile ; fi +# if [ -e version ]; then ver=`tail -1 version`; sed -i '' "s/${ver}/{{COMPONENT_VERSION}}/g" Dockerfile ; fi #keep only last 3 versions of an image locally (2+3 in tail part) - docker images $image | tail -n +5 | sed 's/ \{1,\}/:/g' | cut -f1,2 -d':' | xargs -i docker rmi {} +# docker images $image | tail -n +5 | sed 's/ \{1,\}/:/g' | cut -f1,2 -d':' | xargs -i docker rmi {} - cd ../.. +# cd ../.. fi done From 7cd7620560b1288d783eb6c4f7fdd5788f30ab79 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Wiktor=20Ku=C5=9Bmirek?= Date: Mon, 19 Mar 2018 18:41:31 +0100 Subject: [PATCH 004/114] build.sh file without pushing dockers images --- Jenkinsfile | 2 +- build.sh | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index 4b999d4..4c88e48 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -60,7 +60,7 @@ pipeline { } - stage('Building Docker images') { + stage('Build Docker images') { steps { echo 'Building Docker images....' sh './build.sh' diff --git a/build.sh b/build.sh index 73ef363..7bd1419 100755 --- a/build.sh +++ b/build.sh @@ -41,11 +41,11 @@ do # docker push docker.io/$image:$version # fi ##revert COMPONENT_VERSION variable -# if [ -e version ]; then ver=`tail -1 version`; sed -i '' "s/${ver}/{{COMPONENT_VERSION}}/g" Dockerfile ; fi + if [ -e version ]; then ver=`tail -1 version`; sed -i '' "s/${ver}/{{COMPONENT_VERSION}}/g" Dockerfile ; fi #keep only last 3 versions of an image locally (2+3 in tail part) -# docker images $image | tail -n +5 | sed 's/ \{1,\}/:/g' | cut -f1,2 -d':' | xargs -i docker rmi {} + docker images $image | tail -n +5 | sed 's/ \{1,\}/:/g' | cut -f1,2 -d':' | xargs -i docker rmi {} -# cd ../.. + cd ../.. fi done From f72a2ba71e447e22d5599ef47cf8defb7dbb755f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Wiktor=20Ku=C5=9Bmirek?= Date: Mon, 19 Mar 2018 18:44:02 +0100 Subject: [PATCH 005/114] pushing images in build.sh file --- build.sh | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/build.sh b/build.sh index 7bd1419..cae7f57 100755 --- a/build.sh +++ b/build.sh @@ -36,10 +36,10 @@ do cd $dir docker build -t $image:$version . docker build -t $image:latest . -# if [[ ${BUILD_MODE} != "local" ]]; then -# docker push docker.io/$image:latest -# docker push docker.io/$image:$version -# fi + if [[ ${BUILD_MODE} != "local" ]]; then + docker push docker.io/$image:latest + docker push docker.io/$image:$version + fi ##revert COMPONENT_VERSION variable if [ -e version ]; then ver=`tail -1 version`; sed -i '' "s/${ver}/{{COMPONENT_VERSION}}/g" Dockerfile ; fi #keep only last 3 versions of an image locally (2+3 in tail part) From 5de7effb6a6e91a239eb359057ff8b7b161c4fb6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Wiktor=20Ku=C5=9Bmirek?= Date: Mon, 19 Mar 2018 19:12:26 +0100 Subject: [PATCH 006/114] rename docker images --- .../Dockerfile | 0 Docker/{target-qc => cnv-opt-target-qc}/Dockerfile | 0 2 files changed, 0 insertions(+), 0 deletions(-) rename Docker/{reference-sample-set-selector => cnv-opt-reference-sample-set-selector}/Dockerfile (100%) rename Docker/{target-qc => cnv-opt-target-qc}/Dockerfile (100%) diff --git a/Docker/reference-sample-set-selector/Dockerfile b/Docker/cnv-opt-reference-sample-set-selector/Dockerfile similarity index 100% rename from Docker/reference-sample-set-selector/Dockerfile rename to Docker/cnv-opt-reference-sample-set-selector/Dockerfile diff --git a/Docker/target-qc/Dockerfile b/Docker/cnv-opt-target-qc/Dockerfile similarity index 100% rename from Docker/target-qc/Dockerfile rename to Docker/cnv-opt-target-qc/Dockerfile From a0cc870a05fc43676841fef2159d591dcb262608 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Wiktor=20Ku=C5=9Bmirek?= Date: Tue, 20 Mar 2018 14:27:37 +0100 Subject: [PATCH 007/114] draft of docker containers for CODEX, Canoes and Exomedepth --- Docker/cnv-opt-canoes/Dockerfile | 17 +++++++++++++++++ Docker/cnv-opt-codex/Dockerfile | 14 ++++++++++++++ Docker/cnv-opt-exomedepth/Dockerfile | 21 +++++++++++++++++++++ Docker/cnv-opt-target-qc/Dockerfile | 8 ++++++-- 4 files changed, 58 insertions(+), 2 deletions(-) create mode 100644 Docker/cnv-opt-canoes/Dockerfile create mode 100644 Docker/cnv-opt-codex/Dockerfile create mode 100644 Docker/cnv-opt-exomedepth/Dockerfile diff --git a/Docker/cnv-opt-canoes/Dockerfile b/Docker/cnv-opt-canoes/Dockerfile new file mode 100644 index 0000000..6667651 --- /dev/null +++ b/Docker/cnv-opt-canoes/Dockerfile @@ -0,0 +1,17 @@ +FROM ubuntu:xenial +MAINTAINER biodatageeks + +RUN apt-get update +RUN apt-get install -y software-properties-common +RUN apt-key adv --keyserver keyserver.ubuntu.com --recv-keys E298A3A825C0D65DFD57CBB651716619E084DAB9 +RUN add-apt-repository 'deb [arch=amd64,i386] https://cran.rstudio.com/bin/linux/ubuntu xenial/' +RUN apt-get install -y apt-transport-https + +RUN apt-get update && \ + apt-get upgrade -y && \ + apt-get install -y r-base libssl-dev libssh2-1-dev libxml2-dev libcurl4-openssl-dev libpq-dev + +RUN Rscript -e "install.packages('nnls', repos = 'http://cran.us.r-project.org')" +RUN Rscript -e "install.packages('Hmisc', repos = 'http://cran.us.r-project.org')" +RUN Rscript -e "install.packages('mgcv', repos = 'http://cran.us.r-project.org')" +RUN Rscript -e "install.packages('plyr', repos = 'http://cran.us.r-project.org')" diff --git a/Docker/cnv-opt-codex/Dockerfile b/Docker/cnv-opt-codex/Dockerfile new file mode 100644 index 0000000..6f56eb5 --- /dev/null +++ b/Docker/cnv-opt-codex/Dockerfile @@ -0,0 +1,14 @@ +FROM ubuntu:xenial +MAINTAINER biodatageeks + +RUN apt-get update +RUN apt-get install -y software-properties-common +RUN apt-key adv --keyserver keyserver.ubuntu.com --recv-keys E298A3A825C0D65DFD57CBB651716619E084DAB9 +RUN add-apt-repository 'deb [arch=amd64,i386] https://cran.rstudio.com/bin/linux/ubuntu xenial/' +RUN apt-get install -y apt-transport-https + +RUN apt-get update && \ + apt-get upgrade -y && \ + apt-get install -y r-base libssl-dev libssh2-1-dev libxml2-dev libcurl4-openssl-dev libpq-dev + +RUN Rscript -e "source('https://bioconductor.org/biocLite.R');biocLite('CODEX')" diff --git a/Docker/cnv-opt-exomedepth/Dockerfile b/Docker/cnv-opt-exomedepth/Dockerfile new file mode 100644 index 0000000..fd16d85 --- /dev/null +++ b/Docker/cnv-opt-exomedepth/Dockerfile @@ -0,0 +1,21 @@ +FROM ubuntu:xenial +MAINTAINER biodatageeks + +RUN apt-get update +RUN apt-get install -y software-properties-common +RUN apt-key adv --keyserver keyserver.ubuntu.com --recv-keys E298A3A825C0D65DFD57CBB651716619E084DAB9 +RUN add-apt-repository 'deb [arch=amd64,i386] https://cran.rstudio.com/bin/linux/ubuntu xenial/' +RUN apt-get install -y apt-transport-https + +RUN apt-get update && \ + apt-get upgrade -y && \ + apt-get install -y r-base libssl-dev libssh2-1-dev libxml2-dev libcurl4-openssl-dev libpq-dev + +RUN Rscript -e "source('https://bioconductor.org/biocLite.R');biocLite('Biostrings')" +RUN Rscript -e "source('https://bioconductor.org/biocLite.R');biocLite('IRanges')" +RUN Rscript -e "source('https://bioconductor.org/biocLite.R');biocLite('Rsamtools')" +RUN Rscript -e "source('https://bioconductor.org/biocLite.R');biocLite('GenomicRanges')" +RUN Rscript -e "source('https://bioconductor.org/biocLite.R');biocLite('RCurl')" +RUN Rscript -e "source('https://bioconductor.org/biocLite.R');biocLite('GenomicAlignments')" +RUN Rscript -e "install.packages('ExomeDepth', repos = 'http://cran.us.r-project.org')" + diff --git a/Docker/cnv-opt-target-qc/Dockerfile b/Docker/cnv-opt-target-qc/Dockerfile index 5ff1007..ab19c2f 100644 --- a/Docker/cnv-opt-target-qc/Dockerfile +++ b/Docker/cnv-opt-target-qc/Dockerfile @@ -1,7 +1,11 @@ -FROM biodatageeks/bdg-spark +FROM ubuntu:xenial MAINTAINER biodatageeks +RUN apt-get update && \ + apt-get upgrade -y && \ + apt-get install -y r-base libssl-dev libssh2-1-dev libxml2-dev libcurl4-openssl-dev libpq-dev +RUN Rscript -e "install.packages('devtools', repos='http://cran.cnr.berkeley.edu'); devtools::install_github('hadley/testthat')" - +RUN Rscript -e "source('https://bioconductor.org/biocLite.R');biocLite('CODEX')" From be88e856fc390c701bb7400816bae17fbba5d72f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Wiktor=20Ku=C5=9Bmirek?= Date: Tue, 20 Mar 2018 16:33:18 +0100 Subject: [PATCH 008/114] CANOES package --- R/CANOES/DESCRIPTION | 26 ++ R/CANOES/NAMESPACE | 2 + R/CANOES/R/functions_CANOES.R | 685 ++++++++++++++++++++++++++++++++++ R/CANOES/R/run_CANOES.R | 32 ++ 4 files changed, 745 insertions(+) create mode 100644 R/CANOES/DESCRIPTION create mode 100644 R/CANOES/NAMESPACE create mode 100644 R/CANOES/R/functions_CANOES.R create mode 100644 R/CANOES/R/run_CANOES.R diff --git a/R/CANOES/DESCRIPTION b/R/CANOES/DESCRIPTION new file mode 100644 index 0000000..314f419 --- /dev/null +++ b/R/CANOES/DESCRIPTION @@ -0,0 +1,26 @@ +Package: CANOESCOV +Title: CANOES Package With Interface To External Coverage File +Version: 0.0.1 +Authors@R: c( + person("Tomasz", "Gambin", email = "tgambin@gmail.com", role = c("aut", "cre")), + person("Marek", "Wiewiórka", email = "marek.wiewiorka@gmail.com", role = c("aut")), + person("Wiktor", "Kuśmirek", email = "kusmirekwiktor@gmail.com", role = c("aut"))) +Description: An extended implementation of the CANOES package in R. It extends + original implementation by using external coverage file, which should + speed up calculations for running application with multiple sets of input + parameters. +Depends: + R (>= 3.2.3), + devtools (>= 1.13.2), + DBI (== 0.8), + optparse (== 1.4.4), + IRanges (>= 2.0.0), + plyr (>= 1.8.4), + nnls (>= 1.4.0), + Hmisc (>= 4.0.0), + mgcv (>= 1.8.0), + REFERENCE.SAMPLE.SET.SELECTOR (>= 0.0.1) +License: GPL-3 +Encoding: UTF-8 +LazyData: true +RoxygenNote: 6.0.1.9000 diff --git a/R/CANOES/NAMESPACE b/R/CANOES/NAMESPACE new file mode 100644 index 0000000..884a631 --- /dev/null +++ b/R/CANOES/NAMESPACE @@ -0,0 +1,2 @@ +# Generated by roxygen2: fake comment so roxygen2 overwrites silently. +exportPattern("^[^\\.]") diff --git a/R/CANOES/R/functions_CANOES.R b/R/CANOES/R/functions_CANOES.R new file mode 100644 index 0000000..b3077b4 --- /dev/null +++ b/R/CANOES/R/functions_CANOES.R @@ -0,0 +1,685 @@ +# Constants +NUM.ABNORMAL.STATES=2 +NUM.STATES=3 +DELETION=1 +NORMAL=2 +DUPLICATION=3 + +# PlotCNV +# Plots count data for targets of interest +# highlights sample of interest in red, +# highlights area of interest with a black line +# highlights probe locations with black dots +# Arguments: +# counts: +# count matrix, with column "target" with target numbers +# and sample data in columns 6:end +# sample.name: +# sample of interest (will be highlighted in red in figure) +# (should correspond to a column in counts) +# targets: +# targets of interest in the form start.target..end.target +# offset: +# number of targets to add on either end (default=1) +# Returns: +# returns nothing +PlotCNV <- function(counts, sample.name, targets, offset=1){ + sample.name <- as.character(sample.name) + if (!sample.name %in% names(counts)){stop("No column for sample ", sample.name, " in counts matrix")} + if (length(setdiff("target", names(counts)[1:5]) > 0)){ + stop("counts matrix must have column named target") + } + t <- as.character(targets) + start.target <- as.numeric(unlist(strsplit(t, "..", fixed=T))[1]) + end.target <- as.numeric(unlist(strsplit(t, "..", fixed=T))[2]) + if (!start.target %in% counts$target){ + stop("no data for start.target in counts matrix") + } + if (!end.target %in% counts$target){ + stop("no data for end.target in counts matrix") + } + if ((start.target - offset) %in% counts$target){ + start.target <- start.target - offset + } + if ((end.target + offset) %in% counts$target){ + end.target <- end.target + offset + } + ref.sample.names <- setdiff(as.character(names(counts)[-seq(1,5)]), + sample.name) + data <- subset(counts, target >= start.target & target <= end.target) + sample.data <- data[, sample.name] + means <- apply(data[, ref.sample.names], 1, mean) + sd <- sqrt(apply(data[, ref.sample.names], 1, var)) + refs.z.scores <- matrix(NA, nrow(data), length(ref.sample.names)) + sample.z.score <- numeric(length = nrow(data)) + for (i in seq(1, dim(data)[1])){ + refs.z.scores[i, ] <- as.numeric((data[i, ref.sample.names] - means[i]) / + max(0.000001, sd[i])) + sample.z.score[i] <- (sample.data[i] - means[i]) / max(0.000001, sd[i]) + } + ylim <- max(abs(refs.z.scores), abs(sample.z.score)) + plot(seq(-6, 6), seq(-6, 6), + xlim=c(data[1, "start"], data[dim(data)[1], "start"]), + ylim=c(-ylim - 0.1, ylim + 0.1), type="n", xlab="", ylab="Z-score") + for (i in seq(1, length(ref.sample.names))){ + lines(data[, "start"], refs.z.scores[, i], col="#2f4f4f85") + } + lines(data[, "start"], sample.z.score, col="red", lwd=3) + points(data[, "start"], rep(-ylim - 0.05, length(data[, "start"])), pch=20) + lines( c(data[1 + offset, "start"], data[nrow(data) - offset, "end"]) , + c(ylim+0.2, ylim+0.2), lwd=2) + title(main=paste("Sample ", sample.name, ", ", + counts$chromosome[start.target], ":", + data$start[1], "-", data$end[nrow(data)], sep="")) +} + +# CallCNVs +# Calls CNVs in sample of interest +# Arguments: +# sample.name: +# sample to call CNVs in (should correspond to a column in counts) +# counts: +# count matrix, first five columns should be +# target: consecutive numbers for targets (integer) +# chromosome: chromosome number (integer-valued) +# (support for sex chromosomes to come) +# start: start position of probe (integer) +# end: end position of probe (integer) +# gc: gc content (real between 0 and 1) +# subsequent columns should include counts for each probe for samples +# p: +# average rate of occurrence of CNVs (real) default is 1e-08 +# D: +# expected distance between targets in a CNV (integer) default is 70,000 +# Tnum: +# expected number of targets in a CNV (integer) default is 6 +# numrefs +# maximum number of reference samples to use (integer) default is 30 +# the weighted variance calculations will take a long time if too +# many reference samples are used +# Returns: +# data frame with the following columns: +# SAMPLE: name of sample +# CNV: DEL of DUP +# INTERVAL: CNV coordinates in the form chr:start-stop +# KB: length of CNV in kilobases +# CHR: chromosome +# MID_BP: middle base pair of CNV +# TARGETS: target numbers of CNV in the form start..stop +# NUM_TARG: how many targets are in the CNV +# Q_SOME: a Phred-scaled quality score for the CNV +CallCNVs <- function(sample.name, counts, p=1e-08, Tnum=6, D=70000, numrefs=30, get.dfs=F, homdel.mean=0.2){ + if (!sample.name %in% names(counts)){stop("No column for sample ", sample.name, " in counts matrix")} + if (length(setdiff(names(counts)[1:5], c("target", "chromosome", "start", "end", "gc"))) > 0){ + stop("First five columns of counts matrix must be target, chromosome, start, end, gc") + } + if (length(setdiff(unique(counts$chromosome), seq(1:22))) > 0) { + # remove sex chromosomes + cat("Trying to remove sex chromosomes and 'chr' prefixes\n") + counts <- subset(counts, !chromosome %in% c("chrX", "chrY", "X", "Y")) + if (sum(grepl("chr", counts$chromosome))==length(counts$chromosome)){ + counts$chromosome <- gsub("chr", "", counts$chromosome) + } + counts$chromosome <- as.numeric(counts$chromosome) + if (length(setdiff(unique(counts$chromosome), seq(1:22))) > 0) + stop("chromosome must take value in range 1-22 (support for sex chromosomes to come)") + } + library(plyr) + counts <- arrange(counts, chromosome, start) + if (p <= 0){ + stop("parameter p must be positive") + } + if (Tnum <= 0){ + stop("parameter Tnum must be positive") + } + if (D <= 0){ + stop("parameter D must be positive") + } + if (numrefs <= 0){ + stop("parameter numrefs must be positive") + } + sample.names <- colnames(counts)[-seq(1,5)] + # find mean coverage of probes + mean.counts <- mean(apply(counts[, sample.names], 2, mean)) + # normalize counts; round so we can use negative binomial + counts[, sample.names] <- apply(counts[, sample.names], 2, + function(x, mean.counts) + round(x * mean.counts / mean(x)), mean.counts) + # calculate covariance of read count across samples + cov <- cor(counts[, sample.names], counts[, sample.names]) + reference.samples <- setdiff(sample.names, sample.name) + covariances <- cov[sample.name, reference.samples] + reference.samples <- names(sort(covariances, + decreasing=T)[1:min(numrefs, length(covariances))]) + sample.mean.counts <- mean(counts[, sample.name]) + sample.sumcounts <- apply(counts[, reference.samples], 2, sum) + # normalize reference samples to sample of interest + counts[, reference.samples] <- apply(counts[, reference.samples], 2, + function(x, sample.mean.counts) + round(x * sample.mean.counts / + mean(x)), sample.mean.counts) + # select reference samples and weightings using non-negative least squares + b <- counts[, sample.name] + A <- as.matrix(counts[, reference.samples]) + library(nnls) + all <- nnls(A, b)$x + est <- matrix(0, nrow=50, ncol=length(reference.samples)) + set.seed(1) + for (i in 1:50){ + d <- sample(nrow(A), min(500, nrow(A))) + est[i, ] <- nnls(A[d, ], b[d])$x + } + weights <- colMeans(est) + sample.weights <- weights / sum(weights) + library(Hmisc) + # calculate weighted mean of read count + # this is used to calculate emission probabilities + counts$mean <- apply(counts[, reference.samples], + 1, wtd.mean, sample.weights) + targets <- counts$target + # exclude probes with all zero counts + nonzero.rows <- counts$mean > 0 + nonzero.rows.df <- data.frame(target=counts$target, + nonzero.rows=nonzero.rows) + + counts <- counts[nonzero.rows, ] + # get the distances between consecutive probes + distances <- GetDistances(counts) + # estimate the read count variance at each probe + var.estimate <- EstimateVariance(counts, reference.samples, + sample.weights) + emission.probs <- EmissionProbs(counts[, sample.name], + counts$mean, var.estimate$var.estimate, + counts[, "target"]) + if (get.dfs){ + return(list(emission.probs=emission.probs, distances=distances)) + } + # call CNVs with the Viterbi algorithm + viterbi.state <- Viterbi(emission.probs, distances, p, Tnum, D) + # format the CNVs + cnvs <- PrintCNVs(sample.name, viterbi.state, + counts) + # if there aren't too many CNVs, calculate the Q_SOME + if (nrow(cnvs) > 0 & nrow(cnvs) <= 50){ + qualities <- GenotypeCNVs(cnvs, sample.name, counts, p, Tnum, D, numrefs, + emission.probs=emission.probs, + distances=distances) + for (i in 1:nrow(cnvs)){ + cnvs$Q_SOME[i] <- ifelse(cnvs$CNV[i]=="DEL", qualities[i, "SQDel"], + qualities[i, "SQDup"]) + } + } + data <- as.data.frame(cbind(counts$target, counts$mean, var.estimate$var.estimate, counts[, sample.name])) + names(data) <- c("target", "countsmean", "varestimate", "sample") + if (nrow(cnvs) > 0){ + cnvs <- CalcCopyNumber(data, cnvs, homdel.mean) + } + return(cnvs) +} + +# GenotypeCNVs +# Genotype CNVs in sample of interest +# Arguments: +# xcnv +# data frame with the following columns, and one row for each +# CNV to genotype +# INTERVAL: CNV coordinates in the form chr:start-stop +# TARGETS: target numbers of CNV in the form start..stop +# these should correspond to the target numbers in counts +# sample.name: +# sample to genotype CNVs in (should correspond to a column in counts) +# counts: +# count matrix, first five columns should be +# target: consecutive numbers for targets (integer) +# chromosome: chromosome number (integer-valued) +# (support for sex chromosomes to come) +# start: start position of probe (integer) +# end: end position of probe (integer) +# gc: gc content (real between 0 and 1) +# subsequent columns should include counts for each probe for samples +# p: +# average rate of occurrence of CNVs (real) default is 1e-08 +# D: +# expected distance between targets in a CNV (integer) default is 70,000 +# Tnum: +# expected number of targets in a CNV (integer) default is 6 +# numrefs +# maximum number of reference samples to use (integer) default is 30 +# the weighted variance calculations will take a long time if too +# many reference samples are used +# emission.probs and distances are for internal use only +# Returns: +# data frame with the following columns and one row for each genotyped CNV: +# INTERVAL: CNV coordinates in the form chr:start-stop +# NQDEL: a Phred-scaled quality score that sample.name has no deletion +# in the interval +# SQDEL: a Phred-scaled quality score that sample.name has a deletion +# in the interval +# NQDUP and SQDUP: same, but for a duplication +GenotypeCNVs <- function(xcnvs, sample.name, counts, p=1e-08, Tnum=6, + D=70000, numrefs=30, + emission.probs=NULL, + distances=NULL){ + if (!sample.name %in% names(counts)){stop("No column for sample ", sample.name, " in counts matrix")} + if (length(setdiff(names(counts)[1:5], c("target", "chromosome", "start", "end", "gc"))) > 0){ + stop("First five columns of counts matrix must be target, chromosome, start, end, gc") + } + if (length(setdiff(unique(counts$chromosome), seq(1:22))) > 0) { + # remove sex chromosomes + cat("Trying to remove sex chromosomes and 'chr' prefixes\n") + counts <- subset(counts, !chromosome %in% c("chrX", "chrY", "X", "Y")) + if (sum(grepl("chr", counts$chromosome))==length(counts$chromosome)){ + counts$chromosome <- gsub("chr", "", counts$chromosome) + } + counts$chromosome <- as.numeric(counts$chromosome) + if (length(setdiff(unique(counts$chromosome), seq(1:22))) > 0) + stop("chromosome must take value in range 1-22 (support for sex chromosomes to come)") + } + library(plyr) + counts <- arrange(counts, chromosome, start) + if (p <= 0){ + stop("parameter p must be positive") + } + if (Tnum <= 0){ + stop("parameter Tnum must be positive") + } + if (D <= 0){ + stop("parameter D must be positive") + } + if (numrefs <= 0){ + stop("parameter numrefs must be positive") + } + num.cnvs <- nrow(xcnvs) + cnv.intervals <- as.character(xcnvs$INTERVAL) + # if no emission probs matrix is passed in, generate a new one + if (is.null(emission.probs)){ + l <- CallCNVs(sample.name, counts, p, Tnum=6, D=70000, numrefs=30, get.dfs=T) + emission.probs <- l[['emission.probs']] + distances <- l[['distances']] + } + forward.m <- GetForwardMatrix(emission.probs, distances, p, Tnum, D) + backward.m <- GetBackwardMatrix(emission.probs, distances, p, Tnum, D) + qualities <- matrix(0, nrow=num.cnvs, ncol=5, + dimnames=list(cnv.intervals, + c("INTERVAL", "NQDel", "SQDel", "NQDup", "SQDup"))) + for (i in 1:num.cnvs){ + interval <- as.character(xcnvs[i, "INTERVAL"]) + targets <- as.numeric(strsplit(as.character(xcnvs[i, "TARGETS"]), ".", fixed=T)[[1]][c(1,3)]) + left.target <- targets[1] + right.target <- targets[2] + likelihoods <- GetModifiedLikelihood(forward.m, backward.m, + emission.probs, distances, + left.target, right.target, + c(DUPLICATION, DELETION), p, Tnum, D) + modified.likelihood <- likelihoods[1]; + unmodified.likelihood <- likelihoods[2] + Prob.All.Normal <- exp(modified.likelihood - unmodified.likelihood) + likelihoods <- GetModifiedLikelihood(forward.m, backward.m, + emission.probs, distances, + left.target, right.target, DELETION, p, Tnum, D) + modified.likelihood <- likelihoods[1]; + unmodified.likelihood <- likelihoods[2] + Prob.No.Deletion <- exp(modified.likelihood - unmodified.likelihood) + likelihoods <- GetModifiedLikelihood(forward.m, backward.m, + emission.probs, distances, + left.target, right.target, DUPLICATION, p, Tnum, D) + modified.likelihood <- likelihoods[1]; + unmodified.likelihood <- likelihoods[2] + Prob.No.Duplication <- exp(modified.likelihood - unmodified.likelihood) + # Check if probabilities greater than 1 are numerical error or bug + Phred <- function(prob){ + return(round(min(99, -10 * log10(1 - prob)))) + } + qualities[i, "NQDel"] <- Phred(Prob.No.Deletion) + qualities[i, "SQDel"] <- Phred(Prob.No.Duplication - Prob.All.Normal) + qualities[i, "NQDup"] <- Phred(Prob.No.Duplication) + qualities[i, "SQDup"] <- Phred(Prob.No.Deletion - Prob.All.Normal) + qualities[i, "INTERVAL"] <- interval + } + qualities <- as.data.frame(qualities, stringsAsFactors=F) + qualities$NQDel <- as.integer(qualities$NQDel) + qualities$NQDup <- as.integer(qualities$NQDup) + qualities$SQDel <- as.integer(qualities$SQDel) + qualities$SQDup <- as.integer(qualities$SQDup) + return(qualities) +} + +# returns data frame with distance to each target from the previous target +# (0 in the case of the first target on chromosome 1, a very big number +# for the first target on each other chromosome--this resets the HMM +# for each chromosome) +GetDistances <- function(counts){ + chromosome <- counts[, "chromosome"] + startbase <- counts[, "start"] + num.nonzero.exons <- length(startbase) + distances <- c(0, startbase[2:num.nonzero.exons] - + startbase[1:(num.nonzero.exons - 1)] + + 1000000000000 * (chromosome[2:num.nonzero.exons] - + chromosome[1:(num.nonzero.exons - 1)])) + return(data.frame(target=counts[, "target"], distance=distances)) +} + +EstimateVariance <- function(counts, ref.sample.names, sample.weights){ + library(Hmisc) + counts$var <- apply(counts[, ref.sample.names], 1, wtd.var, sample.weights, normwt=T) + set.seed(1) + counts.subset <- counts[sample(nrow(counts), min(36000, nrow(counts))), ] + library(mgcv) + # can't do gamma regression with negative + counts.subset$var[counts.subset$var==0] <- 0.1 + fit <- gam(var ~ s(mean) + s(gc), family=Gamma(link=log), data=counts.subset) + # we don't want variance less than Poisson + # we take maximum of genome-wide estimate, method of moments estimate + # and Poisson variance + v.estimate <- pmax(predict(fit, counts, type="response"), counts$var, + counts$mean * 1.01) + return(data.frame(target=counts$target, var.estimate=v.estimate)) +} + +EmissionProbs <- function(test.counts, target.means, + var.estimate, targets){ + num.targets <- length(test.counts) + # calculate the means for the deletion, normal and duplication states + state.target.means <- t(apply(data.frame(x=target.means), 1, function(x) c(x*1/2, x, x*3/2))) + # calculate the expected size (given the predicted variance) + size <- target.means ^ 2 / (var.estimate - target.means) + emission.probs <- matrix(NA, num.targets, 4) + colnames(emission.probs) <- c("target", "delprob", "normalprob", "dupprob") + # calculate the emission probabilities given the read count + size.del <- size + size.dup <- size + size.del <- size / 2 + size.dup <- size * 3 / 2 + emission.probs[, "delprob"] <- dnbinom( + test.counts, + mu=state.target.means[, 1], + size=size.del, log=T) + emission.probs[, "normalprob"] <- dnbinom( + test.counts, + mu=state.target.means[, 2], + size=size, log=T) + emission.probs[, "dupprob"] <- dnbinom( + test.counts, + mu=state.target.means[, 3], + size=size.dup, log=T) + emission.probs[, "target"] <- targets + # some values may be infinite as a result of extreme read count + row.all.inf <- which(apply(emission.probs, 1, function(x){all(is.infinite(x))})) + if (length(row.all.inf) > 0){ + for (i in row.all.inf){ + if (test.counts[i] >= state.target.means[i, 3]){ + emission.probs[i, 2:4] <- c(-Inf, -Inf, -0.01) + } + else if (test.counts[i] <= state.target.means[i, 1]){ + emission.probs[i, 2:4] <- c(-0.01, -Inf, -Inf) + } + else emission.probs[i, 2:4] <- c(-Inf, -0.01, -Inf) + } + } + return(emission.probs) +} + +# Viterbi algorithm +Viterbi <- function(emission.probs.matrix, distances, p, Tnum, D){ + targets <- emission.probs.matrix[, 1] + emission.probs.matrix <- as.matrix(emission.probs.matrix[, 2:4]) + num.exons <- dim(emission.probs.matrix)[1] + viterbi.matrix <- matrix(NA, nrow=num.exons, ncol=NUM.STATES) + viterbi.pointers <- matrix(NA, nrow=num.exons, ncol=NUM.STATES) + initial.state <- log(c(0.0075 / NUM.ABNORMAL.STATES, 1 - 0.0075, 0.0075 / NUM.ABNORMAL.STATES)) + viterbi.matrix[1, ] <- initial.state + emission.probs.matrix[1,] + for (i in 2:num.exons) { + temp.matrix <- viterbi.matrix[i - 1, ] + GetTransitionMatrix(distances$distance[i], p, Tnum, D) + viterbi.matrix[i, ] <- apply(temp.matrix, 2, max) + emission.probs <- c(emission.probs.matrix[i,]) + dim(emission.probs) <- c(NUM.STATES, 1) + viterbi.matrix[i, ] <- viterbi.matrix[i, ] + emission.probs + viterbi.pointers[i, ] <- apply(temp.matrix, 2, which.max) + } + viterbi.states = vector(length = num.exons) + viterbi.states[num.exons] = which.max(viterbi.matrix[num.exons, ]) + for (i in (num.exons - 1):1) { + viterbi.states[i] <- viterbi.pointers[i + 1, viterbi.states[i + 1]] + } + return(data.frame(target=targets, viterbi.state=viterbi.states)) +} + +# returns a transition matrix +# to state +# deletion normal duplication +# deletion +#from state normal +# duplication +GetTransitionMatrix <- function(distance, p, Tnum, D){ + q <- 1 / Tnum + f = exp(-distance/D) + prob.abnormal.abnormal <- f * (1 - q) + (1 - f) * p + prob.abnormal.normal <- f * q + (1 - f) * (1 - 2 * p) + prob.abnormal.diff.abnormal <- (1 - f) * p + prob.normal.normal <- 1 - 2 * p + prob.normal.abnormal <- p + transition.probs <- + c(prob.abnormal.abnormal, prob.abnormal.normal, prob.abnormal.diff.abnormal, + prob.normal.abnormal, prob.normal.normal, prob.normal.abnormal, + prob.abnormal.diff.abnormal, prob.abnormal.normal, prob.abnormal.abnormal) + transition.m = log(matrix(transition.probs, NUM.STATES, NUM.STATES, byrow=TRUE)) + return(transition.m) +} + +# adds two log-space probabilities using the identity +# log (p1 + p2) = log p1 + log(1 + exp(log p2 - log p1)) +AddTwoProbabilities <- function(x, y){ + if (is.infinite(x)) return (y) + if (is.infinite(y)) return (x) + sum.probs <- max(x, y) + log1p(exp(-abs(x - y))) +} + +# adds multiple log-space probabilities +SumProbabilities <- function(x){ + sum.probs <- x[1] + for (i in 2:length(x)){ + sum.probs <- AddTwoProbabilities(sum.probs, x[i]) + } + return(sum.probs) +} + +# finds the data likelihood by summing the product of the corresponding +# forward and backward probabilities at any token (should give the same value +# regardless of the token) +GetLikelihood <- function(forward.matrix, backward.matrix, x){ + SumProbabilities(forward.matrix[x, ] + backward.matrix[x, ]) +} + +# get the forward probabilities +GetForwardMatrix <- function(emission.probs.matrix, distances, p, Tnum, D){ + emission.probs.matrix <- as.matrix(emission.probs.matrix[, 2:4]) + num.exons <- dim(emission.probs.matrix)[1] + forward.matrix <- matrix(NA, nrow=num.exons, ncol=NUM.STATES) # matrix to hold forward probabilities + initial.state <- log(c(0.0075 / NUM.ABNORMAL.STATES, 1 - 0.0075, 0.0075 / NUM.ABNORMAL.STATES)) + forward.matrix[1, ] <- initial.state + emission.probs.matrix[1, ] + for (i in 2:num.exons){ + # compute matrix with probability we were in state j and are now in state i + # in temp.matrix[j, i] (ignoring emission of current token) + temp.matrix <- forward.matrix[i - 1, ] + GetTransitionMatrix(distances$distance[i], p, Tnum, D) + # find the probability that we are in each of the three states + sum.probs <- apply(temp.matrix, 2, SumProbabilities) + forward.matrix[i, ] <- sum.probs + emission.probs.matrix[i, ] + } + return(forward.matrix) +} + +# get the backward probabilities +GetBackwardMatrix <- function(emission.probs.matrix, distances, + p, Tnum, D){ + emission.probs.matrix <- as.matrix(emission.probs.matrix[, 2:4]) + num.exons <- dim(emission.probs.matrix)[1] + backward.matrix <- matrix(NA, nrow=num.exons, ncol=NUM.STATES) # matrix to hold backward probabilities + initial.state <- log(c(0.0075 / NUM.ABNORMAL.STATES, 1 - 0.0075, 0.0075 / NUM.ABNORMAL.STATES)) + backward.matrix[num.exons, ] <- rep(0, NUM.STATES) + for (i in (num.exons - 1):1){ + temp.matrix <- GetTransitionMatrix(distances$distance[i+1], p, Tnum, D) + + matrix(backward.matrix[i + 1, ], 3, 3, byrow=T) + + matrix(emission.probs.matrix[i+1, ], 3, 3, byrow=T) + backward.matrix[i, ] <- apply(temp.matrix, 1, SumProbabilities) + } + final.prob <- backward.matrix[1, ] + emission.probs.matrix[1, ] + initial.state + return(backward.matrix) +} + +# find the likelihood of the data given that certain states are disallowed +# between start target and end target +GetModifiedLikelihood <- function(forward.matrix, backward.matrix, emission.probs.matrix, distances, + start.target, end.target, disallowed.states, p, Tnum, D){ + targets <- emission.probs.matrix[, 1] + emission.probs.matrix <- as.matrix(emission.probs.matrix[, 2:4]) + # there may be missing targets in this sample, we genotype the largest stretch of + # targets that lie in the CNV + left.target <- min(which(targets >= start.target)) + right.target <- max(which(targets <= end.target)) + num.exons <- dim(emission.probs.matrix)[1] + unmodified.likelihood <- GetLikelihood(forward.matrix, + backward.matrix, min(right.target + 1, num.exons)) + #right.target or left.target may be empty + + #if (right.target >= left.target) return(c(NA, unmodified.likelihood)) + stopifnot(right.target >= left.target) + modified.emission.probs.matrix <- emission.probs.matrix + modified.emission.probs.matrix[left.target:right.target, + disallowed.states] <- -Inf + + # if the start target is the first target we need to recalculate the + # forward probabilities + # for that target, using the modified emission probabilities + if (left.target == 1){ + initial.state <- log(c(0.0075 / NUM.ABNORMAL.STATES, 1 - 0.0075, 0.0075 / NUM.ABNORMAL.STATES)) + forward.matrix[1, ] <- initial.state + modified.emission.probs.matrix[1, ] + left.target <- left.target + 1 + } + for (i in seq(left.target, min(right.target + 1, num.exons))){ + # compute matrix with probability we were in state j and are now in state i + # in temp.matrix[j, i] (ignoring emission of current token) + temp.matrix <- forward.matrix[i - 1, ] + GetTransitionMatrix(distances$distance[i], p, Tnum, D) + # find the probability that we are in each of the three states + sum.probs <- apply(temp.matrix, 2, SumProbabilities) + if (!i == (right.target + 1)){ + forward.matrix[i, ] <- sum.probs + modified.emission.probs.matrix[i, ] + } else{ + forward.matrix[i, ] <- sum.probs + emission.probs.matrix[i, ] + } + } + # find the modified likelihood of the sequence + modified.likelihood <- GetLikelihood(forward.matrix, backward.matrix, min(right.target + 1, num.exons)) + return(c(modified.likelihood, unmodified.likelihood)) +} + +SummarizeCNVs <- function(cnv.targets, counts, sample.name, state){ + sample.name <- sample.name + cnv.type <- ifelse(state==3, "DUP", "DEL") + cnv.start <- min(cnv.targets$target) + cnv.end <- max(cnv.targets$target) + cnv.chromosome <- counts[cnv.start, "chromosome"] + cnv.start.base <- counts[cnv.start, "start"] + cnv.start.target <- counts[cnv.start, "target"] + cnv.end.base <- counts[cnv.end, "end"] + cnv.end.target <- counts[cnv.end, "target"] + cnv.kbs <- (cnv.end.base - cnv.start.base) / 1000 + cnv.midbp <- round((cnv.end.base - cnv.start.base) / 2) + cnv.start.base + cnv.targets <- paste(cnv.start.target, "..", cnv.end.target, sep="") + cnv.interval <- paste(cnv.chromosome, ":", cnv.start.base, "-", cnv.end.base, sep="") + num.targets <- cnv.end.target - cnv.start.target + 1 + return(data.frame(sample.name=sample.name, cnv.type=cnv.type, cnv.interval=cnv.interval, + cnv.kbs=cnv.kbs, cnv.chromosome=cnv.chromosome, + cnv.midbp=cnv.midbp, cnv.targets=cnv.targets, num.targets=num.targets)) +} + +PrintCNVs <- function(test.sample.name, viterbi.state, + nonzero.counts){ + consecutiveGroups <- function(sequence){ + num <- length(sequence) + group <- 1 + groups <- rep(0, num) + groups[1] <- group + if (num > 1){ + for (i in 2:num){ + if (!sequence[i] == (sequence[i - 1] + 1)) group <- group + 1 + groups[i] <- group + } + } + return(groups) + } + num.duplications <- 0 + num.deletions <- 0 + for (state in c(1, 3)){ + cnv.targets <- which(viterbi.state$viterbi.state == state) + if (!length(cnv.targets) == 0){ + groups <- consecutiveGroups(cnv.targets) + library(plyr) + cnvs.temp.df <- ddply(data.frame(target=cnv.targets, group=groups), + "group", SummarizeCNVs, nonzero.counts, test.sample.name, + state) + if (state == 1){ + deletions.df <- cnvs.temp.df + if (!is.null(dim(deletions.df))){ + num.deletions <- dim(deletions.df)[1] + } + } else { + duplications.df <- cnvs.temp.df + if (!is.null(dim(duplications.df))){ + num.duplications <- dim(duplications.df)[1] + } + } + } + } + num.calls <- num.deletions + num.duplications + cat(num.calls, "CNVs called in sample", test.sample.name, "\n") + if (num.deletions == 0 & num.duplications == 0){ + df <- data.frame(SAMPLE=character(0), CNV=character(0), INTERVAL=character(0), + KB=numeric(0), CHR=character(0), + MID_BP=numeric(), TARGETS=character(0), NUM_TARG=numeric(0), Q_SOME=numeric(0), MLCN=numeric(0)) + return(df) + } + if (num.deletions > 0 & num.duplications > 0){ + cnvs.df <- rbind(deletions.df, duplications.df) + } else { + ifelse(num.deletions > 0, + cnvs.df <- deletions.df, cnvs.df <- duplications.df) + } + xcnv <- cbind(cnvs.df[, c("sample.name", "cnv.type", "cnv.interval", + "cnv.kbs", "cnv.chromosome", "cnv.midbp", + "cnv.targets", "num.targets")], 0) + colnames(xcnv) <- c("SAMPLE", "CNV", "INTERVAL", "KB", "CHR", "MID_BP", "TARGETS", + "NUM_TARG", "MLCN") + xcnv$Q_SOME <- NA + return(xcnv) +} + +CalcCopyNumber <- function(data, cnvs, homdel.mean){ + for (i in 1:nrow(cnvs)){ + cnv <- cnvs[i, ] + targets <- as.numeric(unlist(strsplit(as.character(cnv$TARGETS), "..", fixed=T))) + cnv.data <- subset(data, target >= targets[1] & target <= targets[2]) + state.target.means <- t(apply(data.frame(x=cnv.data$countsmean), 1, + function(x) c(C1=x*1/2, C2=x, C3=x*3/2, + C4=x * 2, C5=x * 5/2, C6=x*6/2))) + # calculate the expected size (given the predicted variance) + size <- cnv.data$countsmean ^ 2 / (cnv.data$varestimate - cnv.data$countsmean) + emission.probs <- matrix(NA, nrow(cnv.data), 7) + colnames(emission.probs) <- c("C0", "C1", "C2", "C3", "C4", "C5", "C6") + #colnames(emission.probs) <- c("target", "delprob", "normalprob", "dupprob") + # calculate the emission probabilities given the read count + emission.probs[, 1] <- dpois(cnv.data$sample, homdel.mean, log=T) + for (s in 1:6){ + size.state <- size * s/2 + emission.probs[, s+1] <- dnbinom(cnv.data$sample, mu=state.target.means[, s], + size=size.state, log=T) + } + cs <- colSums(emission.probs) + ml.state <- which.max(cs) - 1 + if (ml.state==2){ + ml.state <- ifelse(cnv$CNV=="DEL", 1, 3) + } + cnvs$MLCN[i] <- ml.state + } + return(cnvs) +} + diff --git a/R/CANOES/R/run_CANOES.R b/R/CANOES/R/run_CANOES.R new file mode 100644 index 0000000..65ddb36 --- /dev/null +++ b/R/CANOES/R/run_CANOES.R @@ -0,0 +1,32 @@ +Test <- function(){ + # read in the data + gc <- read.table("gc.txt")$V2 + canoes.reads <- read.table("canoes.reads.txt") + # rename the columns of canoes.reads + sample.names <- paste("S", seq(1:26), sep="") + names(canoes.reads) <- c("chromosome", "start", "end", sample.names) + # create a vector of consecutive target ids + target <- seq(1, nrow(canoes.reads)) + # combine the data into one data frame + canoes.reads <- cbind(target, gc, canoes.reads) + # call CNVs in each sample + # create a vector to hold the results for each sample + xcnv.list <- vector('list', length(sample.names)) + for (i in 1:length(sample.names)){ + xcnv.list[[i]] <- CallCNVs(sample.names[i], canoes.reads) + } + # combine the results into one data frame + xcnvs <- do.call('rbind', xcnv.list) + # inspect the first two CNV calls + print(head(xcnvs, 2)) + # plot all the CNV calls to a pdf + pdf("CNVplots.pdf") + for (i in 1:nrow(xcnvs)){ + PlotCNV(canoes.reads, xcnvs[i, "SAMPLE"], xcnvs[i, "TARGETS"]) + } + dev.off() + # genotype all the CNVs calls made above in sample S2 + genotyping.S2 <- GenotypeCNVs(xcnvs, "S2", canoes.reads) + # inspect the genotype scores for the first two CNV calls + print(head(genotyping.S2, 2)) +} From 0edbf3686e6dd7a011b1896c1b0c5846f6294b72 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Wiktor=20Ku=C5=9Bmirek?= Date: Tue, 20 Mar 2018 16:36:44 +0100 Subject: [PATCH 009/114] CANOES package in Jenkinsfile --- Jenkinsfile | 1 + R/CANOES/DESCRIPTION | 14 +++----------- 2 files changed, 4 insertions(+), 11 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index 4c88e48..dca1340 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -29,6 +29,7 @@ pipeline { sh "cd R && R CMD build CODEXCOV/ && curl -v --user ${NEXUS_USER}:${NEXUS_PASS} --upload-file CODEXCOV_0.0.1.tar.gz http://zsibio.ii.pw.edu.pl/nexus/repository/r-zsibio/src/contrib/CODEXCOV_0.0.1.tar.gz" sh "cd R && R CMD build EXOMEDEPTHCOV/ && curl -v --user ${NEXUS_USER}:${NEXUS_PASS} --upload-file EXOMEDEPTHCOV_0.0.1.tar.gz http://zsibio.ii.pw.edu.pl/nexus/repository/r-zsibio/src/contrib/EXOMEDEPTHCOV_0.0.1.tar.gz" sh "cd R && R CMD build CANOESCOV/ && curl -v --user ${NEXUS_USER}:${NEXUS_PASS} --upload-file CANOESCOV_0.0.1.tar.gz http://zsibio.ii.pw.edu.pl/nexus/repository/r-zsibio/src/contrib/CANOESCOV_0.0.1.tar.gz" + sh "cd R && R CMD build CANOES/ && curl -v --user ${NEXUS_USER}:${NEXUS_PASS} --upload-file CANOES_0.0.1.tar.gz http://zsibio.ii.pw.edu.pl/nexus/repository/r-zsibio/src/contrib/CANOES_0.0.1.tar.gz" sh "cd R && R CMD build CNVCALLER.RUNNER/ && curl -v --user ${NEXUS_USER}:${NEXUS_PASS} --upload-file CNVCALLER.RUNNER_0.0.1.tar.gz http://zsibio.ii.pw.edu.pl/nexus/repository/r-zsibio/src/contrib/CNVCALLER.RUNNER_0.0.1.tar.gz" sh "cd R && R CMD build CNVCALLER.EVALUATOR/ && curl -v --user ${NEXUS_USER}:${NEXUS_PASS} --upload-file CNVCALLER.EVALUATOR_0.0.1.tar.gz http://zsibio.ii.pw.edu.pl/nexus/repository/r-zsibio/src/contrib/CNVCALLER.EVALUATOR_0.0.1.tar.gz" } diff --git a/R/CANOES/DESCRIPTION b/R/CANOES/DESCRIPTION index 314f419..7f4bc54 100644 --- a/R/CANOES/DESCRIPTION +++ b/R/CANOES/DESCRIPTION @@ -1,25 +1,17 @@ Package: CANOESCOV -Title: CANOES Package With Interface To External Coverage File +Title: CANOES Package Version: 0.0.1 Authors@R: c( person("Tomasz", "Gambin", email = "tgambin@gmail.com", role = c("aut", "cre")), person("Marek", "Wiewiórka", email = "marek.wiewiorka@gmail.com", role = c("aut")), person("Wiktor", "Kuśmirek", email = "kusmirekwiktor@gmail.com", role = c("aut"))) -Description: An extended implementation of the CANOES package in R. It extends - original implementation by using external coverage file, which should - speed up calculations for running application with multiple sets of input - parameters. +Description: An implementation of the CANOES package in R. Depends: R (>= 3.2.3), - devtools (>= 1.13.2), - DBI (== 0.8), - optparse (== 1.4.4), - IRanges (>= 2.0.0), plyr (>= 1.8.4), nnls (>= 1.4.0), Hmisc (>= 4.0.0), - mgcv (>= 1.8.0), - REFERENCE.SAMPLE.SET.SELECTOR (>= 0.0.1) + mgcv (>= 1.8.0) License: GPL-3 Encoding: UTF-8 LazyData: true From 28e60cdf6dbf047ccb86a26ff31e906287b2071d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Wiktor=20Ku=C5=9Bmirek?= Date: Tue, 20 Mar 2018 17:10:40 +0100 Subject: [PATCH 010/114] bugfix --- R/CANOES/DESCRIPTION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/CANOES/DESCRIPTION b/R/CANOES/DESCRIPTION index 7f4bc54..0824435 100644 --- a/R/CANOES/DESCRIPTION +++ b/R/CANOES/DESCRIPTION @@ -1,4 +1,4 @@ -Package: CANOESCOV +Package: CANOES Title: CANOES Package Version: 0.0.1 Authors@R: c( From f3bb5c97bcba8dc057bd350932c9c1c9f480e387 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Wiktor=20Ku=C5=9Bmirek?= Date: Tue, 20 Mar 2018 17:33:29 +0100 Subject: [PATCH 011/114] Canoes added to docker container --- Docker/cnv-opt-canoes/Dockerfile | 1 + 1 file changed, 1 insertion(+) diff --git a/Docker/cnv-opt-canoes/Dockerfile b/Docker/cnv-opt-canoes/Dockerfile index 6667651..b404cab 100644 --- a/Docker/cnv-opt-canoes/Dockerfile +++ b/Docker/cnv-opt-canoes/Dockerfile @@ -15,3 +15,4 @@ RUN Rscript -e "install.packages('nnls', repos = 'http://cran.us.r-project.org') RUN Rscript -e "install.packages('Hmisc', repos = 'http://cran.us.r-project.org')" RUN Rscript -e "install.packages('mgcv', repos = 'http://cran.us.r-project.org')" RUN Rscript -e "install.packages('plyr', repos = 'http://cran.us.r-project.org')" +RUN Rscript -e "install.packages('CANOES', repos = 'http://zsibio.ii.pw.edu.pl/nexus/repository/r-all')" From 7a925405474bd21cf1c6a11fbfac67567eb2828f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Wiktor=20Ku=C5=9Bmirek?= Date: Thu, 22 Mar 2018 16:20:15 +0100 Subject: [PATCH 012/114] TARGET.QC package in docker --- Docker/cnv-opt-target-qc/Dockerfile | 9 +++++++-- R/TARGET.QC/R/run_TARGET.QC.R | 2 -- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/Docker/cnv-opt-target-qc/Dockerfile b/Docker/cnv-opt-target-qc/Dockerfile index ab19c2f..cccf0e0 100644 --- a/Docker/cnv-opt-target-qc/Dockerfile +++ b/Docker/cnv-opt-target-qc/Dockerfile @@ -1,11 +1,16 @@ FROM ubuntu:xenial MAINTAINER biodatageeks +RUN apt-get update +RUN apt-get install -y software-properties-common +RUN apt-key adv --keyserver keyserver.ubuntu.com --recv-keys E298A3A825C0D65DFD57CBB651716619E084DAB9 +RUN add-apt-repository 'deb [arch=amd64,i386] https://cran.rstudio.com/bin/linux/ubuntu xenial/' +RUN apt-get install -y apt-transport-https + RUN apt-get update && \ apt-get upgrade -y && \ apt-get install -y r-base libssl-dev libssh2-1-dev libxml2-dev libcurl4-openssl-dev libpq-dev -RUN Rscript -e "install.packages('devtools', repos='http://cran.cnr.berkeley.edu'); devtools::install_github('hadley/testthat')" - RUN Rscript -e "source('https://bioconductor.org/biocLite.R');biocLite('CODEX')" +RUN Rscript -e "install.packages('TARGET.QC', repos = 'http://zsibio.ii.pw.edu.pl/nexus/repository/r-all')" diff --git a/R/TARGET.QC/R/run_TARGET.QC.R b/R/TARGET.QC/R/run_TARGET.QC.R index 23eed55..be7b841 100644 --- a/R/TARGET.QC/R/run_TARGET.QC.R +++ b/R/TARGET.QC/R/run_TARGET.QC.R @@ -13,8 +13,6 @@ run_TARGET.QC <- function(mapp_thresh, #length_thresh_to <- 2000 #gc_thresh_from <- 20 #gc_thresh_to <- 80 - #K_from <- 1 - #K_to <- 9 #lmax <- 200 sampname <- unique(cov_table[,"sample_name"]) targets <- cov_table[,c("target_id", "chr", "pos_min", "pos_max")] From 28c33116848c102d384ba05db77b69bc3a7b8cbf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Wiktor=20Ku=C5=9Bmirek?= Date: Thu, 22 Mar 2018 19:07:30 +0100 Subject: [PATCH 013/114] switch off R tests to speed up development --- Jenkinsfile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index dca1340..ce4967f 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -9,7 +9,7 @@ pipeline { } } - stage('Test R code') { + /*stage('Test R code') { steps { echo 'Testing R code....' sh 'docker run -i --rm --network="host" -e CNV_OPT_PSQL_USER="cnv-opt" -e CNV_OPT_PSQL_PASSWORD="zsibio321" -e CNV_OPT_PSQL_DRV_URL="http://zsibio.ii.pw.edu.pl/nexus/repository/zsi-bio-raw/common/jdbc/postgresql-42.1.1.jar" -e CNV_OPT_PSQL_CONN_URL="jdbc:postgresql://cdh00.ii.pw.edu.pl:15432/cnv-opt" -w="/tmp" -v $(pwd | sed "s|/var/jenkins_home|/data/home/jenkins|g")/R:/tmp zsibio.ii.pw.edu.pl:50009/zsi-bio-toolset Rscript tests/run_tests.R' @@ -19,7 +19,7 @@ pipeline { junit '**R/tests/*.xml' } } - } + }*/ stage('Build R package') { steps { From e515fefa952cecc9bdbacfdc6bbe2e2ce69c33fa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Wiktor=20Ku=C5=9Bmirek?= Date: Thu, 22 Mar 2018 19:07:51 +0100 Subject: [PATCH 014/114] TARGET.QC package with reading and writing coverage table --- R/TARGET.QC/R/run_TARGET.QC.R | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/R/TARGET.QC/R/run_TARGET.QC.R b/R/TARGET.QC/R/run_TARGET.QC.R index be7b841..ca7e3be 100644 --- a/R/TARGET.QC/R/run_TARGET.QC.R +++ b/R/TARGET.QC/R/run_TARGET.QC.R @@ -5,7 +5,8 @@ run_TARGET.QC <- function(mapp_thresh, length_thresh_to, gc_thresh_from, gc_thresh_to, - cov_table){ + input_cov_table, + output_cov_table){ #mapp_thresh <- 0.9 #cov_thresh_from <- 20 #cov_thresh_to <- 4000 @@ -14,6 +15,7 @@ run_TARGET.QC <- function(mapp_thresh, #gc_thresh_from <- 20 #gc_thresh_to <- 80 #lmax <- 200 + cov_table <- read.csv(input_cov_table) sampname <- unique(cov_table[,"sample_name"]) targets <- cov_table[,c("target_id", "chr", "pos_min", "pos_max")] targets <- targets[!duplicated(targets[,"target_id"]),] @@ -50,7 +52,7 @@ run_TARGET.QC <- function(mapp_thresh, cov_table_qc[,"pos_max"] <- strtoi(cov_table_qc[,"pos_max"]) cov_table_qc[,"target_id"] <- strtoi(cov_table_qc[,"target_id"]) cov_table_qc[,"read_count"] <- strtoi(cov_table_qc[,"read_count"]) - cov_table_qc + write.csv(cov_table_qc, output_cov_table) } # sample_name target_id chr pos_min pos_max read_count From 905ed26d4eae0c2fb4cd37230236f45e5f042214 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Wiktor=20Ku=C5=9Bmirek?= Date: Thu, 22 Mar 2018 19:38:28 +0100 Subject: [PATCH 015/114] bugfixes in TARGET.QC package --- Docker/cnv-opt-target-qc/Dockerfile | 14 +------------- R/TARGET.QC/R/run_TARGET.QC.R | 2 +- 2 files changed, 2 insertions(+), 14 deletions(-) diff --git a/Docker/cnv-opt-target-qc/Dockerfile b/Docker/cnv-opt-target-qc/Dockerfile index cccf0e0..8391b91 100644 --- a/Docker/cnv-opt-target-qc/Dockerfile +++ b/Docker/cnv-opt-target-qc/Dockerfile @@ -1,16 +1,4 @@ -FROM ubuntu:xenial +FROM biodatageeks/cnv-opt-codex MAINTAINER biodatageeks -RUN apt-get update -RUN apt-get install -y software-properties-common -RUN apt-key adv --keyserver keyserver.ubuntu.com --recv-keys E298A3A825C0D65DFD57CBB651716619E084DAB9 -RUN add-apt-repository 'deb [arch=amd64,i386] https://cran.rstudio.com/bin/linux/ubuntu xenial/' -RUN apt-get install -y apt-transport-https - -RUN apt-get update && \ - apt-get upgrade -y && \ - apt-get install -y r-base libssl-dev libssh2-1-dev libxml2-dev libcurl4-openssl-dev libpq-dev - -RUN Rscript -e "source('https://bioconductor.org/biocLite.R');biocLite('CODEX')" - RUN Rscript -e "install.packages('TARGET.QC', repos = 'http://zsibio.ii.pw.edu.pl/nexus/repository/r-all')" diff --git a/R/TARGET.QC/R/run_TARGET.QC.R b/R/TARGET.QC/R/run_TARGET.QC.R index ca7e3be..cef44ee 100644 --- a/R/TARGET.QC/R/run_TARGET.QC.R +++ b/R/TARGET.QC/R/run_TARGET.QC.R @@ -52,7 +52,7 @@ run_TARGET.QC <- function(mapp_thresh, cov_table_qc[,"pos_max"] <- strtoi(cov_table_qc[,"pos_max"]) cov_table_qc[,"target_id"] <- strtoi(cov_table_qc[,"target_id"]) cov_table_qc[,"read_count"] <- strtoi(cov_table_qc[,"read_count"]) - write.csv(cov_table_qc, output_cov_table) + write.csv(cov_table_qc, output_cov_table, row.names=F, quote=F) } # sample_name target_id chr pos_min pos_max read_count From 53dcff69eeb4d892bf0bda4fb1c77f6fb73ea685 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Wiktor=20Ku=C5=9Bmirek?= Date: Thu, 22 Mar 2018 19:49:45 +0100 Subject: [PATCH 016/114] building dockers without cache - to reload R packages while development --- build.sh | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/build.sh b/build.sh index cae7f57..f2dba85 100755 --- a/build.sh +++ b/build.sh @@ -34,8 +34,10 @@ do diffTs=`echo "$(date +%s) - $(git log -n 1 --pretty=format:%at ${dir})" | bc` if [ $diffTs -lt $MAX_COMMIT_TS_DIFF ]; then cd $dir - docker build -t $image:$version . - docker build -t $image:latest . + #docker build -t $image:$version . + #docker build -t $image:latest . + docker build --no-cache -t $image:$version . + docker build --no-cache -t $image:latest . if [[ ${BUILD_MODE} != "local" ]]; then docker push docker.io/$image:latest docker push docker.io/$image:$version From 40292ef35ea48bde0f08be2e1f6862d62f3c4390 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Wiktor=20Ku=C5=9Bmirek?= Date: Thu, 22 Mar 2018 20:36:09 +0100 Subject: [PATCH 017/114] proper order of column in TARGET.QC results --- R/TARGET.QC/R/run_TARGET.QC.R | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/R/TARGET.QC/R/run_TARGET.QC.R b/R/TARGET.QC/R/run_TARGET.QC.R index cef44ee..664c997 100644 --- a/R/TARGET.QC/R/run_TARGET.QC.R +++ b/R/TARGET.QC/R/run_TARGET.QC.R @@ -52,12 +52,8 @@ run_TARGET.QC <- function(mapp_thresh, cov_table_qc[,"pos_max"] <- strtoi(cov_table_qc[,"pos_max"]) cov_table_qc[,"target_id"] <- strtoi(cov_table_qc[,"target_id"]) cov_table_qc[,"read_count"] <- strtoi(cov_table_qc[,"read_count"]) + cov_table_qc <- cbind(cov_table_qc[,"chr"], cov_table_qc[,"sample_name"], cov_table_qc[,"pos_min"], cov_table_qc[,"pos_max"], cov_table_qc[,"read_count"], cov_table_qc[,"target_id"]) + colnames(cov_table_qc) <- c("chr", "sample_name", "pos_min", "pos_max", "read_count", "target_id") write.csv(cov_table_qc, output_cov_table, row.names=F, quote=F) } -# sample_name target_id chr pos_min pos_max read_count -#1 NA19012 193524 Y 25426932 25427053 0 -#2 NA19012 193525 Y 25431556 25431676 0 -#3 NA19012 193526 Y 25535089 25535239 0 -#4 NA19012 193527 Y 25537286 25537526 0 -#5 NA19012 193528 Y 25538793 25538913 0 From 3a7951d95f8f82ffea28618201ba8cf88a146d38 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Wiktor=20Ku=C5=9Bmirek?= Date: Thu, 22 Mar 2018 20:49:43 +0100 Subject: [PATCH 018/114] bugfix --- R/TARGET.QC/R/run_TARGET.QC.R | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/R/TARGET.QC/R/run_TARGET.QC.R b/R/TARGET.QC/R/run_TARGET.QC.R index 664c997..42ff0e6 100644 --- a/R/TARGET.QC/R/run_TARGET.QC.R +++ b/R/TARGET.QC/R/run_TARGET.QC.R @@ -52,8 +52,7 @@ run_TARGET.QC <- function(mapp_thresh, cov_table_qc[,"pos_max"] <- strtoi(cov_table_qc[,"pos_max"]) cov_table_qc[,"target_id"] <- strtoi(cov_table_qc[,"target_id"]) cov_table_qc[,"read_count"] <- strtoi(cov_table_qc[,"read_count"]) - cov_table_qc <- cbind(cov_table_qc[,"chr"], cov_table_qc[,"sample_name"], cov_table_qc[,"pos_min"], cov_table_qc[,"pos_max"], cov_table_qc[,"read_count"], cov_table_qc[,"target_id"]) - colnames(cov_table_qc) <- c("chr", "sample_name", "pos_min", "pos_max", "read_count", "target_id") + colnames(cov_table_qc) <- c("sample_name", "target_id", "chr", "pos_min", "pos_max", "read_count") write.csv(cov_table_qc, output_cov_table, row.names=F, quote=F) } From 1e5f202af2e4c7be3b30bf45e0cfb2c5c20a9800 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Wiktor=20Ku=C5=9Bmirek?= Date: Fri, 23 Mar 2018 12:57:19 +0100 Subject: [PATCH 019/114] dockerfile for REFERENCE.SAMPLE.SET.SELECTOR --- Docker/cnv-opt-reference-sample-set-selector/Dockerfile | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/Docker/cnv-opt-reference-sample-set-selector/Dockerfile b/Docker/cnv-opt-reference-sample-set-selector/Dockerfile index 5ff1007..7e82158 100644 --- a/Docker/cnv-opt-reference-sample-set-selector/Dockerfile +++ b/Docker/cnv-opt-reference-sample-set-selector/Dockerfile @@ -1,7 +1,6 @@ -FROM biodatageeks/bdg-spark +FROM biodatageeks/cnv-opt-codex MAINTAINER biodatageeks +RUN Rscript -e "install.packages('ExomeDepth', repos = 'http://cran.us.r-project.org')" - - - +RUN Rscript -e "install.packages('REFERENCE.SAMPLE.SET.SELECTOR', repos = 'http://zsibio.ii.pw.edu.pl/nexus/repository/r-all')" From 86022de887a13f44678ad24a93a29619be13c3fd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Wiktor=20Ku=C5=9Bmirek?= Date: Fri, 23 Mar 2018 14:47:28 +0100 Subject: [PATCH 020/114] new interface to run_REFERENCE.SAMPLE.SET.SELECTOR method --- .../functions_REFERENCE.SAMPLE.SET.SELECTOR.R | 14 +++++- .../R/run_REFERENCE.SAMPLE.SET.SELECTOR.R | 44 ++++++++++++++----- 2 files changed, 45 insertions(+), 13 deletions(-) diff --git a/R/REFERENCE.SAMPLE.SET.SELECTOR/R/functions_REFERENCE.SAMPLE.SET.SELECTOR.R b/R/REFERENCE.SAMPLE.SET.SELECTOR/R/functions_REFERENCE.SAMPLE.SET.SELECTOR.R index 6ac3736..70c9fe4 100644 --- a/R/REFERENCE.SAMPLE.SET.SELECTOR/R/functions_REFERENCE.SAMPLE.SET.SELECTOR.R +++ b/R/REFERENCE.SAMPLE.SET.SELECTOR/R/functions_REFERENCE.SAMPLE.SET.SELECTOR.R @@ -1,4 +1,15 @@ -library(ExomeDepth) + +coverageObj1 <- function(cov_table, sampname, targets_for_chr){ + Y <- matrix(data=as.integer(0), nrow = nrow(targets_for_chr), ncol = 0) + for(sample in sampname) { + cov_targets_for_sample <- cov_table[cov_table[,"sample_name"] == sample,] + cov_targets_for_sample <- cov_targets_for_sample[with(cov_targets_for_sample, order(target_id)), ] + Y <- cbind(Y, cov_targets_for_sample[,"read_count"]) + } + colnames(Y) <- sampname + rownames(Y) <- targets_for_chr[,"target_id"] + return(list(Y=Y)) +} canoes_method <- function(investigated_sample, Y, num_refs){ if (num_refs == 0) { @@ -14,6 +25,7 @@ canoes_method <- function(investigated_sample, Y, num_refs){ } exomedepth_method <- function(investigated_sample, Y, num_refs, target_length){ + library(ExomeDepth) samples <- colnames(Y) reference_samples <- setdiff(samples, investigated_sample) reference_set <- select.reference.set(test.counts = Y[,investigated_sample], diff --git a/R/REFERENCE.SAMPLE.SET.SELECTOR/R/run_REFERENCE.SAMPLE.SET.SELECTOR.R b/R/REFERENCE.SAMPLE.SET.SELECTOR/R/run_REFERENCE.SAMPLE.SET.SELECTOR.R index f025b95..cc38ad3 100644 --- a/R/REFERENCE.SAMPLE.SET.SELECTOR/R/run_REFERENCE.SAMPLE.SET.SELECTOR.R +++ b/R/REFERENCE.SAMPLE.SET.SELECTOR/R/run_REFERENCE.SAMPLE.SET.SELECTOR.R @@ -1,16 +1,36 @@ -run_REFERENCE.SAMPLE.SET.SELECTOR <- function(investigated_sample, - Y, +run_REFERENCE.SAMPLE.SET.SELECTOR <- function(input_cov_table, select_method, num_refs, - target_length){ - if(select_method == "canoes") { - reference_samples <- canoes_method(investigated_sample, Y, num_refs)$reference_samples - } else if(select_method == "codex") { - #reference_samples <- codex_method(investigated_sample, Y, num_refs)$reference_samples - } else if(select_method == "exomedepth") { - reference_samples <- exomedepth_method(investigated_sample, Y, num_refs, target_length)$reference_samples - } else if(select_method == "clamms") { - #reference_samples <- clamms_method(investigated_sample, Y, num_refs)$reference_samples + output_reference_file){ + + cov_table <- read.csv(input_cov_table) + sampname <- unique(cov_table[,"sample_name"]) + targets <- cov_table[,c("target_id", "chr", "pos_min", "pos_max")] + targets <- targets[!duplicated(targets[,"target_id"]),] + targets <- targets[with(targets, order(target_id)), ] + target_length <- targets[,"pos_max"] - targets[,"pos_min"] + Y <- coverageObj1(cov_table, sampname, targets)$Y + reference_samples <- list() + + for(i in 1:length(sampname)) { + investigated_sample <- as.character(sampname[i]) + if(select_method == "canoes") { + reference_samples_for_investigated_sample <- canoes_method(investigated_sample, Y, num_refs)$reference_samples + reference_samples[[i]] <- c(investigated_sample, reference_samples_for_investigated_sample) + } else if(select_method == "codex") { + #reference_samples_for_investigated_sample <- codex_method(investigated_sample, Y, num_refs)$reference_samples + #reference_samples[[i]] <- c(investigated_sample, reference_samples_for_investigated_sample) + } else if(select_method == "exomedepth") { + reference_samples_for_investigated_sample <- exomedepth_method(investigated_sample, Y, num_refs, target_length)$reference_samples + reference_samples[[i]] <- c(investigated_sample, reference_samples_for_investigated_sample) + } else if(select_method == "clamms") { + #reference_samples_for_investigated_sample <- clamms_method(investigated_sample, Y, num_refs)$reference_samples + #reference_samples[[i]] <- c(investigated_sample, reference_samples_for_investigated_sample) + } } - reference_samples + resultant_string <- '' + for(i in 1:length(reference_samples)) { + resultant_string <- paste(resultant_string, paste(reference_samples[[i]], collapse=","), '\n', sep="") + } + write(resultant_string, output_reference_file) } From da94fbde2b8f48df0e9042cea8fd7aa55cfd5770 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Wiktor=20Ku=C5=9Bmirek?= Date: Fri, 23 Mar 2018 15:11:33 +0100 Subject: [PATCH 021/114] change arguments order in run_REFERENCE.SAMPLE.SET.SELECTOR function --- .../R/run_REFERENCE.SAMPLE.SET.SELECTOR.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/R/REFERENCE.SAMPLE.SET.SELECTOR/R/run_REFERENCE.SAMPLE.SET.SELECTOR.R b/R/REFERENCE.SAMPLE.SET.SELECTOR/R/run_REFERENCE.SAMPLE.SET.SELECTOR.R index cc38ad3..c4492df 100644 --- a/R/REFERENCE.SAMPLE.SET.SELECTOR/R/run_REFERENCE.SAMPLE.SET.SELECTOR.R +++ b/R/REFERENCE.SAMPLE.SET.SELECTOR/R/run_REFERENCE.SAMPLE.SET.SELECTOR.R @@ -1,6 +1,6 @@ -run_REFERENCE.SAMPLE.SET.SELECTOR <- function(input_cov_table, - select_method, +run_REFERENCE.SAMPLE.SET.SELECTOR <- function(select_method, num_refs, + input_cov_table, output_reference_file){ cov_table <- read.csv(input_cov_table) From c75ff28705cbb72e79bc76a6589ee2464d6b1a66 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Wiktor=20Ku=C5=9Bmirek?= Date: Fri, 23 Mar 2018 18:48:08 +0100 Subject: [PATCH 022/114] draft of DAG for CODEX cnv caller --- airflow/dags/codex.py | 52 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 52 insertions(+) create mode 100755 airflow/dags/codex.py diff --git a/airflow/dags/codex.py b/airflow/dags/codex.py new file mode 100755 index 0000000..55ee94b --- /dev/null +++ b/airflow/dags/codex.py @@ -0,0 +1,52 @@ +from airflow import DAG +from airflow.operators.bash_operator import BashOperator +from airflow.models import Variable +from datetime import datetime, timedelta + +default_args = { + 'owner': 'biodatageeks', + 'depends_on_past': False, + 'start_date': datetime(2017, 10, 18), + 'email': ['team@biodatageeks.ii.pw.edu.pl'], + 'email_on_failure': False, + 'email_on_retry': False, + 'retries': 0 +} + +dag = DAG( + 'codex', default_args=default_args, schedule_interval=None) + +############################################## +########## RUN RAW CODEX CNV CALLER ########## +############################################## + +### target qc parameters +mapp_thresh = '0.9' +cov_thresh_from = '20' +cov_thresh_to = '4000' +length_thresh_from = '20' +length_thresh_to = '2000' +gc_thresh_from = '20' +gc_thresh_to = '80' +raw_cov_table = 'input_cov_table.csv' +qc_cov_table = 'output_cov_table.csv' + +### select reference sample set parameters +select_method = 'exomedepth' # "canoes", "codex" or "exomedepth" +num_refs = '30' +reference_sample_set_file = 'reference_sample_set.csv' + +run_codex_caller_cmd= " \ +docker pull biodatageeks/cnv-opt-target-qc; \ +docker run --rm -v /tmp:/tmp -w=\"/tmp\" biodatageeks/cnv-opt-target-qc Rscript -e \"library(\'TARGET.QC\');run_TARGET.QC(" + mapp_thresh + "," + cov_thresh_from + "," + cov_thresh_to + "," + length_thresh_from + "," + length_thresh_to + "," + gc_thresh_from + "," + gc_thresh_to + ",'" + raw_cov_table + "','" + qc_cov_table + "')\"; \ +docker pull biodatageeks/cnv-opt-reference-sample-set-selector; \ +docker run --rm -v /tmp:/tmp -w=\"/tmp\" biodatageeks/cnv-opt-reference-sample-set-selector Rscript -e \"library(\'REFERENCE.SAMPLE.SET.SELECTOR\');run_REFERENCE.SAMPLE.SET.SELECTOR('" + select_method + "'," + num_refs + ",'" + qc_cov_table + "','" + reference_sample_set_file + "')\"; \ +" + +run_codex_caller_task= BashOperator ( + bash_command=run_codex_caller_cmd, + task_id='run_codex_caller_task', + dag=dag +) + +run_codex_caller_task From 3e8801676f171cd4461458f14ff24ac57b569102 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Wiktor=20Ku=C5=9Bmirek?= Date: Fri, 23 Mar 2018 23:53:30 +0100 Subject: [PATCH 023/114] EXOMEDEPTH package in docker --- R/EXOMEDEPTHCOV/DESCRIPTION | 1 - R/EXOMEDEPTHCOV/R/functions_EXOMEDEPTHCOV.R | 2 +- R/EXOMEDEPTHCOV/R/run_EXOMEDEPTHCOV.R | 34 ++++++++++----------- 3 files changed, 17 insertions(+), 20 deletions(-) diff --git a/R/EXOMEDEPTHCOV/DESCRIPTION b/R/EXOMEDEPTHCOV/DESCRIPTION index d4a3025..8305596 100644 --- a/R/EXOMEDEPTHCOV/DESCRIPTION +++ b/R/EXOMEDEPTHCOV/DESCRIPTION @@ -16,7 +16,6 @@ Depends: optparse (== 1.4.4), IRanges (>= 2.0.0), ExomeDepth (>= 1.1.10), - REFERENCE.SAMPLE.SET.SELECTOR (>= 0.0.1) License: GPL-3 Encoding: UTF-8 LazyData: true diff --git a/R/EXOMEDEPTHCOV/R/functions_EXOMEDEPTHCOV.R b/R/EXOMEDEPTHCOV/R/functions_EXOMEDEPTHCOV.R index 2b38d6b..550b4d0 100644 --- a/R/EXOMEDEPTHCOV/R/functions_EXOMEDEPTHCOV.R +++ b/R/EXOMEDEPTHCOV/R/functions_EXOMEDEPTHCOV.R @@ -1,6 +1,6 @@ library(ExomeDepth) -coverageObj1 <- function(cov_table, sampname, targets_for_chr, chr){ +coverageObj1 <- function(cov_table, sampname, targets_for_chr){ Y <- matrix(data=as.integer(0), nrow = nrow(targets_for_chr), ncol = 0) for(sample in sampname) { cov_targets_for_sample <- cov_table[cov_table[,"sample_name"] == sample,] diff --git a/R/EXOMEDEPTHCOV/R/run_EXOMEDEPTHCOV.R b/R/EXOMEDEPTHCOV/R/run_EXOMEDEPTHCOV.R index 295306e..f187712 100644 --- a/R/EXOMEDEPTHCOV/R/run_EXOMEDEPTHCOV.R +++ b/R/EXOMEDEPTHCOV/R/run_EXOMEDEPTHCOV.R @@ -1,15 +1,17 @@ library(ExomeDepth) library(methods) -run_EXOMEDEPTHCOV <- function(reference_set_select_method, - num_of_samples_in_reference_set, - cov_table){ +run_EXOMEDEPTHCOV <- function(input_cov_table, + reference_sample_set_file, + output_calls_file){ + con <- file(reference_sample_set_file, open='r') + reference_sample_set <- readLines(con) + cov_table <- read.csv(input_cov_table) sampname <- unique(cov_table[,"sample_name"]) targets <- cov_table[,c("target_id", "chr", "pos_min", "pos_max")] targets <- targets[!duplicated(targets[,"target_id"]),] targets <- targets[with(targets, order(target_id)), ] - calls <- data.frame(matrix(nrow=0, ncol=13)) chrs <- c(1:22, "X", "Y", paste0("chr",c(1:22, "X", "Y"))) library(IRanges) @@ -19,20 +21,16 @@ run_EXOMEDEPTHCOV <- function(reference_set_select_method, if (length(ref) == 0) { # 0 elements for specified chromosome in bed next() } - Y <- coverageObj1(cov_table, sampname, targets_for_chr, chr)$Y + Y <- coverageObj1(cov_table, sampname, targets_for_chr)$Y - for (actual_sample_id in 1:length(sampname)) { - actual_sample <- sampname[actual_sample_id] - ## ----reference.selection------------------------------------------------- - target_length <- c() - for (i in 1:nrow(Y)) { - target_length <- c(target_length, width(ref[i])) + for (i in 1:length(reference_sample_set)) { + if (reference_sample_set[[i]] == '') { + next() } - reference_samples <- run_REFERENCE.SAMPLE.SET.SELECTOR(actual_sample, - Y, - reference_set_select_method, - num_of_samples_in_reference_set, - target_length) + samples <- unlist(strsplit(reference_sample_set[[i]], ',')) + print(samples) + actual_sample <- samples[1] + reference_samples <- samples[-1] ## ----construct.ref------------------------------------------------------- my.matrix <- as.matrix(Y[,reference_samples]) @@ -42,7 +40,7 @@ run_EXOMEDEPTHCOV <- function(reference_set_select_method, ## ----build.complete------------------------------------------------------ all.exons <- new('ExomeDepth', - test = Y[,actual_sample_id], + test = Y[,actual_sample], reference = my.reference.selected, formula = 'cbind(test, reference) ~ 1') @@ -86,5 +84,5 @@ run_EXOMEDEPTHCOV <- function(reference_set_select_method, colnames(calls)[colnames(calls) == 'reads.observed'] <- 'raw_cov' colnames(calls)[colnames(calls) == 'reads.ratio'] <- 'copy_no' calls[colnames(calls) == 'copy_no'] <- round(calls[colnames(calls) == 'raw_cov'] / (calls[colnames(calls) == 'norm_cov'] / 2)) - calls + write.csv(calls, output_calls_file, row.names=F) } From 91c7e3c52c31cfb848c67fc619c64894d8aa48e7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Wiktor=20Ku=C5=9Bmirek?= Date: Sat, 24 Mar 2018 13:18:32 +0100 Subject: [PATCH 024/114] Dockerfile for EXOMEDEPTHCOV package --- Docker/cnv-opt-exomedepthcov/Dockerfile | 5 +++++ 1 file changed, 5 insertions(+) create mode 100644 Docker/cnv-opt-exomedepthcov/Dockerfile diff --git a/Docker/cnv-opt-exomedepthcov/Dockerfile b/Docker/cnv-opt-exomedepthcov/Dockerfile new file mode 100644 index 0000000..28448f8 --- /dev/null +++ b/Docker/cnv-opt-exomedepthcov/Dockerfile @@ -0,0 +1,5 @@ +FROM biodatageeks/cnv-opt-exomedepth +MAINTAINER biodatageeks + +RUN Rscript -e "install.packages('EXOMEDEPTHCOV', repos = 'http://zsibio.ii.pw.edu.pl/nexus/repository/r-all')" + From 862aabecde56668e4b07f803a416372305623795 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Wiktor=20Ku=C5=9Bmirek?= Date: Mon, 26 Mar 2018 15:57:19 +0200 Subject: [PATCH 025/114] first version of CODEXCOV package in docker image --- Docker/cnv-opt-canoescov/Dockerfile | 18 ++++++ Docker/cnv-opt-codexcov/Dockerfile | 14 +++++ R/CODEXCOV/CODEXCOV.Rproj | 16 ----- R/CODEXCOV/R/run_CODEXCOV.R | 84 +++++++++++++-------------- R/CODEXCOV/man/coverageObj1.Rd | 14 ----- R/CODEXCOV/man/gcmapp1.Rd | 14 ----- R/CODEXCOV/man/normObj1.Rd | 14 ----- R/CODEXCOV/man/normObj2.Rd | 14 ----- R/CODEXCOV/man/qcObj1.Rd | 15 ----- R/CODEXCOV/man/run_CODEXCOV.Rd | 14 ----- R/CODEXCOV/man/segment1.Rd | 14 ----- R/EXOMEDEPTHCOV/R/run_EXOMEDEPTHCOV.R | 1 - 12 files changed, 71 insertions(+), 161 deletions(-) create mode 100644 Docker/cnv-opt-canoescov/Dockerfile create mode 100644 Docker/cnv-opt-codexcov/Dockerfile delete mode 100755 R/CODEXCOV/CODEXCOV.Rproj delete mode 100644 R/CODEXCOV/man/coverageObj1.Rd delete mode 100644 R/CODEXCOV/man/gcmapp1.Rd delete mode 100644 R/CODEXCOV/man/normObj1.Rd delete mode 100644 R/CODEXCOV/man/normObj2.Rd delete mode 100644 R/CODEXCOV/man/qcObj1.Rd delete mode 100644 R/CODEXCOV/man/run_CODEXCOV.Rd delete mode 100644 R/CODEXCOV/man/segment1.Rd diff --git a/Docker/cnv-opt-canoescov/Dockerfile b/Docker/cnv-opt-canoescov/Dockerfile new file mode 100644 index 0000000..b404cab --- /dev/null +++ b/Docker/cnv-opt-canoescov/Dockerfile @@ -0,0 +1,18 @@ +FROM ubuntu:xenial +MAINTAINER biodatageeks + +RUN apt-get update +RUN apt-get install -y software-properties-common +RUN apt-key adv --keyserver keyserver.ubuntu.com --recv-keys E298A3A825C0D65DFD57CBB651716619E084DAB9 +RUN add-apt-repository 'deb [arch=amd64,i386] https://cran.rstudio.com/bin/linux/ubuntu xenial/' +RUN apt-get install -y apt-transport-https + +RUN apt-get update && \ + apt-get upgrade -y && \ + apt-get install -y r-base libssl-dev libssh2-1-dev libxml2-dev libcurl4-openssl-dev libpq-dev + +RUN Rscript -e "install.packages('nnls', repos = 'http://cran.us.r-project.org')" +RUN Rscript -e "install.packages('Hmisc', repos = 'http://cran.us.r-project.org')" +RUN Rscript -e "install.packages('mgcv', repos = 'http://cran.us.r-project.org')" +RUN Rscript -e "install.packages('plyr', repos = 'http://cran.us.r-project.org')" +RUN Rscript -e "install.packages('CANOES', repos = 'http://zsibio.ii.pw.edu.pl/nexus/repository/r-all')" diff --git a/Docker/cnv-opt-codexcov/Dockerfile b/Docker/cnv-opt-codexcov/Dockerfile new file mode 100644 index 0000000..6f56eb5 --- /dev/null +++ b/Docker/cnv-opt-codexcov/Dockerfile @@ -0,0 +1,14 @@ +FROM ubuntu:xenial +MAINTAINER biodatageeks + +RUN apt-get update +RUN apt-get install -y software-properties-common +RUN apt-key adv --keyserver keyserver.ubuntu.com --recv-keys E298A3A825C0D65DFD57CBB651716619E084DAB9 +RUN add-apt-repository 'deb [arch=amd64,i386] https://cran.rstudio.com/bin/linux/ubuntu xenial/' +RUN apt-get install -y apt-transport-https + +RUN apt-get update && \ + apt-get upgrade -y && \ + apt-get install -y r-base libssl-dev libssh2-1-dev libxml2-dev libcurl4-openssl-dev libpq-dev + +RUN Rscript -e "source('https://bioconductor.org/biocLite.R');biocLite('CODEX')" diff --git a/R/CODEXCOV/CODEXCOV.Rproj b/R/CODEXCOV/CODEXCOV.Rproj deleted file mode 100755 index d848a9f..0000000 --- a/R/CODEXCOV/CODEXCOV.Rproj +++ /dev/null @@ -1,16 +0,0 @@ -Version: 1.0 - -RestoreWorkspace: No -SaveWorkspace: No -AlwaysSaveHistory: Default - -EnableCodeIndexing: Yes -Encoding: UTF-8 - -AutoAppendNewline: Yes -StripTrailingWhitespace: Yes - -BuildType: Package -PackageUseDevtools: Yes -PackageInstallArgs: --no-multiarch --with-keep.source -PackageRoxygenize: rd,collate,namespace diff --git a/R/CODEXCOV/R/run_CODEXCOV.R b/R/CODEXCOV/R/run_CODEXCOV.R index 78929de..246aa22 100644 --- a/R/CODEXCOV/R/run_CODEXCOV.R +++ b/R/CODEXCOV/R/run_CODEXCOV.R @@ -9,10 +9,13 @@ run_CODEXCOV <- function(K_from, K_to, lmax, - reference_set_select_method, - num_of_samples_in_reference_set, - cov_table){ - + input_cov_table, + reference_sample_set_file, + output_calls_file){ + + con <- file(reference_sample_set_file, open='r') + reference_sample_set <- readLines(con) + cov_table <- read.csv(input_cov_table) sampname <- unique(cov_table[,"sample_name"]) sampname <- as.character(sampname) targets <- cov_table[,c("target_id", "chr", "pos_min", "pos_max")] @@ -28,6 +31,7 @@ run_CODEXCOV <- function(K_from, if (length(ref) == 0) { # 0 elements for specified chromosome in bed next() } + ################################################### ### code chunk number 4: coverageObj1 ################################################### @@ -39,47 +43,37 @@ run_CODEXCOV <- function(K_from, gcmapp1_result <- gcmapp1(chr, ref) gc <- gcmapp1_result$gc - ################################################### - ### code chunk number 7: normObj1 - ################################################### - normObj_result <- normObj1(Y, gc, K = K_from:K_to) - Yhat <- normObj_result$Yhat - AIC <- normObj_result$AIC - BIC <- normObj_result$BIC - RSS <- normObj_result$RSS - K <- normObj_result$K - - ################################################### - ### code chunk number 8: normObj2 (eval = FALSE) - ################################################### - ## normObj_result <- normObj2(Y, gc, K = 1:9, normal_index=seq(1,45,2)) - ## Yhat <- normObj_result$Yhat - ## AIC <- normObj_result$AIC - ## BIC <- normObj_result$BIC - ## RSS <- normObj_result$RSS - ## K <- normObj_result$K - - ################################################### - ### code chunk number 9: choiceofK (eval = FALSE) - ################################################### - #choiceofK(AIC, BIC, RSS, K, filename = paste("choiceofK_", chr, ".pdf", sep = "")) - - ################################################### - ### code chunk number 10: fig1 - ################################################### - #plot(K, RSS, type = "b", xlab = "Number of latent variables") - #plot(K, AIC, type = "b", xlab = "Number of latent variables") - #plot(K, BIC, type = "b", xlab = "Number of latent variables") - - ################################################### - ### code chunk number 11: segment1 - ################################################### - finalcallIt <- segment1(Y, Yhat, K[which.max(BIC)], K, sampname, - ref, chr, lmax, mode = "integer")$finalcall - if (nrow(finalcall)==0){finalcall <- matrix(nrow=0, ncol=ncol(finalcallIt))} - finalcall <- rbind(finalcall, finalcallIt) - + for (i in 1:length(reference_sample_set)) { + if (reference_sample_set[[i]] == '') { + next() + } + samples <- unlist(strsplit(reference_sample_set[[i]], ',')) + actual_sample <- samples[1] + reference_samples <- samples[-1] + samples <- samples[order(samples[,1]),] + Y_subset <- Y[,samples] + + ################################################### + ### code chunk number 7: normObj1 + ################################################### + normObj_result <- normObj1(Y_subset, gc, K = K_from:K_to) + Yhat <- normObj_result$Yhat + AIC <- normObj_result$AIC + BIC <- normObj_result$BIC + RSS <- normObj_result$RSS + K <- normObj_result$K + + ################################################### + ### code chunk number 11: segment1 + ################################################### + finalcallIt <- segment1(Y_subset, Yhat, K[which.max(BIC)], K, samples, + ref, chr, lmax, mode = "integer")$finalcall + finalcallIt <- finalcallIt[finalcallIt[,"sample_name"] == actual_sample,] + if (nrow(finalcall)==0){finalcall <- matrix(nrow=0, ncol=ncol(finalcallIt))} + finalcall <- rbind(finalcall, finalcallIt) + print(finalcall) + } } finalcall <- unify_calls_format(finalcall)$finalcall - finalcall + write.csv(finalcall, output_calls_file, row.names=F) } diff --git a/R/CODEXCOV/man/coverageObj1.Rd b/R/CODEXCOV/man/coverageObj1.Rd deleted file mode 100644 index f6c6b1c..0000000 --- a/R/CODEXCOV/man/coverageObj1.Rd +++ /dev/null @@ -1,14 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/functions_CODEXCOV.R -\name{coverageObj1} -\alias{coverageObj1} -\title{Function Dexcription} -\usage{ -coverageObj1(cov_file, sampname) -} -\description{ -Function description. -} -\examples{ -coverageObj1 -} diff --git a/R/CODEXCOV/man/gcmapp1.Rd b/R/CODEXCOV/man/gcmapp1.Rd deleted file mode 100644 index 2fa53f1..0000000 --- a/R/CODEXCOV/man/gcmapp1.Rd +++ /dev/null @@ -1,14 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/functions_CODEXCOV.R -\name{gcmapp1} -\alias{gcmapp1} -\title{Function Dexcription} -\usage{ -gcmapp1(chr, ref) -} -\description{ -Function description. -} -\examples{ -coverageObj1 -} diff --git a/R/CODEXCOV/man/normObj1.Rd b/R/CODEXCOV/man/normObj1.Rd deleted file mode 100644 index 66b0a96..0000000 --- a/R/CODEXCOV/man/normObj1.Rd +++ /dev/null @@ -1,14 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/functions_CODEXCOV.R -\name{normObj1} -\alias{normObj1} -\title{Function Dexcription} -\usage{ -normObj1(Y_qc, gc_qc, K) -} -\description{ -Function description. -} -\examples{ -coverageObj1 -} diff --git a/R/CODEXCOV/man/normObj2.Rd b/R/CODEXCOV/man/normObj2.Rd deleted file mode 100644 index 4a10d47..0000000 --- a/R/CODEXCOV/man/normObj2.Rd +++ /dev/null @@ -1,14 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/functions_CODEXCOV.R -\name{normObj2} -\alias{normObj2} -\title{Function Dexcription} -\usage{ -normObj2(Y_qc, gc_qc, K, normal_index) -} -\description{ -Function description. -} -\examples{ -coverageObj1 -} diff --git a/R/CODEXCOV/man/qcObj1.Rd b/R/CODEXCOV/man/qcObj1.Rd deleted file mode 100644 index 806a098..0000000 --- a/R/CODEXCOV/man/qcObj1.Rd +++ /dev/null @@ -1,15 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/functions_CODEXCOV.R -\name{qcObj1} -\alias{qcObj1} -\title{Function Dexcription} -\usage{ -qcObj1(Y, sampname, chr, ref, mapp, gc, cov_thresh, length_thresh, mapp_thresh, - gc_thresh) -} -\description{ -Function description. -} -\examples{ -coverageObj1 -} diff --git a/R/CODEXCOV/man/run_CODEXCOV.Rd b/R/CODEXCOV/man/run_CODEXCOV.Rd deleted file mode 100644 index f80759e..0000000 --- a/R/CODEXCOV/man/run_CODEXCOV.Rd +++ /dev/null @@ -1,14 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/run_CODEXCOV.R -\name{run_CODEXCOV} -\alias{run_CODEXCOV} -\title{Function Dexcription} -\usage{ -run_CODEXCOV(cov_file, sampname) -} -\description{ -Function description. -} -\examples{ -run_codexcov -} diff --git a/R/CODEXCOV/man/segment1.Rd b/R/CODEXCOV/man/segment1.Rd deleted file mode 100644 index c7cf654..0000000 --- a/R/CODEXCOV/man/segment1.Rd +++ /dev/null @@ -1,14 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/functions_CODEXCOV.R -\name{segment1} -\alias{segment1} -\title{Function Dexcription} -\usage{ -segment1(Y_qc, Yhat, optK, K, sampname_qc, ref_qc, chr, lmax, mode) -} -\description{ -Function description. -} -\examples{ -coverageObj1 -} diff --git a/R/EXOMEDEPTHCOV/R/run_EXOMEDEPTHCOV.R b/R/EXOMEDEPTHCOV/R/run_EXOMEDEPTHCOV.R index f187712..a8e8e3c 100644 --- a/R/EXOMEDEPTHCOV/R/run_EXOMEDEPTHCOV.R +++ b/R/EXOMEDEPTHCOV/R/run_EXOMEDEPTHCOV.R @@ -28,7 +28,6 @@ run_EXOMEDEPTHCOV <- function(input_cov_table, next() } samples <- unlist(strsplit(reference_sample_set[[i]], ',')) - print(samples) actual_sample <- samples[1] reference_samples <- samples[-1] From 626f23abdd385804cfb8d3a58f7d06bef9582cd2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Wiktor=20Ku=C5=9Bmirek?= Date: Mon, 26 Mar 2018 16:50:41 +0200 Subject: [PATCH 026/114] finished dag for ExomeDepth, draft od dag for CANOES --- airflow/dags/canoes.py | 52 ++++++++++++++++++++++++++++++++++ airflow/dags/exomedepth.py | 57 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 109 insertions(+) create mode 100755 airflow/dags/canoes.py create mode 100755 airflow/dags/exomedepth.py diff --git a/airflow/dags/canoes.py b/airflow/dags/canoes.py new file mode 100755 index 0000000..7c74842 --- /dev/null +++ b/airflow/dags/canoes.py @@ -0,0 +1,52 @@ +from airflow import DAG +from airflow.operators.bash_operator import BashOperator +from airflow.models import Variable +from datetime import datetime, timedelta + +default_args = { + 'owner': 'biodatageeks', + 'depends_on_past': False, + 'start_date': datetime(2017, 10, 18), + 'email': ['team@biodatageeks.ii.pw.edu.pl'], + 'email_on_failure': False, + 'email_on_retry': False, + 'retries': 0 +} + +dag = DAG( + 'canoes', default_args=default_args, schedule_interval=None) + +############################################## +########## RUN RAW CANOES CNV CALLER ########## +############################################## + +### target qc parameters +mapp_thresh = '0.9' +cov_thresh_from = '20' +cov_thresh_to = '4000' +length_thresh_from = '20' +length_thresh_to = '2000' +gc_thresh_from = '20' +gc_thresh_to = '80' +raw_cov_table = 'input_cov_table.csv' +qc_cov_table = 'output_cov_table.csv' + +### select reference sample set parameters +select_method = 'exomedepth' # "canoes", "codex" or "exomedepth" +num_refs = '30' +reference_sample_set_file = 'reference_sample_set.csv' + +run_canoes_caller_cmd= " \ +docker pull biodatageeks/cnv-opt-target-qc; \ +docker run --rm -v /tmp:/tmp -w=\"/tmp\" biodatageeks/cnv-opt-target-qc Rscript -e \"library(\'TARGET.QC\');run_TARGET.QC(" + mapp_thresh + "," + cov_thresh_from + "," + cov_thresh_to + "," + length_thresh_from + "," + length_thresh_to + "," + gc_thresh_from + "," + gc_thresh_to + ",'" + raw_cov_table + "','" + qc_cov_table + "')\"; \ +docker pull biodatageeks/cnv-opt-reference-sample-set-selector; \ +docker run --rm -v /tmp:/tmp -w=\"/tmp\" biodatageeks/cnv-opt-reference-sample-set-selector Rscript -e \"library(\'REFERENCE.SAMPLE.SET.SELECTOR\');run_REFERENCE.SAMPLE.SET.SELECTOR('" + select_method + "'," + num_refs + ",'" + qc_cov_table + "','" + reference_sample_set_file + "')\"; \ +" + +run_canoes_caller_task= BashOperator ( + bash_command=run_canoes_caller_cmd, + task_id='run_canoes_caller_task', + dag=dag +) + +run_canoes_caller_task diff --git a/airflow/dags/exomedepth.py b/airflow/dags/exomedepth.py new file mode 100755 index 0000000..e8629be --- /dev/null +++ b/airflow/dags/exomedepth.py @@ -0,0 +1,57 @@ +from airflow import DAG +from airflow.operators.bash_operator import BashOperator +from airflow.models import Variable +from datetime import datetime, timedelta + +default_args = { + 'owner': 'biodatageeks', + 'depends_on_past': False, + 'start_date': datetime(2017, 10, 18), + 'email': ['team@biodatageeks.ii.pw.edu.pl'], + 'email_on_failure': False, + 'email_on_retry': False, + 'retries': 0 +} + +dag = DAG( + 'exomedepth', default_args=default_args, schedule_interval=None) + +################################################### +########## RUN RAW EXOMEDEPTH CNV CALLER ########## +################################################### + +### target qc parameters +mapp_thresh = '0.9' +cov_thresh_from = '20' +cov_thresh_to = '4000' +length_thresh_from = '20' +length_thresh_to = '2000' +gc_thresh_from = '20' +gc_thresh_to = '80' +raw_cov_table = 'input_cov_table.csv' +qc_cov_table = 'output_cov_table.csv' + +### select reference sample set parameters +select_method = 'exomedepth' # "canoes", "codex" or "exomedepth" +num_refs = '30' +reference_sample_set_file = 'reference_sample_set.csv' + +### exomedepth parameters +output_calls_file = 'calls.csv' + +run_exomedepth_caller_cmd= " \ +docker pull biodatageeks/cnv-opt-target-qc; \ +docker run --rm -v /tmp:/tmp -w=\"/tmp\" biodatageeks/cnv-opt-target-qc Rscript -e \"library(\'TARGET.QC\');run_TARGET.QC(" + mapp_thresh + "," + cov_thresh_from + "," + cov_thresh_to + "," + length_thresh_from + "," + length_thresh_to + "," + gc_thresh_from + "," + gc_thresh_to + ",'" + raw_cov_table + "','" + qc_cov_table + "')\"; \ +docker pull biodatageeks/cnv-opt-reference-sample-set-selector; \ +docker run --rm -v /tmp:/tmp -w=\"/tmp\" biodatageeks/cnv-opt-reference-sample-set-selector Rscript -e \"library(\'REFERENCE.SAMPLE.SET.SELECTOR\');run_REFERENCE.SAMPLE.SET.SELECTOR('" + select_method + "'," + num_refs + ",'" + qc_cov_table + "','" + reference_sample_set_file + "')\"; \ +docker pull biodatageeks/cnv-opt-exomedepthcov; \ +docker run --rm -v /tmp:/tmp -w=\"/tmp\" biodatageeks/cnv-opt-exomedepthcov Rscript -e \"library(\'EXOMEDEPTHCOV\');run_EXOMEDEPTHCOV('" + qc_cov_table + "','" + reference_sample_set_file + "','" + output_calls_file + "')\"; \ +" + +run_exomedepth_caller_task= BashOperator ( + bash_command=run_exomedepth_caller_cmd, + task_id='run_exomedepth_caller_task', + dag=dag +) + +run_exomedepth_caller_task From 7a7beef8b0bd82de81413bad1aeb84882b7888cb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Wiktor=20Ku=C5=9Bmirek?= Date: Mon, 26 Mar 2018 16:51:44 +0200 Subject: [PATCH 027/114] draft of CANOESCOV with new interface --- Docker/cnv-opt-canoescov/Dockerfile | 18 +- Docker/cnv-opt-codexcov/Dockerfile | 14 +- R/CANOESCOV/DESCRIPTION | 3 +- R/CANOESCOV/R/functions_CANOESCOV.R | 487 +--------------------------- R/CANOESCOV/R/run_CANOESCOV.R | 27 +- R/CODEXCOV/DESCRIPTION | 3 +- 6 files changed, 30 insertions(+), 522 deletions(-) diff --git a/Docker/cnv-opt-canoescov/Dockerfile b/Docker/cnv-opt-canoescov/Dockerfile index b404cab..44080ff 100644 --- a/Docker/cnv-opt-canoescov/Dockerfile +++ b/Docker/cnv-opt-canoescov/Dockerfile @@ -1,18 +1,4 @@ -FROM ubuntu:xenial +FROM biodatageeks/cnv-opt-canoes MAINTAINER biodatageeks -RUN apt-get update -RUN apt-get install -y software-properties-common -RUN apt-key adv --keyserver keyserver.ubuntu.com --recv-keys E298A3A825C0D65DFD57CBB651716619E084DAB9 -RUN add-apt-repository 'deb [arch=amd64,i386] https://cran.rstudio.com/bin/linux/ubuntu xenial/' -RUN apt-get install -y apt-transport-https - -RUN apt-get update && \ - apt-get upgrade -y && \ - apt-get install -y r-base libssl-dev libssh2-1-dev libxml2-dev libcurl4-openssl-dev libpq-dev - -RUN Rscript -e "install.packages('nnls', repos = 'http://cran.us.r-project.org')" -RUN Rscript -e "install.packages('Hmisc', repos = 'http://cran.us.r-project.org')" -RUN Rscript -e "install.packages('mgcv', repos = 'http://cran.us.r-project.org')" -RUN Rscript -e "install.packages('plyr', repos = 'http://cran.us.r-project.org')" -RUN Rscript -e "install.packages('CANOES', repos = 'http://zsibio.ii.pw.edu.pl/nexus/repository/r-all')" +RUN Rscript -e "install.packages('CANOESCOV', repos = 'http://zsibio.ii.pw.edu.pl/nexus/repository/r-all')" diff --git a/Docker/cnv-opt-codexcov/Dockerfile b/Docker/cnv-opt-codexcov/Dockerfile index 6f56eb5..258bbf8 100644 --- a/Docker/cnv-opt-codexcov/Dockerfile +++ b/Docker/cnv-opt-codexcov/Dockerfile @@ -1,14 +1,4 @@ -FROM ubuntu:xenial +FROM biodatageeks/cnv-opt-codex MAINTAINER biodatageeks -RUN apt-get update -RUN apt-get install -y software-properties-common -RUN apt-key adv --keyserver keyserver.ubuntu.com --recv-keys E298A3A825C0D65DFD57CBB651716619E084DAB9 -RUN add-apt-repository 'deb [arch=amd64,i386] https://cran.rstudio.com/bin/linux/ubuntu xenial/' -RUN apt-get install -y apt-transport-https - -RUN apt-get update && \ - apt-get upgrade -y && \ - apt-get install -y r-base libssl-dev libssh2-1-dev libxml2-dev libcurl4-openssl-dev libpq-dev - -RUN Rscript -e "source('https://bioconductor.org/biocLite.R');biocLite('CODEX')" +RUN Rscript -e "install.packages('CODEXCOV', repos = 'http://zsibio.ii.pw.edu.pl/nexus/repository/r-all')" diff --git a/R/CANOESCOV/DESCRIPTION b/R/CANOESCOV/DESCRIPTION index 314f419..0fa3115 100644 --- a/R/CANOESCOV/DESCRIPTION +++ b/R/CANOESCOV/DESCRIPTION @@ -18,8 +18,7 @@ Depends: plyr (>= 1.8.4), nnls (>= 1.4.0), Hmisc (>= 4.0.0), - mgcv (>= 1.8.0), - REFERENCE.SAMPLE.SET.SELECTOR (>= 0.0.1) + mgcv (>= 1.8.0) License: GPL-3 Encoding: UTF-8 LazyData: true diff --git a/R/CANOESCOV/R/functions_CANOESCOV.R b/R/CANOESCOV/R/functions_CANOESCOV.R index 873e894..b55c138 100644 --- a/R/CANOESCOV/R/functions_CANOESCOV.R +++ b/R/CANOESCOV/R/functions_CANOESCOV.R @@ -11,13 +11,6 @@ coverageObj1 <- function(cov_table, sampname, targets_for_chr, chr){ return(list(Y=Y)) } -# Constants -NUM.ABNORMAL.STATES=2 -NUM.STATES=3 -DELETION=1 -NORMAL=2 -DUPLICATION=3 - # CallCNVs # Calls CNVs in sample of interest # Arguments: @@ -53,7 +46,7 @@ DUPLICATION=3 # TARGETS: target numbers of CNV in the form start..stop # NUM_TARG: how many targets are in the CNV # Q_SOME: a Phred-scaled quality score for the CNV -CallCNVs <- function(sample.name, counts, p=1e-08, Tnum=6, D=70000, reference_set_select_method='canoes', num_of_samples_in_reference_set=30, get.dfs=F, homdel.mean=0.2, target_length){ +CallCNVs <- function(sample.name, reference.samples, counts, p=1e-08, Tnum=6, D=70000, get.dfs=F, homdel.mean=0.2){ if (!sample.name %in% names(counts)){stop("No column for sample ", sample.name, " in counts matrix")} if (length(setdiff(names(counts)[1:5], c("target", "chromosome", "start", "end", "gc"))) > 0){ stop("First five columns of counts matrix must be target, chromosome, start, end, gc") @@ -97,12 +90,12 @@ CallCNVs <- function(sample.name, counts, p=1e-08, Tnum=6, D=70000, reference_se #reference.samples <- names(sort(covariances, # decreasing=T)[1:min(numrefs, length(covariances))]) Y <- data.matrix(counts[,6:ncol(counts)]) - library('REFERENCE.SAMPLE.SET.SELECTOR') - reference.samples <- run_REFERENCE.SAMPLE.SET.SELECTOR(sample.name, - Y, - reference_set_select_method, - num_of_samples_in_reference_set, - target_length) + #library('REFERENCE.SAMPLE.SET.SELECTOR') + #reference.samples <- run_REFERENCE.SAMPLE.SET.SELECTOR(sample.name, + # Y, + # reference_set_select_method, + # num_of_samples_in_reference_set, + # target_length) sample.mean.counts <- mean(counts[, sample.name]) sample.sumcounts <- apply(counts[, reference.samples], 2, sum) # normalize reference samples to sample of interest @@ -168,469 +161,3 @@ CallCNVs <- function(sample.name, counts, p=1e-08, Tnum=6, D=70000, reference_se } return(cnvs) } - -# GenotypeCNVs -# Genotype CNVs in sample of interest -# Arguments: -# xcnv -# data frame with the following columns, and one row for each -# CNV to genotype -# INTERVAL: CNV coordinates in the form chr:start-stop -# TARGETS: target numbers of CNV in the form start..stop -# these should correspond to the target numbers in counts -# sample.name: -# sample to genotype CNVs in (should correspond to a column in counts) -# counts: -# count matrix, first five columns should be -# target: consecutive numbers for targets (integer) -# chromosome: chromosome number (integer-valued) -# (support for sex chromosomes to come) -# start: start position of probe (integer) -# end: end position of probe (integer) -# gc: gc content (real between 0 and 1) -# subsequent columns should include counts for each probe for samples -# p: -# average rate of occurrence of CNVs (real) default is 1e-08 -# D: -# expected distance between targets in a CNV (integer) default is 70,000 -# Tnum: -# expected number of targets in a CNV (integer) default is 6 -# numrefs -# maximum number of reference samples to use (integer) default is 30 -# the weighted variance calculations will take a long time if too -# many reference samples are used -# emission.probs and distances are for internal use only -# Returns: -# data frame with the following columns and one row for each genotyped CNV: -# INTERVAL: CNV coordinates in the form chr:start-stop -# NQDEL: a Phred-scaled quality score that sample.name has no deletion -# in the interval -# SQDEL: a Phred-scaled quality score that sample.name has a deletion -# in the interval -# NQDUP and SQDUP: same, but for a duplication -GenotypeCNVs <- function(xcnvs, sample.name, counts, p=1e-08, Tnum=6, - D=70000, numrefs=30, - emission.probs=NULL, - distances=NULL){ - if (!sample.name %in% names(counts)){stop("No column for sample ", sample.name, " in counts matrix")} - if (length(setdiff(names(counts)[1:5], c("target", "chromosome", "start", "end", "gc"))) > 0){ - stop("First five columns of counts matrix must be target, chromosome, start, end, gc") - } - if (length(setdiff(unique(counts$chromosome), seq(1:22))) > 0) { - # remove sex chromosomes - cat("Trying to remove sex chromosomes and 'chr' prefixes\n") - counts <- subset(counts, !chromosome %in% c("chrX", "chrY", "X", "Y")) - if (sum(grepl("chr", counts$chromosome))==length(counts$chromosome)){ - counts$chromosome <- gsub("chr", "", counts$chromosome) - } - counts$chromosome <- as.numeric(counts$chromosome) - if (length(setdiff(unique(counts$chromosome), seq(1:22))) > 0) - stop("chromosome must take value in range 1-22 (support for sex chromosomes to come)") - } - library(plyr) - counts <- arrange(counts, chromosome, start) - if (p <= 0){ - stop("parameter p must be positive") - } - if (Tnum <= 0){ - stop("parameter Tnum must be positive") - } - if (D <= 0){ - stop("parameter D must be positive") - } - if (numrefs <= 0){ - stop("parameter numrefs must be positive") - } - num.cnvs <- nrow(xcnvs) - cnv.intervals <- as.character(xcnvs$INTERVAL) - # if no emission probs matrix is passed in, generate a new one - if (is.null(emission.probs)){ - l <- CANOESCOV::CallCNVs(sample.name, counts, p, Tnum=6, D=70000, numrefs=30, get.dfs=T) - emission.probs <- l[['emission.probs']] - distances <- l[['distances']] - } - forward.m <- GetForwardMatrix(emission.probs, distances, p, Tnum, D) - backward.m <- GetBackwardMatrix(emission.probs, distances, p, Tnum, D) - qualities <- matrix(0, nrow=num.cnvs, ncol=5, - dimnames=list(cnv.intervals, - c("INTERVAL", "NQDel", "SQDel", "NQDup", "SQDup"))) - for (i in 1:num.cnvs){ - interval <- as.character(xcnvs[i, "INTERVAL"]) - targets <- as.numeric(strsplit(as.character(xcnvs[i, "TARGETS"]), ".", fixed=T)[[1]][c(1,3)]) - left.target <- targets[1] - right.target <- targets[2] - likelihoods <- GetModifiedLikelihood(forward.m, backward.m, - emission.probs, distances, - left.target, right.target, - c(DUPLICATION, DELETION), p, Tnum, D) - modified.likelihood <- likelihoods[1]; - unmodified.likelihood <- likelihoods[2] - Prob.All.Normal <- exp(modified.likelihood - unmodified.likelihood) - likelihoods <- GetModifiedLikelihood(forward.m, backward.m, - emission.probs, distances, - left.target, right.target, DELETION, p, Tnum, D) - modified.likelihood <- likelihoods[1]; - unmodified.likelihood <- likelihoods[2] - Prob.No.Deletion <- exp(modified.likelihood - unmodified.likelihood) - likelihoods <- GetModifiedLikelihood(forward.m, backward.m, - emission.probs, distances, - left.target, right.target, DUPLICATION, p, Tnum, D) - modified.likelihood <- likelihoods[1]; - unmodified.likelihood <- likelihoods[2] - Prob.No.Duplication <- exp(modified.likelihood - unmodified.likelihood) - # Check if probabilities greater than 1 are numerical error or bug - Phred <- function(prob){ - return(round(min(99, -10 * log10(1 - prob)))) - } - qualities[i, "NQDel"] <- Phred(Prob.No.Deletion) - qualities[i, "SQDel"] <- Phred(Prob.No.Duplication - Prob.All.Normal) - qualities[i, "NQDup"] <- Phred(Prob.No.Duplication) - qualities[i, "SQDup"] <- Phred(Prob.No.Deletion - Prob.All.Normal) - qualities[i, "INTERVAL"] <- interval - } - qualities <- as.data.frame(qualities, stringsAsFactors=F) - qualities$NQDel <- as.integer(qualities$NQDel) - qualities$NQDup <- as.integer(qualities$NQDup) - qualities$SQDel <- as.integer(qualities$SQDel) - qualities$SQDup <- as.integer(qualities$SQDup) - return(qualities) -} - -# returns data frame with distance to each target from the previous target -# (0 in the case of the first target on chromosome 1, a very big number -# for the first target on each other chromosome--this resets the HMM -# for each chromosome) -GetDistances <- function(counts){ - chromosome <- counts[, "chromosome"] - startbase <- counts[, "start"] - num.nonzero.exons <- length(startbase) - distances <- c(0, startbase[2:num.nonzero.exons] - - startbase[1:(num.nonzero.exons - 1)] + - 1000000000000 * (chromosome[2:num.nonzero.exons] - - chromosome[1:(num.nonzero.exons - 1)])) - return(data.frame(target=counts[, "target"], distance=distances)) -} - -EstimateVariance <- function(counts, ref.sample.names, sample.weights){ - library(Hmisc) - counts$var <- apply(counts[, ref.sample.names], 1, wtd.var, sample.weights, normwt=T) - set.seed(1) - counts.subset <- counts[sample(nrow(counts), min(36000, nrow(counts))), ] - library(mgcv) - # can't do gamma regression with negative - counts.subset$var[counts.subset$var==0] <- 0.1 - fit <- gam(var ~ s(mean) + s(gc), family=Gamma(link=log), data=counts.subset) - # we don't want variance less than Poisson - # we take maximum of genome-wide estimate, method of moments estimate - # and Poisson variance - v.estimate <- pmax(predict(fit, counts, type="response"), counts$var, - counts$mean * 1.01) - return(data.frame(target=counts$target, var.estimate=v.estimate)) -} - -EmissionProbs <- function(test.counts, target.means, - var.estimate, targets){ - num.targets <- length(test.counts) - # calculate the means for the deletion, normal and duplication states - state.target.means <- t(apply(data.frame(x=target.means), 1, function(x) c(x*1/2, x, x*3/2))) - # calculate the expected size (given the predicted variance) - size <- target.means ^ 2 / (var.estimate - target.means) - emission.probs <- matrix(NA, num.targets, 4) - colnames(emission.probs) <- c("target", "delprob", "normalprob", "dupprob") - # calculate the emission probabilities given the read count - size.del <- size - size.dup <- size - size.del <- size / 2 - size.dup <- size * 3 / 2 - emission.probs[, "delprob"] <- dnbinom( - test.counts, - mu=state.target.means[, 1], - size=size.del, log=T) - emission.probs[, "normalprob"] <- dnbinom( - test.counts, - mu=state.target.means[, 2], - size=size, log=T) - emission.probs[, "dupprob"] <- dnbinom( - test.counts, - mu=state.target.means[, 3], - size=size.dup, log=T) - emission.probs[, "target"] <- targets - # some values may be infinite as a result of extreme read count - row.all.inf <- which(apply(emission.probs, 1, function(x){all(is.infinite(x))})) - if (length(row.all.inf) > 0){ - for (i in row.all.inf){ - if (test.counts[i] >= state.target.means[i, 3]){ - emission.probs[i, 2:4] <- c(-Inf, -Inf, -0.01) - } - else if (test.counts[i] <= state.target.means[i, 1]){ - emission.probs[i, 2:4] <- c(-0.01, -Inf, -Inf) - } - else emission.probs[i, 2:4] <- c(-Inf, -0.01, -Inf) - } - } - return(emission.probs) -} - -# Viterbi algorithm -Viterbi <- function(emission.probs.matrix, distances, p, Tnum, D){ - targets <- emission.probs.matrix[, 1] - emission.probs.matrix <- as.matrix(emission.probs.matrix[, 2:4]) - num.exons <- dim(emission.probs.matrix)[1] - viterbi.matrix <- matrix(NA, nrow=num.exons, ncol=NUM.STATES) - viterbi.pointers <- matrix(NA, nrow=num.exons, ncol=NUM.STATES) - initial.state <- log(c(0.0075 / NUM.ABNORMAL.STATES, 1 - 0.0075, 0.0075 / NUM.ABNORMAL.STATES)) - viterbi.matrix[1, ] <- initial.state + emission.probs.matrix[1,] - for (i in 2:num.exons) { - temp.matrix <- viterbi.matrix[i - 1, ] + GetTransitionMatrix(distances$distance[i], p, Tnum, D) - viterbi.matrix[i, ] <- apply(temp.matrix, 2, max) - emission.probs <- c(emission.probs.matrix[i,]) - dim(emission.probs) <- c(NUM.STATES, 1) - viterbi.matrix[i, ] <- viterbi.matrix[i, ] + emission.probs - viterbi.pointers[i, ] <- apply(temp.matrix, 2, which.max) - } - viterbi.states = vector(length = num.exons) - viterbi.states[num.exons] = which.max(viterbi.matrix[num.exons, ]) - for (i in (num.exons - 1):1) { - viterbi.states[i] <- viterbi.pointers[i + 1, viterbi.states[i + 1]] - } - return(data.frame(target=targets, viterbi.state=viterbi.states)) -} - -# returns a transition matrix -# to state -# deletion normal duplication -# deletion -#from state normal -# duplication -GetTransitionMatrix <- function(distance, p, Tnum, D){ - q <- 1 / Tnum - f = exp(-distance/D) - prob.abnormal.abnormal <- f * (1 - q) + (1 - f) * p - prob.abnormal.normal <- f * q + (1 - f) * (1 - 2 * p) - prob.abnormal.diff.abnormal <- (1 - f) * p - prob.normal.normal <- 1 - 2 * p - prob.normal.abnormal <- p - transition.probs <- - c(prob.abnormal.abnormal, prob.abnormal.normal, prob.abnormal.diff.abnormal, - prob.normal.abnormal, prob.normal.normal, prob.normal.abnormal, - prob.abnormal.diff.abnormal, prob.abnormal.normal, prob.abnormal.abnormal) - transition.m = log(matrix(transition.probs, NUM.STATES, NUM.STATES, byrow=TRUE)) - return(transition.m) -} - -# adds two log-space probabilities using the identity -# log (p1 + p2) = log p1 + log(1 + exp(log p2 - log p1)) -AddTwoProbabilities <- function(x, y){ - if (is.infinite(x)) return (y) - if (is.infinite(y)) return (x) - sum.probs <- max(x, y) + log1p(exp(-abs(x - y))) -} - -# adds multiple log-space probabilities -SumProbabilities <- function(x){ - sum.probs <- x[1] - for (i in 2:length(x)){ - sum.probs <- AddTwoProbabilities(sum.probs, x[i]) - } - return(sum.probs) -} - -# finds the data likelihood by summing the product of the corresponding -# forward and backward probabilities at any token (should give the same value -# regardless of the token) -GetLikelihood <- function(forward.matrix, backward.matrix, x){ - SumProbabilities(forward.matrix[x, ] + backward.matrix[x, ]) -} - -# get the forward probabilities -GetForwardMatrix <- function(emission.probs.matrix, distances, p, Tnum, D){ - emission.probs.matrix <- as.matrix(emission.probs.matrix[, 2:4]) - num.exons <- dim(emission.probs.matrix)[1] - forward.matrix <- matrix(NA, nrow=num.exons, ncol=NUM.STATES) # matrix to hold forward probabilities - initial.state <- log(c(0.0075 / NUM.ABNORMAL.STATES, 1 - 0.0075, 0.0075 / NUM.ABNORMAL.STATES)) - forward.matrix[1, ] <- initial.state + emission.probs.matrix[1, ] - for (i in 2:num.exons){ - # compute matrix with probability we were in state j and are now in state i - # in temp.matrix[j, i] (ignoring emission of current token) - temp.matrix <- forward.matrix[i - 1, ] + GetTransitionMatrix(distances$distance[i], p, Tnum, D) - # find the probability that we are in each of the three states - sum.probs <- apply(temp.matrix, 2, SumProbabilities) - forward.matrix[i, ] <- sum.probs + emission.probs.matrix[i, ] - } - return(forward.matrix) -} - -# get the backward probabilities -GetBackwardMatrix <- function(emission.probs.matrix, distances, - p, Tnum, D){ - emission.probs.matrix <- as.matrix(emission.probs.matrix[, 2:4]) - num.exons <- dim(emission.probs.matrix)[1] - backward.matrix <- matrix(NA, nrow=num.exons, ncol=NUM.STATES) # matrix to hold backward probabilities - initial.state <- log(c(0.0075 / NUM.ABNORMAL.STATES, 1 - 0.0075, 0.0075 / NUM.ABNORMAL.STATES)) - backward.matrix[num.exons, ] <- rep(0, NUM.STATES) - for (i in (num.exons - 1):1){ - temp.matrix <- GetTransitionMatrix(distances$distance[i+1], p, Tnum, D) + - matrix(backward.matrix[i + 1, ], 3, 3, byrow=T) + - matrix(emission.probs.matrix[i+1, ], 3, 3, byrow=T) - backward.matrix[i, ] <- apply(temp.matrix, 1, SumProbabilities) - } - final.prob <- backward.matrix[1, ] + emission.probs.matrix[1, ] + initial.state - return(backward.matrix) -} - -# find the likelihood of the data given that certain states are disallowed -# between start target and end target -GetModifiedLikelihood <- function(forward.matrix, backward.matrix, emission.probs.matrix, distances, - start.target, end.target, disallowed.states, p, Tnum, D){ - targets <- emission.probs.matrix[, 1] - emission.probs.matrix <- as.matrix(emission.probs.matrix[, 2:4]) - # there may be missing targets in this sample, we genotype the largest stretch of - # targets that lie in the CNV - left.target <- min(which(targets >= start.target)) - right.target <- max(which(targets <= end.target)) - num.exons <- dim(emission.probs.matrix)[1] - unmodified.likelihood <- GetLikelihood(forward.matrix, - backward.matrix, min(right.target + 1, num.exons)) - #right.target or left.target may be empty - - #if (right.target >= left.target) return(c(NA, unmodified.likelihood)) - stopifnot(right.target >= left.target) - modified.emission.probs.matrix <- emission.probs.matrix - modified.emission.probs.matrix[left.target:right.target, - disallowed.states] <- -Inf - - # if the start target is the first target we need to recalculate the - # forward probabilities - # for that target, using the modified emission probabilities - if (left.target == 1){ - initial.state <- log(c(0.0075 / NUM.ABNORMAL.STATES, 1 - 0.0075, 0.0075 / NUM.ABNORMAL.STATES)) - forward.matrix[1, ] <- initial.state + modified.emission.probs.matrix[1, ] - left.target <- left.target + 1 - } - for (i in seq(left.target, min(right.target + 1, num.exons))){ - # compute matrix with probability we were in state j and are now in state i - # in temp.matrix[j, i] (ignoring emission of current token) - temp.matrix <- forward.matrix[i - 1, ] + GetTransitionMatrix(distances$distance[i], p, Tnum, D) - # find the probability that we are in each of the three states - sum.probs <- apply(temp.matrix, 2, SumProbabilities) - if (!i == (right.target + 1)){ - forward.matrix[i, ] <- sum.probs + modified.emission.probs.matrix[i, ] - } else{ - forward.matrix[i, ] <- sum.probs + emission.probs.matrix[i, ] - } - } - # find the modified likelihood of the sequence - modified.likelihood <- GetLikelihood(forward.matrix, backward.matrix, min(right.target + 1, num.exons)) - return(c(modified.likelihood, unmodified.likelihood)) -} - -SummarizeCNVs <- function(cnv.targets, counts, sample.name, state){ - sample.name <- sample.name - cnv.type <- ifelse(state==3, "DUP", "DEL") - cnv.start <- min(cnv.targets$target) - cnv.end <- max(cnv.targets$target) - cnv.chromosome <- counts[cnv.start, "chromosome"] - cnv.start.base <- counts[cnv.start, "start"] - cnv.start.target <- counts[cnv.start, "target"] - cnv.end.base <- counts[cnv.end, "end"] - cnv.end.target <- counts[cnv.end, "target"] - cnv.kbs <- (cnv.end.base - cnv.start.base) / 1000 - cnv.midbp <- round((cnv.end.base - cnv.start.base) / 2) + cnv.start.base - cnv.targets <- paste(cnv.start.target, "..", cnv.end.target, sep="") - cnv.interval <- paste(cnv.chromosome, ":", cnv.start.base, "-", cnv.end.base, sep="") - num.targets <- cnv.end.target - cnv.start.target + 1 - return(data.frame(sample.name=sample.name, cnv.type=cnv.type, cnv.interval=cnv.interval, - cnv.kbs=cnv.kbs, cnv.chromosome=cnv.chromosome, - cnv.midbp=cnv.midbp, cnv.targets=cnv.targets, num.targets=num.targets)) -} - -PrintCNVs <- function(test.sample.name, viterbi.state, - nonzero.counts){ - consecutiveGroups <- function(sequence){ - num <- length(sequence) - group <- 1 - groups <- rep(0, num) - groups[1] <- group - if (num > 1){ - for (i in 2:num){ - if (!sequence[i] == (sequence[i - 1] + 1)) group <- group + 1 - groups[i] <- group - } - } - return(groups) - } - num.duplications <- 0 - num.deletions <- 0 - for (state in c(1, 3)){ - cnv.targets <- which(viterbi.state$viterbi.state == state) - if (!length(cnv.targets) == 0){ - groups <- consecutiveGroups(cnv.targets) - library(plyr) - cnvs.temp.df <- ddply(data.frame(target=cnv.targets, group=groups), - "group", SummarizeCNVs, nonzero.counts, test.sample.name, - state) - if (state == 1){ - deletions.df <- cnvs.temp.df - if (!is.null(dim(deletions.df))){ - num.deletions <- dim(deletions.df)[1] - } - } else { - duplications.df <- cnvs.temp.df - if (!is.null(dim(duplications.df))){ - num.duplications <- dim(duplications.df)[1] - } - } - } - } - num.calls <- num.deletions + num.duplications - cat(num.calls, "CNVs called in sample", test.sample.name, "\n") - if (num.deletions == 0 & num.duplications == 0){ - df <- data.frame(SAMPLE=character(0), CNV=character(0), INTERVAL=character(0), - KB=numeric(0), CHR=character(0), - MID_BP=numeric(), TARGETS=character(0), NUM_TARG=numeric(0), Q_SOME=numeric(0), MLCN=numeric(0)) - return(df) - } - if (num.deletions > 0 & num.duplications > 0){ - cnvs.df <- rbind(deletions.df, duplications.df) - } else { - ifelse(num.deletions > 0, - cnvs.df <- deletions.df, cnvs.df <- duplications.df) - } - xcnv <- cbind(cnvs.df[, c("sample.name", "cnv.type", "cnv.interval", - "cnv.kbs", "cnv.chromosome", "cnv.midbp", - "cnv.targets", "num.targets")], 0) - colnames(xcnv) <- c("SAMPLE", "CNV", "INTERVAL", "KB", "CHR", "MID_BP", "TARGETS", - "NUM_TARG", "MLCN") - xcnv$Q_SOME <- NA - return(xcnv) -} - -CalcCopyNumber <- function(data, cnvs, homdel.mean){ - for (i in 1:nrow(cnvs)){ - cnv <- cnvs[i, ] - targets <- as.numeric(unlist(strsplit(as.character(cnv$TARGETS), "..", fixed=T))) - cnv.data <- subset(data, target >= targets[1] & target <= targets[2]) - state.target.means <- t(apply(data.frame(x=cnv.data$countsmean), 1, - function(x) c(C1=x*1/2, C2=x, C3=x*3/2, - C4=x * 2, C5=x * 5/2, C6=x*6/2))) - # calculate the expected size (given the predicted variance) - size <- cnv.data$countsmean ^ 2 / (cnv.data$varestimate - cnv.data$countsmean) - emission.probs <- matrix(NA, nrow(cnv.data), 7) - colnames(emission.probs) <- c("C0", "C1", "C2", "C3", "C4", "C5", "C6") - #colnames(emission.probs) <- c("target", "delprob", "normalprob", "dupprob") - # calculate the emission probabilities given the read count - emission.probs[, 1] <- dpois(cnv.data$sample, homdel.mean, log=T) - for (s in 1:6){ - size.state <- size * s/2 - emission.probs[, s+1] <- dnbinom(cnv.data$sample, mu=state.target.means[, s], - size=size.state, log=T) - } - cs <- colSums(emission.probs) - ml.state <- which.max(cs) - 1 - if (ml.state==2){ - ml.state <- ifelse(cnv$CNV=="DEL", 1, 3) - } - cnvs$MLCN[i] <- ml.state - } - return(cnvs) -} diff --git a/R/CANOESCOV/R/run_CANOESCOV.R b/R/CANOESCOV/R/run_CANOESCOV.R index 980f6b1..f6742c7 100644 --- a/R/CANOESCOV/R/run_CANOESCOV.R +++ b/R/CANOESCOV/R/run_CANOESCOV.R @@ -1,10 +1,13 @@ library(methods) library(CODEX) -run_CANOESCOV <- function(reference_set_select_method, - num_of_samples_in_reference_set, - cov_table){ +run_CANOESCOV <- function(input_cov_table, + reference_sample_set_file, + output_calls_file){ + con <- file(reference_sample_set_file, open='r') + reference_sample_set <- readLines(con) + cov_table <- read.csv(input_cov_table) sampname <- unique(cov_table[,"sample_name"]) targets <- cov_table[,c("target_id", "chr", "pos_min", "pos_max")] targets <- targets[!duplicated(targets[,"target_id"]),] @@ -37,12 +40,16 @@ run_CANOESCOV <- function(reference_set_select_method, colnames(canoes.reads) <- c("target", "gc", "chromosome", "start", "end", sampname) write.table(as.data.frame(canoes.reads),file="canoes.reads.csv", quote=F, sep=",",row.names=T,col.names=T) xcnv.list <- vector('list', length(sampname)) - for (i in 1:length(sampname)){ - xcnv.list[[i]] <- CANOESCOV::CallCNVs(sample.name=sampname[i], - counts=canoes.reads, - reference_set_select_method=reference_set_select_method, - num_of_samples_in_reference_set=num_of_samples_in_reference_set, - target_length=target_length) + for (i in 1:length(reference_sample_set)) { + if (reference_sample_set[[i]] == '') { + next() + } + samples <- unlist(strsplit(reference_sample_set[[i]], ',')) + actual_sample <- samples[1] + reference_samples <- samples[-1] + xcnv.list[[i]] <- CANOESCOV::CallCNVs(sample.name=actual_sample, + reference.samples=reference_samples, + counts=canoes.reads) } xcnvs <- do.call('rbind', xcnv.list) if (nrow(calls)==0){calls <- matrix(nrow=0, ncol=ncol(xcnvs))} @@ -80,7 +87,7 @@ run_CANOESCOV <- function(reference_set_select_method, calls[colnames(calls) == 'ed_bp'] <- as.character(unlist(calls[colnames(calls) == 'ed_bp'])) calls[colnames(calls) == 'st_exon'] <- as.character(unlist(calls[colnames(calls) == 'st_exon'])) calls[colnames(calls) == 'ed_exon'] <- as.character(unlist(calls[colnames(calls) == 'ed_exon'])) - calls + write.csv(calls, output_calls_file, row.names=F) } # SAMPLE CNV INTERVAL KB CHR MID_BP TARGETS NUM_TARG MLCN Q_SOME diff --git a/R/CODEXCOV/DESCRIPTION b/R/CODEXCOV/DESCRIPTION index d9e2385..f6516a3 100755 --- a/R/CODEXCOV/DESCRIPTION +++ b/R/CODEXCOV/DESCRIPTION @@ -14,8 +14,7 @@ Depends: devtools (>= 1.13.2), DBI (== 0.8), optparse (== 1.4.4), - CODEX (>= 1.8.0), - REFERENCE.SAMPLE.SET.SELECTOR (>= 0.0.1) + CODEX (>= 1.8.0) License: GPL-3 Encoding: UTF-8 LazyData: true From 14b19674f65e21de512c5bd3ca385b7f49d5cfe1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Wiktor=20Ku=C5=9Bmirek?= Date: Mon, 26 Mar 2018 17:44:04 +0200 Subject: [PATCH 028/114] bugfix --- R/CODEXCOV/R/run_CODEXCOV.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/CODEXCOV/R/run_CODEXCOV.R b/R/CODEXCOV/R/run_CODEXCOV.R index 246aa22..a1ce428 100644 --- a/R/CODEXCOV/R/run_CODEXCOV.R +++ b/R/CODEXCOV/R/run_CODEXCOV.R @@ -50,7 +50,7 @@ run_CODEXCOV <- function(K_from, samples <- unlist(strsplit(reference_sample_set[[i]], ',')) actual_sample <- samples[1] reference_samples <- samples[-1] - samples <- samples[order(samples[,1]),] + samples <- sort(samples) Y_subset <- Y[,samples] ################################################### From 02e0c01e9f28612218bd8081887f5fb370aea86e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Wiktor=20Ku=C5=9Bmirek?= Date: Mon, 26 Mar 2018 18:05:50 +0200 Subject: [PATCH 029/114] finished dag file for CODEX cnv caller --- airflow/dags/codex.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/airflow/dags/codex.py b/airflow/dags/codex.py index 55ee94b..1da827f 100755 --- a/airflow/dags/codex.py +++ b/airflow/dags/codex.py @@ -36,11 +36,19 @@ num_refs = '30' reference_sample_set_file = 'reference_sample_set.csv' +### codex parameters +k_from = '1' +k_to = '3' +lmax = '200' +output_calls_file = 'calls.csv' + run_codex_caller_cmd= " \ docker pull biodatageeks/cnv-opt-target-qc; \ docker run --rm -v /tmp:/tmp -w=\"/tmp\" biodatageeks/cnv-opt-target-qc Rscript -e \"library(\'TARGET.QC\');run_TARGET.QC(" + mapp_thresh + "," + cov_thresh_from + "," + cov_thresh_to + "," + length_thresh_from + "," + length_thresh_to + "," + gc_thresh_from + "," + gc_thresh_to + ",'" + raw_cov_table + "','" + qc_cov_table + "')\"; \ docker pull biodatageeks/cnv-opt-reference-sample-set-selector; \ docker run --rm -v /tmp:/tmp -w=\"/tmp\" biodatageeks/cnv-opt-reference-sample-set-selector Rscript -e \"library(\'REFERENCE.SAMPLE.SET.SELECTOR\');run_REFERENCE.SAMPLE.SET.SELECTOR('" + select_method + "'," + num_refs + ",'" + qc_cov_table + "','" + reference_sample_set_file + "')\"; \ +docker pull biodatageeks/cnv-opt-codexcov; \ +docker run --rm -v /tmp:/tmp -w=\"/tmp\" biodatageeks/cnv-opt-codexcov Rscript -e \"library(\'CODEXCOV\');run_CODEXCOV(" + k_from + "," + k_to + "," + lmax + ",'" + qc_cov_table + "','" + reference_sample_set_file + "','" + output_calls_file + "')\"; \ " run_codex_caller_task= BashOperator ( From 066be6833ec83ccb108bda1751d75ce853a73090 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Wiktor=20Ku=C5=9Bmirek?= Date: Mon, 26 Mar 2018 18:15:26 +0200 Subject: [PATCH 030/114] IRanges in docker for CANOESCOV package --- Docker/cnv-opt-canoescov/Dockerfile | 1 + 1 file changed, 1 insertion(+) diff --git a/Docker/cnv-opt-canoescov/Dockerfile b/Docker/cnv-opt-canoescov/Dockerfile index 44080ff..402047a 100644 --- a/Docker/cnv-opt-canoescov/Dockerfile +++ b/Docker/cnv-opt-canoescov/Dockerfile @@ -1,4 +1,5 @@ FROM biodatageeks/cnv-opt-canoes MAINTAINER biodatageeks +RUN Rscript -e "source('https://bioconductor.org/biocLite.R');biocLite('IRanges')" RUN Rscript -e "install.packages('CANOESCOV', repos = 'http://zsibio.ii.pw.edu.pl/nexus/repository/r-all')" From f9f4ffd5c65e390c55d250a70ad98479f40e6555 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Wiktor=20Ku=C5=9Bmirek?= Date: Mon, 26 Mar 2018 18:29:19 +0200 Subject: [PATCH 031/114] add getgc function to CANOESCOV package --- Docker/cnv-opt-canoescov/Dockerfile | 6 ++++++ R/CANOESCOV/R/functions_CANOESCOV.R | 19 +++++++++++++++++++ R/CANOESCOV/R/run_CANOESCOV.R | 2 +- 3 files changed, 26 insertions(+), 1 deletion(-) diff --git a/Docker/cnv-opt-canoescov/Dockerfile b/Docker/cnv-opt-canoescov/Dockerfile index 402047a..fa7683b 100644 --- a/Docker/cnv-opt-canoescov/Dockerfile +++ b/Docker/cnv-opt-canoescov/Dockerfile @@ -2,4 +2,10 @@ FROM biodatageeks/cnv-opt-canoes MAINTAINER biodatageeks RUN Rscript -e "source('https://bioconductor.org/biocLite.R');biocLite('IRanges')" +RUN Rscript -e "source('https://bioconductor.org/biocLite.R');biocLite('BSgenome.Hsapiens.UCSC.hg19')" +RUN Rscript -e "source('https://bioconductor.org/biocLite.R');biocLite('Biostrings')" +RUN Rscript -e "source('https://bioconductor.org/biocLite.R');biocLite('Rsamtools')" +RUN Rscript -e "source('https://bioconductor.org/biocLite.R');biocLite('GenomeInfoDb')" +RUN Rscript -e "source('https://bioconductor.org/biocLite.R');biocLite('S4Vectors')" + RUN Rscript -e "install.packages('CANOESCOV', repos = 'http://zsibio.ii.pw.edu.pl/nexus/repository/r-all')" diff --git a/R/CANOESCOV/R/functions_CANOESCOV.R b/R/CANOESCOV/R/functions_CANOESCOV.R index b55c138..bac5cc0 100644 --- a/R/CANOESCOV/R/functions_CANOESCOV.R +++ b/R/CANOESCOV/R/functions_CANOESCOV.R @@ -11,6 +11,25 @@ coverageObj1 <- function(cov_table, sampname, targets_for_chr, chr){ return(list(Y=Y)) } + +# from CODEX package +getgc <- function(chr, ref) { + if (chr == "X" | chr == "x" | chr == "chrX" | chr == "chrx") { + chrtemp <- 23 + } else if (chr == "Y" | chr == "y" | chr == "chrY" | chr == "chry") { + chrtemp <- 24 + } else { + chrtemp <- as.numeric(mapSeqlevels(as.character(chr), "NCBI")[1]) + } + if (length(chrtemp) == 0) + message("Chromosome cannot be found in NCBI Homo sapiens database!") + chrm <- unmasked(Hsapiens[[chrtemp]]) + seqs <- Views(chrm, ref) + af <- alphabetFrequency(seqs, baseOnly = TRUE, as.prob = TRUE) + gc <- round((af[, "G"] + af[, "C"]) * 100,2) + gc +} + # CallCNVs # Calls CNVs in sample of interest # Arguments: diff --git a/R/CANOESCOV/R/run_CANOESCOV.R b/R/CANOESCOV/R/run_CANOESCOV.R index f6742c7..8d45c82 100644 --- a/R/CANOESCOV/R/run_CANOESCOV.R +++ b/R/CANOESCOV/R/run_CANOESCOV.R @@ -1,5 +1,5 @@ library(methods) -library(CODEX) +#library(CODEX) run_CANOESCOV <- function(input_cov_table, reference_sample_set_file, From d510815f4a66cbfc490d3aa4649c9ca95f1fd264 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Wiktor=20Ku=C5=9Bmirek?= Date: Mon, 26 Mar 2018 19:30:27 +0200 Subject: [PATCH 032/114] missing libraries in getgc function --- R/CANOESCOV/R/functions_CANOESCOV.R | 30 +++++++++++++++-------------- 1 file changed, 16 insertions(+), 14 deletions(-) diff --git a/R/CANOESCOV/R/functions_CANOESCOV.R b/R/CANOESCOV/R/functions_CANOESCOV.R index bac5cc0..70ff41d 100644 --- a/R/CANOESCOV/R/functions_CANOESCOV.R +++ b/R/CANOESCOV/R/functions_CANOESCOV.R @@ -14,20 +14,22 @@ coverageObj1 <- function(cov_table, sampname, targets_for_chr, chr){ # from CODEX package getgc <- function(chr, ref) { - if (chr == "X" | chr == "x" | chr == "chrX" | chr == "chrx") { - chrtemp <- 23 - } else if (chr == "Y" | chr == "y" | chr == "chrY" | chr == "chry") { - chrtemp <- 24 - } else { - chrtemp <- as.numeric(mapSeqlevels(as.character(chr), "NCBI")[1]) - } - if (length(chrtemp) == 0) - message("Chromosome cannot be found in NCBI Homo sapiens database!") - chrm <- unmasked(Hsapiens[[chrtemp]]) - seqs <- Views(chrm, ref) - af <- alphabetFrequency(seqs, baseOnly = TRUE, as.prob = TRUE) - gc <- round((af[, "G"] + af[, "C"]) * 100,2) - gc + library(GenomeInfoDb) + library(BSgenome.Hsapiens.UCSC.hg19) + if (chr == "X" | chr == "x" | chr == "chrX" | chr == "chrx") { + chrtemp <- 23 + } else if (chr == "Y" | chr == "y" | chr == "chrY" | chr == "chry") { + chrtemp <- 24 + } else { + chrtemp <- as.numeric(mapSeqlevels(as.character(chr), "NCBI")[1]) + } + if (length(chrtemp) == 0) + message("Chromosome cannot be found in NCBI Homo sapiens database!") + chrm <- unmasked(Hsapiens[[chrtemp]]) + seqs <- Views(chrm, ref) + af <- alphabetFrequency(seqs, baseOnly = TRUE, as.prob = TRUE) + gc <- round((af[, "G"] + af[, "C"]) * 100,2) + gc } # CallCNVs From 43cec0bab5d5b63988d3a3ddcd88e80d7a87d65b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Wiktor=20Ku=C5=9Bmirek?= Date: Mon, 26 Mar 2018 21:15:24 +0200 Subject: [PATCH 033/114] bugfixes --- Jenkinsfile | 7 ------- R/CANOESCOV/R/functions_CANOESCOV.R | 2 -- R/CANOESCOV/R/run_CANOESCOV.R | 7 ++++++- 3 files changed, 6 insertions(+), 10 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index ce4967f..fc11b6f 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -2,13 +2,6 @@ pipeline { agent any stages { - stage('Building Docker images') { - steps { - echo 'Building Docker images....' - sh './build.sh' - } - } - /*stage('Test R code') { steps { echo 'Testing R code....' diff --git a/R/CANOESCOV/R/functions_CANOESCOV.R b/R/CANOESCOV/R/functions_CANOESCOV.R index 70ff41d..564f073 100644 --- a/R/CANOESCOV/R/functions_CANOESCOV.R +++ b/R/CANOESCOV/R/functions_CANOESCOV.R @@ -14,8 +14,6 @@ coverageObj1 <- function(cov_table, sampname, targets_for_chr, chr){ # from CODEX package getgc <- function(chr, ref) { - library(GenomeInfoDb) - library(BSgenome.Hsapiens.UCSC.hg19) if (chr == "X" | chr == "x" | chr == "chrX" | chr == "chrx") { chrtemp <- 23 } else if (chr == "Y" | chr == "y" | chr == "chrY" | chr == "chry") { diff --git a/R/CANOESCOV/R/run_CANOESCOV.R b/R/CANOESCOV/R/run_CANOESCOV.R index 8d45c82..eb1b243 100644 --- a/R/CANOESCOV/R/run_CANOESCOV.R +++ b/R/CANOESCOV/R/run_CANOESCOV.R @@ -1,5 +1,10 @@ library(methods) -#library(CODEX) +library(IRanges) +library(BSgenome.Hsapiens.UCSC.hg19) +library(Biostrings) +library(Rsamtools) +library(GenomeInfoDb) +library(S4Vectors) run_CANOESCOV <- function(input_cov_table, reference_sample_set_file, From f0a6ea0a5583bc11915022e834e0eba9a5e9c7ce Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Wiktor=20Ku=C5=9Bmirek?= Date: Mon, 26 Mar 2018 22:16:12 +0200 Subject: [PATCH 034/114] missing libraries --- R/CANOESCOV/R/functions_CANOESCOV.R | 2 ++ 1 file changed, 2 insertions(+) diff --git a/R/CANOESCOV/R/functions_CANOESCOV.R b/R/CANOESCOV/R/functions_CANOESCOV.R index 564f073..70ff41d 100644 --- a/R/CANOESCOV/R/functions_CANOESCOV.R +++ b/R/CANOESCOV/R/functions_CANOESCOV.R @@ -14,6 +14,8 @@ coverageObj1 <- function(cov_table, sampname, targets_for_chr, chr){ # from CODEX package getgc <- function(chr, ref) { + library(GenomeInfoDb) + library(BSgenome.Hsapiens.UCSC.hg19) if (chr == "X" | chr == "x" | chr == "chrX" | chr == "chrx") { chrtemp <- 23 } else if (chr == "Y" | chr == "y" | chr == "chrY" | chr == "chry") { From 43997d9ebf7ea83801b9981281d6449fae10a852 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Wiktor=20Ku=C5=9Bmirek?= Date: Mon, 26 Mar 2018 22:50:44 +0200 Subject: [PATCH 035/114] libraries in another place --- R/CANOESCOV/R/functions_CANOESCOV.R | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/R/CANOESCOV/R/functions_CANOESCOV.R b/R/CANOESCOV/R/functions_CANOESCOV.R index 70ff41d..6b7fac3 100644 --- a/R/CANOESCOV/R/functions_CANOESCOV.R +++ b/R/CANOESCOV/R/functions_CANOESCOV.R @@ -68,6 +68,12 @@ getgc <- function(chr, ref) { # NUM_TARG: how many targets are in the CNV # Q_SOME: a Phred-scaled quality score for the CNV CallCNVs <- function(sample.name, reference.samples, counts, p=1e-08, Tnum=6, D=70000, get.dfs=F, homdel.mean=0.2){ + library(IRanges) + library(BSgenome.Hsapiens.UCSC.hg19) + library(Biostrings) + library(Rsamtools) + library(GenomeInfoDb) + library(S4Vectors) if (!sample.name %in% names(counts)){stop("No column for sample ", sample.name, " in counts matrix")} if (length(setdiff(names(counts)[1:5], c("target", "chromosome", "start", "end", "gc"))) > 0){ stop("First five columns of counts matrix must be target, chromosome, start, end, gc") From 68507f078bdd8f847c0dcacaa8213353d8ee1d91 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Wiktor=20Ku=C5=9Bmirek?= Date: Tue, 27 Mar 2018 14:51:36 +0200 Subject: [PATCH 036/114] remove libraries from run_CANOESCOV function --- R/CANOESCOV/R/run_CANOESCOV.R | 6 ------ 1 file changed, 6 deletions(-) diff --git a/R/CANOESCOV/R/run_CANOESCOV.R b/R/CANOESCOV/R/run_CANOESCOV.R index eb1b243..f85e619 100644 --- a/R/CANOESCOV/R/run_CANOESCOV.R +++ b/R/CANOESCOV/R/run_CANOESCOV.R @@ -1,10 +1,4 @@ library(methods) -library(IRanges) -library(BSgenome.Hsapiens.UCSC.hg19) -library(Biostrings) -library(Rsamtools) -library(GenomeInfoDb) -library(S4Vectors) run_CANOESCOV <- function(input_cov_table, reference_sample_set_file, From 6e6cc476df17fbbeac45703e60f8891272f80aad Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Wiktor=20Ku=C5=9Bmirek?= Date: Tue, 27 Mar 2018 14:56:47 +0200 Subject: [PATCH 037/114] CODEX in CANOESCOV package (only for tests) --- Docker/cnv-opt-canoescov/Dockerfile | 1 + 1 file changed, 1 insertion(+) diff --git a/Docker/cnv-opt-canoescov/Dockerfile b/Docker/cnv-opt-canoescov/Dockerfile index fa7683b..b138c36 100644 --- a/Docker/cnv-opt-canoescov/Dockerfile +++ b/Docker/cnv-opt-canoescov/Dockerfile @@ -7,5 +7,6 @@ RUN Rscript -e "source('https://bioconductor.org/biocLite.R');biocLite('Biostrin RUN Rscript -e "source('https://bioconductor.org/biocLite.R');biocLite('Rsamtools')" RUN Rscript -e "source('https://bioconductor.org/biocLite.R');biocLite('GenomeInfoDb')" RUN Rscript -e "source('https://bioconductor.org/biocLite.R');biocLite('S4Vectors')" +RUN Rscript -e "source('https://bioconductor.org/biocLite.R');biocLite('CODEX')" RUN Rscript -e "install.packages('CANOESCOV', repos = 'http://zsibio.ii.pw.edu.pl/nexus/repository/r-all')" From 1b82b56863b5abfd39ab0e74802f549fe35b9b76 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Wiktor=20Ku=C5=9Bmirek?= Date: Tue, 27 Mar 2018 15:37:57 +0200 Subject: [PATCH 038/114] tests for building CANOESCOV docker with CODEX dependencies --- Docker/cnv-opt-canoescov/Dockerfile | 6 ------ 1 file changed, 6 deletions(-) diff --git a/Docker/cnv-opt-canoescov/Dockerfile b/Docker/cnv-opt-canoescov/Dockerfile index b138c36..27a19c2 100644 --- a/Docker/cnv-opt-canoescov/Dockerfile +++ b/Docker/cnv-opt-canoescov/Dockerfile @@ -1,12 +1,6 @@ FROM biodatageeks/cnv-opt-canoes MAINTAINER biodatageeks -RUN Rscript -e "source('https://bioconductor.org/biocLite.R');biocLite('IRanges')" -RUN Rscript -e "source('https://bioconductor.org/biocLite.R');biocLite('BSgenome.Hsapiens.UCSC.hg19')" -RUN Rscript -e "source('https://bioconductor.org/biocLite.R');biocLite('Biostrings')" -RUN Rscript -e "source('https://bioconductor.org/biocLite.R');biocLite('Rsamtools')" -RUN Rscript -e "source('https://bioconductor.org/biocLite.R');biocLite('GenomeInfoDb')" -RUN Rscript -e "source('https://bioconductor.org/biocLite.R');biocLite('S4Vectors')" RUN Rscript -e "source('https://bioconductor.org/biocLite.R');biocLite('CODEX')" RUN Rscript -e "install.packages('CANOESCOV', repos = 'http://zsibio.ii.pw.edu.pl/nexus/repository/r-all')" From 755562282918d2eb9c365bd6423ed3a390f22262 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Wiktor=20Ku=C5=9Bmirek?= Date: Tue, 27 Mar 2018 16:29:15 +0200 Subject: [PATCH 039/114] missing CANOES library --- R/CANOESCOV/R/functions_CANOESCOV.R | 1 + 1 file changed, 1 insertion(+) diff --git a/R/CANOESCOV/R/functions_CANOESCOV.R b/R/CANOESCOV/R/functions_CANOESCOV.R index 6b7fac3..e909d2e 100644 --- a/R/CANOESCOV/R/functions_CANOESCOV.R +++ b/R/CANOESCOV/R/functions_CANOESCOV.R @@ -74,6 +74,7 @@ CallCNVs <- function(sample.name, reference.samples, counts, p=1e-08, Tnum=6, D= library(Rsamtools) library(GenomeInfoDb) library(S4Vectors) + library(CANOES) if (!sample.name %in% names(counts)){stop("No column for sample ", sample.name, " in counts matrix")} if (length(setdiff(names(counts)[1:5], c("target", "chromosome", "start", "end", "gc"))) > 0){ stop("First five columns of counts matrix must be target, chromosome, start, end, gc") From 381555c03049ac7111e6dd6c1f37388ae1dcf999 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Wiktor=20Ku=C5=9Bmirek?= Date: Tue, 27 Mar 2018 17:36:05 +0200 Subject: [PATCH 040/114] bugfix in CANOESCOV libraries --- Docker/cnv-opt-canoescov/Dockerfile | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/Docker/cnv-opt-canoescov/Dockerfile b/Docker/cnv-opt-canoescov/Dockerfile index 27a19c2..fa7683b 100644 --- a/Docker/cnv-opt-canoescov/Dockerfile +++ b/Docker/cnv-opt-canoescov/Dockerfile @@ -1,6 +1,11 @@ FROM biodatageeks/cnv-opt-canoes MAINTAINER biodatageeks -RUN Rscript -e "source('https://bioconductor.org/biocLite.R');biocLite('CODEX')" +RUN Rscript -e "source('https://bioconductor.org/biocLite.R');biocLite('IRanges')" +RUN Rscript -e "source('https://bioconductor.org/biocLite.R');biocLite('BSgenome.Hsapiens.UCSC.hg19')" +RUN Rscript -e "source('https://bioconductor.org/biocLite.R');biocLite('Biostrings')" +RUN Rscript -e "source('https://bioconductor.org/biocLite.R');biocLite('Rsamtools')" +RUN Rscript -e "source('https://bioconductor.org/biocLite.R');biocLite('GenomeInfoDb')" +RUN Rscript -e "source('https://bioconductor.org/biocLite.R');biocLite('S4Vectors')" RUN Rscript -e "install.packages('CANOESCOV', repos = 'http://zsibio.ii.pw.edu.pl/nexus/repository/r-all')" From 6e4c1cca7b9cf853c3a037bc85af0cedaadf991b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Wiktor=20Ku=C5=9Bmirek?= Date: Fri, 30 Mar 2018 12:48:33 +0200 Subject: [PATCH 041/114] first version of new interface to TARGET.QC package --- R/TARGET.QC/R/functions_TARGET.QC.R | 12 ------ R/TARGET.QC/R/run_TARGET.QC.R | 60 +++++++++-------------------- 2 files changed, 19 insertions(+), 53 deletions(-) diff --git a/R/TARGET.QC/R/functions_TARGET.QC.R b/R/TARGET.QC/R/functions_TARGET.QC.R index 63f2278..850803e 100644 --- a/R/TARGET.QC/R/functions_TARGET.QC.R +++ b/R/TARGET.QC/R/functions_TARGET.QC.R @@ -1,17 +1,5 @@ library(CODEX) -coverageObj1 <- function(cov_table, sampname, targets_for_chr, chr){ - Y <- matrix(data=as.integer(0), nrow = nrow(targets_for_chr), ncol = 0) - for(sample in sampname) { - cov_targets_for_sample <- cov_table[cov_table[,"sample_name"] == sample,] - cov_targets_for_sample <- cov_targets_for_sample[with(cov_targets_for_sample, order(target_id)), ] - Y <- cbind(Y, cov_targets_for_sample[,"read_count"]) - } - colnames(Y) <- sampname - rownames(Y) <- targets_for_chr[,"target_id"] - return(list(Y=Y)) -} - gcmapp1 <- function(chr, ref){ gc <- getgc(chr, ref) mapp <- getmapp(chr, ref) diff --git a/R/TARGET.QC/R/run_TARGET.QC.R b/R/TARGET.QC/R/run_TARGET.QC.R index 42ff0e6..f43787e 100644 --- a/R/TARGET.QC/R/run_TARGET.QC.R +++ b/R/TARGET.QC/R/run_TARGET.QC.R @@ -6,7 +6,9 @@ run_TARGET.QC <- function(mapp_thresh, gc_thresh_from, gc_thresh_to, input_cov_table, - output_cov_table){ + output_cov_table, + input_bed, + output_bed){ #mapp_thresh <- 0.9 #cov_thresh_from <- 20 #cov_thresh_to <- 4000 @@ -14,45 +16,21 @@ run_TARGET.QC <- function(mapp_thresh, #length_thresh_to <- 2000 #gc_thresh_from <- 20 #gc_thresh_to <- 80 - #lmax <- 200 - cov_table <- read.csv(input_cov_table) - sampname <- unique(cov_table[,"sample_name"]) - targets <- cov_table[,c("target_id", "chr", "pos_min", "pos_max")] - targets <- targets[!duplicated(targets[,"target_id"]),] - targets <- targets[with(targets, order(target_id)), ] - cov_table_qc <- matrix(nrow=0, ncol=6) - colnames(cov_table_qc) <- colnames(cov_table) - - chrs <- c(1:22, "X", "Y", paste0("chr",c(1:22, "X", "Y"))) - for(chr in chrs) { - targets_for_chr <- targets[targets[,"chr"] == chr,] - ref <- IRanges(start = targets_for_chr[,"pos_min"], end = targets_for_chr[,"pos_max"]) - if (length(ref) == 0) { # 0 elements for specified chromosome in bed - next() - } - Y <- coverageObj1(cov_table, sampname, targets_for_chr, chr)$Y - gcmapp1_result <- gcmapp1(chr, ref) - gc <- gcmapp1_result$gc - mapp <- gcmapp1_result$mapp - - qcObj1_result <- qcObj1(Y, sampname, chr, ref, mapp, gc, cov_thresh = c(cov_thresh_from, cov_thresh_to), - length_thresh = c(length_thresh_from, length_thresh_to), mapp_thresh, - gc_thresh = c(gc_thresh_from, gc_thresh_to)) - Y_qc <- qcObj1_result$Y_qc - sampname_qc <- qcObj1_result$sampname_qc - ref_qc <- qcObj1_result$ref_qc - colnames(Y_qc) <- sampname_qc - for(sample in colnames(Y_qc)) { - new_cov_table_qc_rows <- cbind(sample, rownames(Y_qc), chr, start(ref_qc), end(ref_qc), Y_qc[,sample]) - cov_table_qc <- rbind(cov_table_qc, new_cov_table_qc_rows) - } - } - cov_table_qc <- as.data.frame(cov_table_qc) - cov_table_qc[,"pos_min"] <- strtoi(cov_table_qc[,"pos_min"]) - cov_table_qc[,"pos_max"] <- strtoi(cov_table_qc[,"pos_max"]) - cov_table_qc[,"target_id"] <- strtoi(cov_table_qc[,"target_id"]) - cov_table_qc[,"read_count"] <- strtoi(cov_table_qc[,"read_count"]) - colnames(cov_table_qc) <- c("sample_name", "target_id", "chr", "pos_min", "pos_max", "read_count") - write.csv(cov_table_qc, output_cov_table, row.names=F, quote=F) + Y <- read.csv(input_cov_table) + sampname <- colnames(Y) + targets <- read.delim(input_bed) + ref <- IRanges(start = targets[,"st_bp"], end = targets[,"ed_bp"]) + gcmapp1_result <- gcmapp1(targets[1,"chr"], ref) + gc <- gcmapp1_result$gc + mapp <- gcmapp1_result$mapp + qcObj1_result <- qcObj1(Y, sampname, targets[1,"chr"], ref, mapp, gc, cov_thresh = c(cov_thresh_from, cov_thresh_to), + length_thresh = c(length_thresh_from, length_thresh_to), mapp_thresh, + gc_thresh = c(gc_thresh_from, gc_thresh_to)) + Y_qc <- qcObj1_result$Y_qc + sampname_qc <- qcObj1_result$sampname_qc + ref_qc <- qcObj1_result$ref_qc + colnames(Y_qc) <- sampname_qc + write.csv(Y_qc, output_cov_table, row.names=F, quote=F) + write.csv(ref_qc, output_bed, row.names=F, quote=F) } From 9a7c04394b64f825fe6044535c67a305e71d8b92 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Wiktor=20Ku=C5=9Bmirek?= Date: Fri, 30 Mar 2018 12:56:59 +0200 Subject: [PATCH 042/114] first version of new interface to REFERENCE.SAMPLE.SET.SELECTOR package --- .../R/functions_REFERENCE.SAMPLE.SET.SELECTOR.R | 12 ------------ .../R/run_REFERENCE.SAMPLE.SET.SELECTOR.R | 12 +++++------- 2 files changed, 5 insertions(+), 19 deletions(-) diff --git a/R/REFERENCE.SAMPLE.SET.SELECTOR/R/functions_REFERENCE.SAMPLE.SET.SELECTOR.R b/R/REFERENCE.SAMPLE.SET.SELECTOR/R/functions_REFERENCE.SAMPLE.SET.SELECTOR.R index 70c9fe4..d92bf29 100644 --- a/R/REFERENCE.SAMPLE.SET.SELECTOR/R/functions_REFERENCE.SAMPLE.SET.SELECTOR.R +++ b/R/REFERENCE.SAMPLE.SET.SELECTOR/R/functions_REFERENCE.SAMPLE.SET.SELECTOR.R @@ -1,16 +1,4 @@ -coverageObj1 <- function(cov_table, sampname, targets_for_chr){ - Y <- matrix(data=as.integer(0), nrow = nrow(targets_for_chr), ncol = 0) - for(sample in sampname) { - cov_targets_for_sample <- cov_table[cov_table[,"sample_name"] == sample,] - cov_targets_for_sample <- cov_targets_for_sample[with(cov_targets_for_sample, order(target_id)), ] - Y <- cbind(Y, cov_targets_for_sample[,"read_count"]) - } - colnames(Y) <- sampname - rownames(Y) <- targets_for_chr[,"target_id"] - return(list(Y=Y)) -} - canoes_method <- function(investigated_sample, Y, num_refs){ if (num_refs == 0) { num_refs <- 30 # in CANOES application num_refs is default set to 30 diff --git a/R/REFERENCE.SAMPLE.SET.SELECTOR/R/run_REFERENCE.SAMPLE.SET.SELECTOR.R b/R/REFERENCE.SAMPLE.SET.SELECTOR/R/run_REFERENCE.SAMPLE.SET.SELECTOR.R index c4492df..4783c73 100644 --- a/R/REFERENCE.SAMPLE.SET.SELECTOR/R/run_REFERENCE.SAMPLE.SET.SELECTOR.R +++ b/R/REFERENCE.SAMPLE.SET.SELECTOR/R/run_REFERENCE.SAMPLE.SET.SELECTOR.R @@ -1,15 +1,13 @@ run_REFERENCE.SAMPLE.SET.SELECTOR <- function(select_method, num_refs, input_cov_table, + input_bed, output_reference_file){ - cov_table <- read.csv(input_cov_table) - sampname <- unique(cov_table[,"sample_name"]) - targets <- cov_table[,c("target_id", "chr", "pos_min", "pos_max")] - targets <- targets[!duplicated(targets[,"target_id"]),] - targets <- targets[with(targets, order(target_id)), ] - target_length <- targets[,"pos_max"] - targets[,"pos_min"] - Y <- coverageObj1(cov_table, sampname, targets)$Y + Y <- read.csv(input_cov_table) + sampname <- colnames(Y) + targets <- read.delim(input_bed) + target_length <- targets[,"st_bp"] - targets[,"ed_bp"] reference_samples <- list() for(i in 1:length(sampname)) { From b069d278afc5d53269e77b9788270793ebeac15c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Wiktor=20Ku=C5=9Bmirek?= Date: Fri, 30 Mar 2018 13:18:31 +0200 Subject: [PATCH 043/114] test of forcing rebuilding specified docker --- build.sh | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/build.sh b/build.sh index f2dba85..c2c3df5 100755 --- a/build.sh +++ b/build.sh @@ -34,8 +34,9 @@ do diffTs=`echo "$(date +%s) - $(git log -n 1 --pretty=format:%at ${dir})" | bc` if [ $diffTs -lt $MAX_COMMIT_TS_DIFF ]; then cd $dir - #docker build -t $image:$version . - #docker build -t $image:latest . + if [[ ${image} == "biodatageeks/target-qc" ]]; then + docker rmi $image + fi docker build --no-cache -t $image:$version . docker build --no-cache -t $image:latest . if [[ ${BUILD_MODE} != "local" ]]; then From 7999948f0aa8b98f77ef256e4bafd837bf208dcd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Wiktor=20Ku=C5=9Bmirek?= Date: Fri, 30 Mar 2018 13:21:39 +0200 Subject: [PATCH 044/114] proper name of packege --- build.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/build.sh b/build.sh index c2c3df5..462c3d9 100755 --- a/build.sh +++ b/build.sh @@ -34,7 +34,7 @@ do diffTs=`echo "$(date +%s) - $(git log -n 1 --pretty=format:%at ${dir})" | bc` if [ $diffTs -lt $MAX_COMMIT_TS_DIFF ]; then cd $dir - if [[ ${image} == "biodatageeks/target-qc" ]]; then + if [[ ${image} == "biodatageeks/cnv-opt-target-qc" ]]; then docker rmi $image fi docker build --no-cache -t $image:$version . From bc168e90f32b414c54918df1ff93ae9c6cf3d8fa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Wiktor=20Ku=C5=9Bmirek?= Date: Fri, 30 Mar 2018 13:25:24 +0200 Subject: [PATCH 045/114] another test of forcing docker rebuilding --- build.sh | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/build.sh b/build.sh index 462c3d9..8460c5f 100755 --- a/build.sh +++ b/build.sh @@ -34,9 +34,7 @@ do diffTs=`echo "$(date +%s) - $(git log -n 1 --pretty=format:%at ${dir})" | bc` if [ $diffTs -lt $MAX_COMMIT_TS_DIFF ]; then cd $dir - if [[ ${image} == "biodatageeks/cnv-opt-target-qc" ]]; then - docker rmi $image - fi + docker rmi biodatageeks/cnv-opt-target-qc docker build --no-cache -t $image:$version . docker build --no-cache -t $image:latest . if [[ ${BUILD_MODE} != "local" ]]; then From 200a8f678f506df1302e3eadc3f47baa7a3a369f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Wiktor=20Ku=C5=9Bmirek?= Date: Fri, 30 Mar 2018 13:38:41 +0200 Subject: [PATCH 046/114] another test of rebuilding docker image --- Docker/cnv-opt-target-qc/Dockerfile | 2 ++ build.sh | 6 +++++- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/Docker/cnv-opt-target-qc/Dockerfile b/Docker/cnv-opt-target-qc/Dockerfile index 8391b91..e78cc1a 100644 --- a/Docker/cnv-opt-target-qc/Dockerfile +++ b/Docker/cnv-opt-target-qc/Dockerfile @@ -1,4 +1,6 @@ FROM biodatageeks/cnv-opt-codex MAINTAINER biodatageeks +ARG CACHE_DATE=not_a_date + RUN Rscript -e "install.packages('TARGET.QC', repos = 'http://zsibio.ii.pw.edu.pl/nexus/repository/r-all')" diff --git a/build.sh b/build.sh index 8460c5f..e8b07ff 100755 --- a/build.sh +++ b/build.sh @@ -34,7 +34,11 @@ do diffTs=`echo "$(date +%s) - $(git log -n 1 --pretty=format:%at ${dir})" | bc` if [ $diffTs -lt $MAX_COMMIT_TS_DIFF ]; then cd $dir - docker rmi biodatageeks/cnv-opt-target-qc + if [[ ${image} == "biodatageeks/cnv-opt-target-qc" ]]; then + echo "Rebuilf of ${image} image forced..." + docker build --build-arg CACHE_DATE=$(date +%Y-%m-%d:%H:%M:%S) -t $image:$version . + docker build --build-arg CACHE_DATE=$(date +%Y-%m-%d:%H:%M:%S) -t $image:latest . + fi docker build --no-cache -t $image:$version . docker build --no-cache -t $image:latest . if [[ ${BUILD_MODE} != "local" ]]; then From d3dd1407a227d969ab3364eb5e9046bd42ed6110 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Wiktor=20Ku=C5=9Bmirek?= Date: Fri, 30 Mar 2018 13:52:55 +0200 Subject: [PATCH 047/114] force to rebuild cnv-opt-reference-sample-set-selector docker --- Docker/cnv-opt-reference-sample-set-selector/Dockerfile | 2 ++ build.sh | 7 ++++--- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/Docker/cnv-opt-reference-sample-set-selector/Dockerfile b/Docker/cnv-opt-reference-sample-set-selector/Dockerfile index 7e82158..716a5f4 100644 --- a/Docker/cnv-opt-reference-sample-set-selector/Dockerfile +++ b/Docker/cnv-opt-reference-sample-set-selector/Dockerfile @@ -1,6 +1,8 @@ FROM biodatageeks/cnv-opt-codex MAINTAINER biodatageeks +ARG CACHE_DATE=not_a_date + RUN Rscript -e "install.packages('ExomeDepth', repos = 'http://cran.us.r-project.org')" RUN Rscript -e "install.packages('REFERENCE.SAMPLE.SET.SELECTOR', repos = 'http://zsibio.ii.pw.edu.pl/nexus/repository/r-all')" diff --git a/build.sh b/build.sh index e8b07ff..7e5be95 100755 --- a/build.sh +++ b/build.sh @@ -34,13 +34,14 @@ do diffTs=`echo "$(date +%s) - $(git log -n 1 --pretty=format:%at ${dir})" | bc` if [ $diffTs -lt $MAX_COMMIT_TS_DIFF ]; then cd $dir - if [[ ${image} == "biodatageeks/cnv-opt-target-qc" ]]; then + if [[ ${image} == "biodatageeks/cnv-opt-target-qc" ]] || [[ ${image} == "biodatageeks/cnv-opt-reference-sample-set-selector" ]]; then echo "Rebuilf of ${image} image forced..." docker build --build-arg CACHE_DATE=$(date +%Y-%m-%d:%H:%M:%S) -t $image:$version . docker build --build-arg CACHE_DATE=$(date +%Y-%m-%d:%H:%M:%S) -t $image:latest . + else + docker build --no-cache -t $image:$version . + docker build --no-cache -t $image:latest . fi - docker build --no-cache -t $image:$version . - docker build --no-cache -t $image:latest . if [[ ${BUILD_MODE} != "local" ]]; then docker push docker.io/$image:latest docker push docker.io/$image:$version From b3d1bc92bb2a6c72dfe21ff4ff8a1fed768102ef Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Wiktor=20Ku=C5=9Bmirek?= Date: Fri, 30 Mar 2018 13:54:07 +0200 Subject: [PATCH 048/114] bugfix --- R/TARGET.QC/R/run_TARGET.QC.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/TARGET.QC/R/run_TARGET.QC.R b/R/TARGET.QC/R/run_TARGET.QC.R index f43787e..6d56276 100644 --- a/R/TARGET.QC/R/run_TARGET.QC.R +++ b/R/TARGET.QC/R/run_TARGET.QC.R @@ -31,6 +31,6 @@ run_TARGET.QC <- function(mapp_thresh, ref_qc <- qcObj1_result$ref_qc colnames(Y_qc) <- sampname_qc write.csv(Y_qc, output_cov_table, row.names=F, quote=F) - write.csv(ref_qc, output_bed, row.names=F, quote=F) + write.csv(ref[rownames(ref_qc),], output_bed, row.names=F, quote=F) } From 2247ef34da81b31b46de0b33b396ac3b1e0589d1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Wiktor=20Ku=C5=9Bmirek?= Date: Fri, 30 Mar 2018 13:59:31 +0200 Subject: [PATCH 049/114] bugfix one more time --- R/TARGET.QC/R/run_TARGET.QC.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/TARGET.QC/R/run_TARGET.QC.R b/R/TARGET.QC/R/run_TARGET.QC.R index 6d56276..b6cab3a 100644 --- a/R/TARGET.QC/R/run_TARGET.QC.R +++ b/R/TARGET.QC/R/run_TARGET.QC.R @@ -31,6 +31,6 @@ run_TARGET.QC <- function(mapp_thresh, ref_qc <- qcObj1_result$ref_qc colnames(Y_qc) <- sampname_qc write.csv(Y_qc, output_cov_table, row.names=F, quote=F) - write.csv(ref[rownames(ref_qc),], output_bed, row.names=F, quote=F) + write.csv(targets[rownames(ref_qc),], output_bed, row.names=F, quote=F) } From 961e8b38e975c911873e9e71ba38c09d6e8447f6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Wiktor=20Ku=C5=9Bmirek?= Date: Fri, 30 Mar 2018 14:14:57 +0200 Subject: [PATCH 050/114] propoer indices --- R/TARGET.QC/R/run_TARGET.QC.R | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/R/TARGET.QC/R/run_TARGET.QC.R b/R/TARGET.QC/R/run_TARGET.QC.R index b6cab3a..2c3da28 100644 --- a/R/TARGET.QC/R/run_TARGET.QC.R +++ b/R/TARGET.QC/R/run_TARGET.QC.R @@ -19,6 +19,8 @@ run_TARGET.QC <- function(mapp_thresh, Y <- read.csv(input_cov_table) sampname <- colnames(Y) targets <- read.delim(input_bed) + rownames(Y) <- 1:nrow(Y) + rownames(targets) <- 1:nrow(targets) ref <- IRanges(start = targets[,"st_bp"], end = targets[,"ed_bp"]) gcmapp1_result <- gcmapp1(targets[1,"chr"], ref) gc <- gcmapp1_result$gc @@ -31,6 +33,6 @@ run_TARGET.QC <- function(mapp_thresh, ref_qc <- qcObj1_result$ref_qc colnames(Y_qc) <- sampname_qc write.csv(Y_qc, output_cov_table, row.names=F, quote=F) - write.csv(targets[rownames(ref_qc),], output_bed, row.names=F, quote=F) + write.csv(targets[rownames(Y_qc),], output_bed, row.names=F, quote=F) } From 3da288a44fb00f8ea2308fa7c982644c7885a483 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Wiktor=20Ku=C5=9Bmirek?= Date: Fri, 30 Mar 2018 14:22:29 +0200 Subject: [PATCH 051/114] bugfix in saveing qc_bed to file --- R/TARGET.QC/R/run_TARGET.QC.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/TARGET.QC/R/run_TARGET.QC.R b/R/TARGET.QC/R/run_TARGET.QC.R index 2c3da28..f5560c6 100644 --- a/R/TARGET.QC/R/run_TARGET.QC.R +++ b/R/TARGET.QC/R/run_TARGET.QC.R @@ -33,6 +33,6 @@ run_TARGET.QC <- function(mapp_thresh, ref_qc <- qcObj1_result$ref_qc colnames(Y_qc) <- sampname_qc write.csv(Y_qc, output_cov_table, row.names=F, quote=F) - write.csv(targets[rownames(Y_qc),], output_bed, row.names=F, quote=F) + write.table(targets[rownames(Y_qc),], output_bed, row.names=F, quote=F, sep="\t") } From 54415b6a20ce591ed04058a0d5c62acebc1c465d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Wiktor=20Ku=C5=9Bmirek?= Date: Fri, 30 Mar 2018 14:32:53 +0200 Subject: [PATCH 052/114] progress in selecting reference sample set --- .../R/run_REFERENCE.SAMPLE.SET.SELECTOR.R | 1 + 1 file changed, 1 insertion(+) diff --git a/R/REFERENCE.SAMPLE.SET.SELECTOR/R/run_REFERENCE.SAMPLE.SET.SELECTOR.R b/R/REFERENCE.SAMPLE.SET.SELECTOR/R/run_REFERENCE.SAMPLE.SET.SELECTOR.R index 4783c73..37d63e5 100644 --- a/R/REFERENCE.SAMPLE.SET.SELECTOR/R/run_REFERENCE.SAMPLE.SET.SELECTOR.R +++ b/R/REFERENCE.SAMPLE.SET.SELECTOR/R/run_REFERENCE.SAMPLE.SET.SELECTOR.R @@ -12,6 +12,7 @@ run_REFERENCE.SAMPLE.SET.SELECTOR <- function(select_method, for(i in 1:length(sampname)) { investigated_sample <- as.character(sampname[i]) + print(paste("Processing ", investigated_sample, " sample ...", sep="")) if(select_method == "canoes") { reference_samples_for_investigated_sample <- canoes_method(investigated_sample, Y, num_refs)$reference_samples reference_samples[[i]] <- c(investigated_sample, reference_samples_for_investigated_sample) From ef805c789eef5cf3a525bf620ada4bdee3be5099 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Wiktor=20Ku=C5=9Bmirek?= Date: Fri, 30 Mar 2018 15:19:39 +0200 Subject: [PATCH 053/114] firsst version of new interface for exomedepthcov package --- Docker/cnv-opt-exomedepthcov/Dockerfile | 2 + R/EXOMEDEPTHCOV/R/functions_EXOMEDEPTHCOV.R | 15 ---- R/EXOMEDEPTHCOV/R/run_EXOMEDEPTHCOV.R | 81 ++++++++++----------- build.sh | 2 +- 4 files changed, 40 insertions(+), 60 deletions(-) diff --git a/Docker/cnv-opt-exomedepthcov/Dockerfile b/Docker/cnv-opt-exomedepthcov/Dockerfile index 28448f8..9d2d135 100644 --- a/Docker/cnv-opt-exomedepthcov/Dockerfile +++ b/Docker/cnv-opt-exomedepthcov/Dockerfile @@ -1,5 +1,7 @@ FROM biodatageeks/cnv-opt-exomedepth MAINTAINER biodatageeks +ARG CACHE_DATE=not_a_date + RUN Rscript -e "install.packages('EXOMEDEPTHCOV', repos = 'http://zsibio.ii.pw.edu.pl/nexus/repository/r-all')" diff --git a/R/EXOMEDEPTHCOV/R/functions_EXOMEDEPTHCOV.R b/R/EXOMEDEPTHCOV/R/functions_EXOMEDEPTHCOV.R index 550b4d0..e69de29 100644 --- a/R/EXOMEDEPTHCOV/R/functions_EXOMEDEPTHCOV.R +++ b/R/EXOMEDEPTHCOV/R/functions_EXOMEDEPTHCOV.R @@ -1,15 +0,0 @@ -library(ExomeDepth) - -coverageObj1 <- function(cov_table, sampname, targets_for_chr){ - Y <- matrix(data=as.integer(0), nrow = nrow(targets_for_chr), ncol = 0) - for(sample in sampname) { - cov_targets_for_sample <- cov_table[cov_table[,"sample_name"] == sample,] - cov_targets_for_sample <- cov_targets_for_sample[with(cov_targets_for_sample, order(target_id)), ] - Y <- cbind(Y, cov_targets_for_sample[,"read_count"]) - } - colnames(Y) <- sampname - rownames(Y) <- targets_for_chr[,"target_id"] - return(list(Y=Y)) -} - - diff --git a/R/EXOMEDEPTHCOV/R/run_EXOMEDEPTHCOV.R b/R/EXOMEDEPTHCOV/R/run_EXOMEDEPTHCOV.R index a8e8e3c..2933dc2 100644 --- a/R/EXOMEDEPTHCOV/R/run_EXOMEDEPTHCOV.R +++ b/R/EXOMEDEPTHCOV/R/run_EXOMEDEPTHCOV.R @@ -2,62 +2,55 @@ library(ExomeDepth) library(methods) run_EXOMEDEPTHCOV <- function(input_cov_table, + input_bed, reference_sample_set_file, output_calls_file){ con <- file(reference_sample_set_file, open='r') reference_sample_set <- readLines(con) - cov_table <- read.csv(input_cov_table) - sampname <- unique(cov_table[,"sample_name"]) - targets <- cov_table[,c("target_id", "chr", "pos_min", "pos_max")] - targets <- targets[!duplicated(targets[,"target_id"]),] - targets <- targets[with(targets, order(target_id)), ] + Y <- read.csv(input_cov_table) + sampname <- colnames(Y) + targets <- read.delim(input_bed) + rownames(Y) <- 1:nrow(Y) + rownames(targets) <- 1:nrow(targets) calls <- data.frame(matrix(nrow=0, ncol=13)) - chrs <- c(1:22, "X", "Y", paste0("chr",c(1:22, "X", "Y"))) library(IRanges) - for(chr in chrs) { - targets_for_chr <- targets[targets[,"chr"] == chr,] - ref <- IRanges(start = targets_for_chr[,"pos_min"], end = targets_for_chr[,"pos_max"]) - if (length(ref) == 0) { # 0 elements for specified chromosome in bed + ref <- IRanges(start = targets[,"st_bp"], end = targets[,"ed_bp"]) + + for (i in 1:length(reference_sample_set)) { + if (reference_sample_set[[i]] == '') { next() } - Y <- coverageObj1(cov_table, sampname, targets_for_chr)$Y - - for (i in 1:length(reference_sample_set)) { - if (reference_sample_set[[i]] == '') { - next() - } - samples <- unlist(strsplit(reference_sample_set[[i]], ',')) - actual_sample <- samples[1] - reference_samples <- samples[-1] + samples <- unlist(strsplit(reference_sample_set[[i]], ',')) + actual_sample <- samples[1] + reference_samples <- samples[-1] - ## ----construct.ref------------------------------------------------------- - my.matrix <- as.matrix(Y[,reference_samples]) - my.reference.selected <- apply(X = my.matrix, - MAR = 1, - FUN = sum) + ## ----construct.ref------------------------------------------------------- + my.matrix <- as.matrix(Y[,reference_samples]) + my.reference.selected <- apply(X = my.matrix, + MAR = 1, + FUN = sum) - ## ----build.complete------------------------------------------------------ - all.exons <- new('ExomeDepth', - test = Y[,actual_sample], - reference = my.reference.selected, - formula = 'cbind(test, reference) ~ 1') + ## ----build.complete------------------------------------------------------ + all.exons <- new('ExomeDepth', + test = Y[,actual_sample], + reference = my.reference.selected, + formula = 'cbind(test, reference) ~ 1') - ## ----call.CNVs----------------------------------------------------------- - all.exons <- ExomeDepth::CallCNVs(x = all.exons, - transition.probability = 10^-4, - chromosome = rep(chr, nrow(Y)), - start = start(ref), - end = end(ref), - name = rep('name', nrow(Y))) - print(all.exons@CNV.calls) - if (nrow(all.exons@CNV.calls) > 0) { - actual_sample_column <- data.frame(matrix(rep(actual_sample, nrow(all.exons@CNV.calls)), nrow=nrow(all.exons@CNV.calls))) - callsIt <- cbind(actual_sample_column, all.exons@CNV.calls) - colnames(callsIt) <- c(c, colnames(all.exons@CNV.calls)) - if (nrow(calls)==0){calls <- data.frame(matrix(nrow=0, ncol=ncol(callsIt)))} - calls <- rbind(calls, callsIt) - } + ## ----call.CNVs----------------------------------------------------------- + all.exons <- ExomeDepth::CallCNVs(x = all.exons, + transition.probability = 10^-4, + chromosome = rep(targets[1,'chr'], nrow(Y)), + start = start(ref), + end = end(ref), + name = rep('name', nrow(Y))) + print(all.exons@CNV.calls) + if (nrow(all.exons@CNV.calls) > 0) { + actual_sample_column <- data.frame(matrix(rep(actual_sample, nrow(all.exons@CNV.calls)), nrow=nrow(all.exons@CNV.calls))) + callsIt <- cbind(actual_sample_column, all.exons@CNV.calls) + colnames(callsIt) <- c(c, colnames(all.exons@CNV.calls)) + if (nrow(calls)==0){calls <- data.frame(matrix(nrow=0, ncol=ncol(callsIt)))} + calls <- rbind(calls, callsIt) } } # unify names of output columns diff --git a/build.sh b/build.sh index 7e5be95..a8668ac 100755 --- a/build.sh +++ b/build.sh @@ -34,7 +34,7 @@ do diffTs=`echo "$(date +%s) - $(git log -n 1 --pretty=format:%at ${dir})" | bc` if [ $diffTs -lt $MAX_COMMIT_TS_DIFF ]; then cd $dir - if [[ ${image} == "biodatageeks/cnv-opt-target-qc" ]] || [[ ${image} == "biodatageeks/cnv-opt-reference-sample-set-selector" ]]; then + if [[ ${image} == "biodatageeks/cnv-opt-target-qc" ]] || [[ ${image} == "biodatageeks/cnv-opt-reference-sample-set-selector" ]] || [[ ${image} == "biodatageeks/cnv-opt-exomedepthcov" ]]; then echo "Rebuilf of ${image} image forced..." docker build --build-arg CACHE_DATE=$(date +%Y-%m-%d:%H:%M:%S) -t $image:$version . docker build --build-arg CACHE_DATE=$(date +%Y-%m-%d:%H:%M:%S) -t $image:latest . From bd4c87a5083909dea9ccafdb43a3a0760312d083 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Wiktor=20Ku=C5=9Bmirek?= Date: Fri, 30 Mar 2018 15:32:35 +0200 Subject: [PATCH 054/114] some clean up in EXOMEDEPTHCOV package --- R/EXOMEDEPTHCOV/R/run_EXOMEDEPTHCOV.R | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/R/EXOMEDEPTHCOV/R/run_EXOMEDEPTHCOV.R b/R/EXOMEDEPTHCOV/R/run_EXOMEDEPTHCOV.R index 2933dc2..5694e4b 100644 --- a/R/EXOMEDEPTHCOV/R/run_EXOMEDEPTHCOV.R +++ b/R/EXOMEDEPTHCOV/R/run_EXOMEDEPTHCOV.R @@ -14,8 +14,6 @@ run_EXOMEDEPTHCOV <- function(input_cov_table, rownames(Y) <- 1:nrow(Y) rownames(targets) <- 1:nrow(targets) calls <- data.frame(matrix(nrow=0, ncol=13)) - library(IRanges) - ref <- IRanges(start = targets[,"st_bp"], end = targets[,"ed_bp"]) for (i in 1:length(reference_sample_set)) { if (reference_sample_set[[i]] == '') { @@ -40,9 +38,9 @@ run_EXOMEDEPTHCOV <- function(input_cov_table, ## ----call.CNVs----------------------------------------------------------- all.exons <- ExomeDepth::CallCNVs(x = all.exons, transition.probability = 10^-4, - chromosome = rep(targets[1,'chr'], nrow(Y)), - start = start(ref), - end = end(ref), + chromosome = targets[,"chr"], + start = targets[,"st_bp"], + end = targets[,"ed_bp"], name = rep('name', nrow(Y))) print(all.exons@CNV.calls) if (nrow(all.exons@CNV.calls) > 0) { From bb2b86d4a63160e4cb26ef2315fd4b6c70f7e5e9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Wiktor=20Ku=C5=9Bmirek?= Date: Fri, 30 Mar 2018 15:43:04 +0200 Subject: [PATCH 055/114] first version new interface to CODEXCOV package --- Docker/cnv-opt-codexcov/Dockerfile | 2 + R/CODEXCOV/R/functions_CODEXCOV.R | 21 -------- R/CODEXCOV/R/run_CODEXCOV.R | 83 +++++++++++++----------------- build.sh | 2 +- 4 files changed, 38 insertions(+), 70 deletions(-) diff --git a/Docker/cnv-opt-codexcov/Dockerfile b/Docker/cnv-opt-codexcov/Dockerfile index 258bbf8..f06904c 100644 --- a/Docker/cnv-opt-codexcov/Dockerfile +++ b/Docker/cnv-opt-codexcov/Dockerfile @@ -1,4 +1,6 @@ FROM biodatageeks/cnv-opt-codex MAINTAINER biodatageeks +ARG CACHE_DATE=not_a_date + RUN Rscript -e "install.packages('CODEXCOV', repos = 'http://zsibio.ii.pw.edu.pl/nexus/repository/r-all')" diff --git a/R/CODEXCOV/R/functions_CODEXCOV.R b/R/CODEXCOV/R/functions_CODEXCOV.R index 3f7522c..25ad2de 100644 --- a/R/CODEXCOV/R/functions_CODEXCOV.R +++ b/R/CODEXCOV/R/functions_CODEXCOV.R @@ -1,26 +1,5 @@ library(CODEX) -#' Function Dexcription -#' -#' Function description. -#' @param cov_file -#' @param sampname -#' @keywords -#' @export -#' @examples -#' coverageObj1 -coverageObj1 <- function(cov_table, sampname, targets_for_chr, chr){ - Y <- matrix(data=as.integer(0), nrow = nrow(targets_for_chr), ncol = 0) - for(sample in sampname) { - cov_targets_for_sample <- cov_table[cov_table[,"sample_name"] == sample,] - cov_targets_for_sample <- cov_targets_for_sample[with(cov_targets_for_sample, order(target_id)), ] - Y <- cbind(Y, cov_targets_for_sample[,"read_count"]) - } - colnames(Y) <- sampname - rownames(Y) <- targets_for_chr[,"target_id"] - return(list(Y=Y)) -} - #' Function Dexcription #' #' Function description. diff --git a/R/CODEXCOV/R/run_CODEXCOV.R b/R/CODEXCOV/R/run_CODEXCOV.R index a1ce428..cbb5d0a 100644 --- a/R/CODEXCOV/R/run_CODEXCOV.R +++ b/R/CODEXCOV/R/run_CODEXCOV.R @@ -10,69 +10,56 @@ run_CODEXCOV <- function(K_from, K_to, lmax, input_cov_table, + input_bed, reference_sample_set_file, output_calls_file){ con <- file(reference_sample_set_file, open='r') reference_sample_set <- readLines(con) - cov_table <- read.csv(input_cov_table) - sampname <- unique(cov_table[,"sample_name"]) - sampname <- as.character(sampname) - targets <- cov_table[,c("target_id", "chr", "pos_min", "pos_max")] - targets <- targets[!duplicated(targets[,"target_id"]),] - targets <- targets[with(targets, order(target_id)), ] + Y <- read.csv(input_cov_table) + sampname <- colnames(Y) + targets <- read.delim(input_bed) + rownames(Y) <- 1:nrow(Y) + rownames(targets) <- 1:nrow(targets) finalcall <- matrix(nrow=0, ncol=13) - chrs <- c(1:22, "X", "Y", paste0("chr",c(1:22, "X", "Y"))) - - for(chr in chrs) { - targets_for_chr <- targets[targets[,"chr"] == chr,] - ref <- IRanges(start = targets_for_chr[,"pos_min"], end = targets_for_chr[,"pos_max"]) - if (length(ref) == 0) { # 0 elements for specified chromosome in bed + ref <- IRanges(start = targets[,"st_bp"], end = targets[,"ed_bp"]) + + ################################################### + ### code chunk number 5: gcmapp1 + ################################################### + gcmapp1_result <- gcmapp1(targets[1,'chr'], ref) + gc <- gcmapp1_result$gc + + for (i in 1:length(reference_sample_set)) { + if (reference_sample_set[[i]] == '') { next() } + samples <- unlist(strsplit(reference_sample_set[[i]], ',')) + actual_sample <- samples[1] + reference_samples <- samples[-1] + samples <- sort(samples) + Y_subset <- Y[,samples] ################################################### - ### code chunk number 4: coverageObj1 + ### code chunk number 7: normObj1 ################################################### - Y <- coverageObj1(cov_table, sampname, targets_for_chr, chr)$Y + normObj_result <- normObj1(Y_subset, gc, K = K_from:K_to) + Yhat <- normObj_result$Yhat + AIC <- normObj_result$AIC + BIC <- normObj_result$BIC + RSS <- normObj_result$RSS + K <- normObj_result$K ################################################### - ### code chunk number 5: gcmapp1 + ### code chunk number 11: segment1 ################################################### - gcmapp1_result <- gcmapp1(chr, ref) - gc <- gcmapp1_result$gc - - for (i in 1:length(reference_sample_set)) { - if (reference_sample_set[[i]] == '') { - next() - } - samples <- unlist(strsplit(reference_sample_set[[i]], ',')) - actual_sample <- samples[1] - reference_samples <- samples[-1] - samples <- sort(samples) - Y_subset <- Y[,samples] - - ################################################### - ### code chunk number 7: normObj1 - ################################################### - normObj_result <- normObj1(Y_subset, gc, K = K_from:K_to) - Yhat <- normObj_result$Yhat - AIC <- normObj_result$AIC - BIC <- normObj_result$BIC - RSS <- normObj_result$RSS - K <- normObj_result$K - - ################################################### - ### code chunk number 11: segment1 - ################################################### - finalcallIt <- segment1(Y_subset, Yhat, K[which.max(BIC)], K, samples, - ref, chr, lmax, mode = "integer")$finalcall - finalcallIt <- finalcallIt[finalcallIt[,"sample_name"] == actual_sample,] - if (nrow(finalcall)==0){finalcall <- matrix(nrow=0, ncol=ncol(finalcallIt))} - finalcall <- rbind(finalcall, finalcallIt) - print(finalcall) - } + finalcallIt <- segment1(Y_subset, Yhat, K[which.max(BIC)], K, samples, + ref, targets[1,'chr'], lmax, mode = "integer")$finalcall + finalcallIt <- finalcallIt[finalcallIt[,"sample_name"] == actual_sample,] + if (nrow(finalcall)==0){finalcall <- matrix(nrow=0, ncol=ncol(finalcallIt))} + finalcall <- rbind(finalcall, finalcallIt) + print(finalcall) } finalcall <- unify_calls_format(finalcall)$finalcall write.csv(finalcall, output_calls_file, row.names=F) diff --git a/build.sh b/build.sh index a8668ac..6a3bee1 100755 --- a/build.sh +++ b/build.sh @@ -34,7 +34,7 @@ do diffTs=`echo "$(date +%s) - $(git log -n 1 --pretty=format:%at ${dir})" | bc` if [ $diffTs -lt $MAX_COMMIT_TS_DIFF ]; then cd $dir - if [[ ${image} == "biodatageeks/cnv-opt-target-qc" ]] || [[ ${image} == "biodatageeks/cnv-opt-reference-sample-set-selector" ]] || [[ ${image} == "biodatageeks/cnv-opt-exomedepthcov" ]]; then + if [[ ${image} == "biodatageeks/cnv-opt-reference-sample-set-selector" ]] || [[ ${image} == "biodatageeks/cnv-opt-codexcov" ]]; then echo "Rebuilf of ${image} image forced..." docker build --build-arg CACHE_DATE=$(date +%Y-%m-%d:%H:%M:%S) -t $image:$version . docker build --build-arg CACHE_DATE=$(date +%Y-%m-%d:%H:%M:%S) -t $image:latest . From 100b6f19c6e92fe5c0302e07a559fcf9880fe9b3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Wiktor=20Ku=C5=9Bmirek?= Date: Fri, 30 Mar 2018 16:04:02 +0200 Subject: [PATCH 056/114] first version of new interface to CANOESCOV package --- R/CANOESCOV/R/functions_CANOESCOV.R | 13 ----- R/CANOESCOV/R/run_CANOESCOV.R | 80 ++++++++++++++--------------- R/CODEXCOV/R/run_CODEXCOV.R | 1 - 3 files changed, 38 insertions(+), 56 deletions(-) diff --git a/R/CANOESCOV/R/functions_CANOESCOV.R b/R/CANOESCOV/R/functions_CANOESCOV.R index e909d2e..f3195a7 100644 --- a/R/CANOESCOV/R/functions_CANOESCOV.R +++ b/R/CANOESCOV/R/functions_CANOESCOV.R @@ -1,17 +1,4 @@ -coverageObj1 <- function(cov_table, sampname, targets_for_chr, chr){ - Y <- matrix(data=as.integer(0), nrow = nrow(targets_for_chr), ncol = 0) - for(sample in sampname) { - cov_targets_for_sample <- cov_table[cov_table[,"sample_name"] == sample,] - cov_targets_for_sample <- cov_targets_for_sample[with(cov_targets_for_sample, order(target_id)), ] - Y <- cbind(Y, cov_targets_for_sample[,"read_count"]) - } - colnames(Y) <- sampname - rownames(Y) <- targets_for_chr[,"target_id"] - return(list(Y=Y)) -} - - # from CODEX package getgc <- function(chr, ref) { library(GenomeInfoDb) diff --git a/R/CANOESCOV/R/run_CANOESCOV.R b/R/CANOESCOV/R/run_CANOESCOV.R index f85e619..403ab35 100644 --- a/R/CANOESCOV/R/run_CANOESCOV.R +++ b/R/CANOESCOV/R/run_CANOESCOV.R @@ -1,59 +1,55 @@ library(methods) run_CANOESCOV <- function(input_cov_table, + input_bed, reference_sample_set_file, output_calls_file){ con <- file(reference_sample_set_file, open='r') reference_sample_set <- readLines(con) - cov_table <- read.csv(input_cov_table) - sampname <- unique(cov_table[,"sample_name"]) - targets <- cov_table[,c("target_id", "chr", "pos_min", "pos_max")] - targets <- targets[!duplicated(targets[,"target_id"]),] - targets <- targets[with(targets, order(target_id)), ] + Y <- read.csv(input_cov_table) + targets <- read.delim(input_bed) + rownames(Y) <- 1:nrow(Y) + rownames(targets) <- 1:nrow(targets) calls <- data.frame(matrix(nrow=0, ncol=13)) - chrs <- c(1:22, "X", "Y", paste0("chr",c(1:22, "X", "Y"))) - for(chr in chrs) { - targets_for_chr <- targets[targets[,"chr"] == chr,] - ref <- IRanges(start = targets_for_chr[,"pos_min"], end = targets_for_chr[,"pos_max"]) - if (length(ref) == 0) { # 0 elements for specified chromosome in bed - next() - } - Y <- coverageObj1(cov_table, sampname, targets_for_chr, chr)$Y - Y <- cbind(rep(chr, nrow(Y)), start(ref), end(ref), Y) - target_length <- c() - for (i in 1:nrow(Y)) { - target_length <- c(target_length, width(ref[i])) - } + chr <- targets[1,'chr'] + ref <- IRanges(start = targets[,"st_bp"], end = targets[,"ed_bp"]) + if (length(ref) == 0) { # 0 elements for specified chromosome in bed + next() + } + Y <- cbind(rep(chr, nrow(Y)), start(ref), end(ref), Y) + target_length <- c() + for (i in 1:nrow(Y)) { + target_length <- c(target_length, width(ref[i])) + } - # TODO better transformation - write.table(Y, file=paste('cov_', chr, '.tsv', sep=""), quote=FALSE, sep="\t", col.names = F, row.names = F) - canoes.reads <- read.table(paste('cov_', chr, '.tsv', sep="")) + # TODO better transformation + write.table(Y, file=paste('cov_', chr, '.tsv', sep=""), quote=FALSE, sep="\t", col.names = F, row.names = F) + canoes.reads <- read.table(paste('cov_', chr, '.tsv', sep="")) - gc <- getgc(chr, ref) - target <- seq(1, nrow(Y)) - canoes.reads <- cbind(target, gc, canoes.reads) - sampname <- as.vector(sampname) - names(canoes.reads) <- c("target", "gc", "chromosome", "start", "end", sampname) - colnames(canoes.reads) <- c("target", "gc", "chromosome", "start", "end", sampname) - write.table(as.data.frame(canoes.reads),file="canoes.reads.csv", quote=F, sep=",",row.names=T,col.names=T) - xcnv.list <- vector('list', length(sampname)) - for (i in 1:length(reference_sample_set)) { - if (reference_sample_set[[i]] == '') { - next() - } - samples <- unlist(strsplit(reference_sample_set[[i]], ',')) - actual_sample <- samples[1] - reference_samples <- samples[-1] - xcnv.list[[i]] <- CANOESCOV::CallCNVs(sample.name=actual_sample, - reference.samples=reference_samples, - counts=canoes.reads) + gc <- getgc(chr, ref) + target <- seq(1, nrow(Y)) + canoes.reads <- cbind(target, gc, canoes.reads) + sampname <- as.vector(sampname) + names(canoes.reads) <- c("target", "gc", "chromosome", "start", "end", sampname) + colnames(canoes.reads) <- c("target", "gc", "chromosome", "start", "end", sampname) + write.table(as.data.frame(canoes.reads),file="canoes.reads.csv", quote=F, sep=",",row.names=T,col.names=T) + xcnv.list <- vector('list', length(sampname)) + for (i in 1:length(reference_sample_set)) { + if (reference_sample_set[[i]] == '') { + next() } - xcnvs <- do.call('rbind', xcnv.list) - if (nrow(calls)==0){calls <- matrix(nrow=0, ncol=ncol(xcnvs))} - calls <- rbind(calls, xcnvs) + samples <- unlist(strsplit(reference_sample_set[[i]], ',')) + actual_sample <- samples[1] + reference_samples <- samples[-1] + xcnv.list[[i]] <- CANOESCOV::CallCNVs(sample.name=actual_sample, + reference.samples=reference_samples, + counts=canoes.reads) } + xcnvs <- do.call('rbind', xcnv.list) + if (nrow(calls)==0){calls <- matrix(nrow=0, ncol=ncol(xcnvs))} + calls <- rbind(calls, xcnvs) # unify results format if (nrow(calls) != 0) { diff --git a/R/CODEXCOV/R/run_CODEXCOV.R b/R/CODEXCOV/R/run_CODEXCOV.R index cbb5d0a..0d8fa58 100644 --- a/R/CODEXCOV/R/run_CODEXCOV.R +++ b/R/CODEXCOV/R/run_CODEXCOV.R @@ -17,7 +17,6 @@ run_CODEXCOV <- function(K_from, con <- file(reference_sample_set_file, open='r') reference_sample_set <- readLines(con) Y <- read.csv(input_cov_table) - sampname <- colnames(Y) targets <- read.delim(input_bed) rownames(Y) <- 1:nrow(Y) rownames(targets) <- 1:nrow(targets) From b7ff8056ceabf1af0af9bb25ef164adcb66f8deb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Wiktor=20Ku=C5=9Bmirek?= Date: Fri, 30 Mar 2018 16:18:31 +0200 Subject: [PATCH 057/114] force to rebuild CANOESCOV package --- Docker/cnv-opt-canoescov/Dockerfile | 2 ++ Docker/cnv-opt-exomedepthcov/Dockerfile | 2 -- Docker/cnv-opt-target-qc/Dockerfile | 2 -- build.sh | 2 +- 4 files changed, 3 insertions(+), 5 deletions(-) diff --git a/Docker/cnv-opt-canoescov/Dockerfile b/Docker/cnv-opt-canoescov/Dockerfile index fa7683b..06d4dea 100644 --- a/Docker/cnv-opt-canoescov/Dockerfile +++ b/Docker/cnv-opt-canoescov/Dockerfile @@ -8,4 +8,6 @@ RUN Rscript -e "source('https://bioconductor.org/biocLite.R');biocLite('Rsamtool RUN Rscript -e "source('https://bioconductor.org/biocLite.R');biocLite('GenomeInfoDb')" RUN Rscript -e "source('https://bioconductor.org/biocLite.R');biocLite('S4Vectors')" +RUN pwd + RUN Rscript -e "install.packages('CANOESCOV', repos = 'http://zsibio.ii.pw.edu.pl/nexus/repository/r-all')" diff --git a/Docker/cnv-opt-exomedepthcov/Dockerfile b/Docker/cnv-opt-exomedepthcov/Dockerfile index 9d2d135..28448f8 100644 --- a/Docker/cnv-opt-exomedepthcov/Dockerfile +++ b/Docker/cnv-opt-exomedepthcov/Dockerfile @@ -1,7 +1,5 @@ FROM biodatageeks/cnv-opt-exomedepth MAINTAINER biodatageeks -ARG CACHE_DATE=not_a_date - RUN Rscript -e "install.packages('EXOMEDEPTHCOV', repos = 'http://zsibio.ii.pw.edu.pl/nexus/repository/r-all')" diff --git a/Docker/cnv-opt-target-qc/Dockerfile b/Docker/cnv-opt-target-qc/Dockerfile index e78cc1a..8391b91 100644 --- a/Docker/cnv-opt-target-qc/Dockerfile +++ b/Docker/cnv-opt-target-qc/Dockerfile @@ -1,6 +1,4 @@ FROM biodatageeks/cnv-opt-codex MAINTAINER biodatageeks -ARG CACHE_DATE=not_a_date - RUN Rscript -e "install.packages('TARGET.QC', repos = 'http://zsibio.ii.pw.edu.pl/nexus/repository/r-all')" diff --git a/build.sh b/build.sh index 6a3bee1..797e83c 100755 --- a/build.sh +++ b/build.sh @@ -35,7 +35,7 @@ do if [ $diffTs -lt $MAX_COMMIT_TS_DIFF ]; then cd $dir if [[ ${image} == "biodatageeks/cnv-opt-reference-sample-set-selector" ]] || [[ ${image} == "biodatageeks/cnv-opt-codexcov" ]]; then - echo "Rebuilf of ${image} image forced..." + echo "Rebuild of ${image} image forced..." docker build --build-arg CACHE_DATE=$(date +%Y-%m-%d:%H:%M:%S) -t $image:$version . docker build --build-arg CACHE_DATE=$(date +%Y-%m-%d:%H:%M:%S) -t $image:latest . else From f6c966f3c47d61703e2785405687fc94e83d022e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Wiktor=20Ku=C5=9Bmirek?= Date: Fri, 30 Mar 2018 17:51:08 +0200 Subject: [PATCH 058/114] bugfix --- R/CANOESCOV/R/run_CANOESCOV.R | 1 + 1 file changed, 1 insertion(+) diff --git a/R/CANOESCOV/R/run_CANOESCOV.R b/R/CANOESCOV/R/run_CANOESCOV.R index 403ab35..62e2cfd 100644 --- a/R/CANOESCOV/R/run_CANOESCOV.R +++ b/R/CANOESCOV/R/run_CANOESCOV.R @@ -8,6 +8,7 @@ run_CANOESCOV <- function(input_cov_table, con <- file(reference_sample_set_file, open='r') reference_sample_set <- readLines(con) Y <- read.csv(input_cov_table) + sampname <- colnames(Y) targets <- read.delim(input_bed) rownames(Y) <- 1:nrow(Y) rownames(targets) <- 1:nrow(targets) From e4b6f80b1a248cdc424103e23175c73771f11911 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Wiktor=20Ku=C5=9Bmirek?= Date: Fri, 30 Mar 2018 17:53:49 +0200 Subject: [PATCH 059/114] forcing rebuilding only specified dockers - to speed up dev process --- build.sh | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/build.sh b/build.sh index 797e83c..bcf6c28 100755 --- a/build.sh +++ b/build.sh @@ -36,11 +36,11 @@ do cd $dir if [[ ${image} == "biodatageeks/cnv-opt-reference-sample-set-selector" ]] || [[ ${image} == "biodatageeks/cnv-opt-codexcov" ]]; then echo "Rebuild of ${image} image forced..." - docker build --build-arg CACHE_DATE=$(date +%Y-%m-%d:%H:%M:%S) -t $image:$version . - docker build --build-arg CACHE_DATE=$(date +%Y-%m-%d:%H:%M:%S) -t $image:latest . + docker build --no-cache --build-arg CACHE_DATE=$(date +%Y-%m-%d:%H:%M:%S) -t $image:$version . + docker build --no-cache --build-arg CACHE_DATE=$(date +%Y-%m-%d:%H:%M:%S) -t $image:latest . else - docker build --no-cache -t $image:$version . - docker build --no-cache -t $image:latest . + docker build -t $image:$version . + docker build -t $image:latest . fi if [[ ${BUILD_MODE} != "local" ]]; then docker push docker.io/$image:latest From 7c0447ff46984d71af2e8d2a8b7284a5b6edc828 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Wiktor=20Ku=C5=9Bmirek?= Date: Fri, 30 Mar 2018 22:56:44 +0200 Subject: [PATCH 060/114] coverage table as matrix --- .../R/run_REFERENCE.SAMPLE.SET.SELECTOR.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/REFERENCE.SAMPLE.SET.SELECTOR/R/run_REFERENCE.SAMPLE.SET.SELECTOR.R b/R/REFERENCE.SAMPLE.SET.SELECTOR/R/run_REFERENCE.SAMPLE.SET.SELECTOR.R index 37d63e5..bed1ec8 100644 --- a/R/REFERENCE.SAMPLE.SET.SELECTOR/R/run_REFERENCE.SAMPLE.SET.SELECTOR.R +++ b/R/REFERENCE.SAMPLE.SET.SELECTOR/R/run_REFERENCE.SAMPLE.SET.SELECTOR.R @@ -4,7 +4,7 @@ run_REFERENCE.SAMPLE.SET.SELECTOR <- function(select_method, input_bed, output_reference_file){ - Y <- read.csv(input_cov_table) + Y <- data.matrix(read.csv(input_cov_table)) sampname <- colnames(Y) targets <- read.delim(input_bed) target_length <- targets[,"st_bp"] - targets[,"ed_bp"] From 820a7512736e7d4b76a8e90f516ad85d7a023a1b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Wiktor=20Ku=C5=9Bmirek?= Date: Fri, 30 Mar 2018 23:11:17 +0200 Subject: [PATCH 061/114] code clean up --- Docker/cnv-opt-canoescov/Dockerfile | 2 -- build.sh | 2 +- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/Docker/cnv-opt-canoescov/Dockerfile b/Docker/cnv-opt-canoescov/Dockerfile index 06d4dea..fa7683b 100644 --- a/Docker/cnv-opt-canoescov/Dockerfile +++ b/Docker/cnv-opt-canoescov/Dockerfile @@ -8,6 +8,4 @@ RUN Rscript -e "source('https://bioconductor.org/biocLite.R');biocLite('Rsamtool RUN Rscript -e "source('https://bioconductor.org/biocLite.R');biocLite('GenomeInfoDb')" RUN Rscript -e "source('https://bioconductor.org/biocLite.R');biocLite('S4Vectors')" -RUN pwd - RUN Rscript -e "install.packages('CANOESCOV', repos = 'http://zsibio.ii.pw.edu.pl/nexus/repository/r-all')" diff --git a/build.sh b/build.sh index bcf6c28..fc80400 100755 --- a/build.sh +++ b/build.sh @@ -34,7 +34,7 @@ do diffTs=`echo "$(date +%s) - $(git log -n 1 --pretty=format:%at ${dir})" | bc` if [ $diffTs -lt $MAX_COMMIT_TS_DIFF ]; then cd $dir - if [[ ${image} == "biodatageeks/cnv-opt-reference-sample-set-selector" ]] || [[ ${image} == "biodatageeks/cnv-opt-codexcov" ]]; then + if [[ ${image} == "biodatageeks/cnv-opt-codexcov" ]]; then echo "Rebuild of ${image} image forced..." docker build --no-cache --build-arg CACHE_DATE=$(date +%Y-%m-%d:%H:%M:%S) -t $image:$version . docker build --no-cache --build-arg CACHE_DATE=$(date +%Y-%m-%d:%H:%M:%S) -t $image:latest . From f2646eaa1e7ce3d090759f638cdb9811b115983b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Wiktor=20Ku=C5=9Bmirek?= Date: Sun, 1 Apr 2018 14:46:47 +0200 Subject: [PATCH 062/114] bugfix --- R/CODEXCOV/R/run_CODEXCOV.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/CODEXCOV/R/run_CODEXCOV.R b/R/CODEXCOV/R/run_CODEXCOV.R index 0d8fa58..e5565ae 100644 --- a/R/CODEXCOV/R/run_CODEXCOV.R +++ b/R/CODEXCOV/R/run_CODEXCOV.R @@ -38,7 +38,7 @@ run_CODEXCOV <- function(K_from, actual_sample <- samples[1] reference_samples <- samples[-1] samples <- sort(samples) - Y_subset <- Y[,samples] + Y_subset <- as.matrix(Y[,samples]) ################################################### ### code chunk number 7: normObj1 From f89d041567459035a3f07739348916251fe46326 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Wiktor=20Ku=C5=9Bmirek?= Date: Sun, 1 Apr 2018 14:51:01 +0200 Subject: [PATCH 063/114] force to reload CODEXCOV package --- Docker/cnv-opt-codexcov/Dockerfile | 2 ++ build.sh | 12 ++++++------ 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/Docker/cnv-opt-codexcov/Dockerfile b/Docker/cnv-opt-codexcov/Dockerfile index f06904c..93989f6 100644 --- a/Docker/cnv-opt-codexcov/Dockerfile +++ b/Docker/cnv-opt-codexcov/Dockerfile @@ -3,4 +3,6 @@ MAINTAINER biodatageeks ARG CACHE_DATE=not_a_date +RUN pwd + RUN Rscript -e "install.packages('CODEXCOV', repos = 'http://zsibio.ii.pw.edu.pl/nexus/repository/r-all')" diff --git a/build.sh b/build.sh index fc80400..d9b6ccf 100755 --- a/build.sh +++ b/build.sh @@ -34,14 +34,14 @@ do diffTs=`echo "$(date +%s) - $(git log -n 1 --pretty=format:%at ${dir})" | bc` if [ $diffTs -lt $MAX_COMMIT_TS_DIFF ]; then cd $dir - if [[ ${image} == "biodatageeks/cnv-opt-codexcov" ]]; then - echo "Rebuild of ${image} image forced..." - docker build --no-cache --build-arg CACHE_DATE=$(date +%Y-%m-%d:%H:%M:%S) -t $image:$version . - docker build --no-cache --build-arg CACHE_DATE=$(date +%Y-%m-%d:%H:%M:%S) -t $image:latest . - else + #if [[ ${image} == "biodatageeks/cnv-opt-codexcov" ]]; then + # echo "Rebuild of ${image} image forced..." + # docker build --no-cache --build-arg CACHE_DATE=$(date +%Y-%m-%d:%H:%M:%S) -t $image:$version . + # docker build --no-cache --build-arg CACHE_DATE=$(date +%Y-%m-%d:%H:%M:%S) -t $image:latest . + #else docker build -t $image:$version . docker build -t $image:latest . - fi + #fi if [[ ${BUILD_MODE} != "local" ]]; then docker push docker.io/$image:latest docker push docker.io/$image:$version From 7b9c82dd177cb63fc451feaa5557cc0613e75365 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Wiktor=20Ku=C5=9Bmirek?= Date: Sun, 1 Apr 2018 18:36:56 +0200 Subject: [PATCH 064/114] dags for codex, canoes and exomedept callers --- Docker/cnv-opt-codexcov/Dockerfile | 4 ---- airflow/dags/canoes.py | 15 +++++++++++---- airflow/dags/codex.py | 12 +++++++----- airflow/dags/exomedepth.py | 12 +++++++----- 4 files changed, 25 insertions(+), 18 deletions(-) diff --git a/Docker/cnv-opt-codexcov/Dockerfile b/Docker/cnv-opt-codexcov/Dockerfile index 93989f6..258bbf8 100644 --- a/Docker/cnv-opt-codexcov/Dockerfile +++ b/Docker/cnv-opt-codexcov/Dockerfile @@ -1,8 +1,4 @@ FROM biodatageeks/cnv-opt-codex MAINTAINER biodatageeks -ARG CACHE_DATE=not_a_date - -RUN pwd - RUN Rscript -e "install.packages('CODEXCOV', repos = 'http://zsibio.ii.pw.edu.pl/nexus/repository/r-all')" diff --git a/airflow/dags/canoes.py b/airflow/dags/canoes.py index 7c74842..fe08612 100755 --- a/airflow/dags/canoes.py +++ b/airflow/dags/canoes.py @@ -28,19 +28,26 @@ length_thresh_to = '2000' gc_thresh_from = '20' gc_thresh_to = '80' -raw_cov_table = 'input_cov_table.csv' -qc_cov_table = 'output_cov_table.csv' +raw_cov_table = 'raw_cov_table.csv' +qc_cov_table = 'qc_cov_table.csv' +raw_bed = 'raw_bed.bed' +qc_bed = 'qc_bed.bed' ### select reference sample set parameters select_method = 'exomedepth' # "canoes", "codex" or "exomedepth" num_refs = '30' reference_sample_set_file = 'reference_sample_set.csv' +### canoes parameters +output_calls_file = 'calls.csv' + run_canoes_caller_cmd= " \ docker pull biodatageeks/cnv-opt-target-qc; \ -docker run --rm -v /tmp:/tmp -w=\"/tmp\" biodatageeks/cnv-opt-target-qc Rscript -e \"library(\'TARGET.QC\');run_TARGET.QC(" + mapp_thresh + "," + cov_thresh_from + "," + cov_thresh_to + "," + length_thresh_from + "," + length_thresh_to + "," + gc_thresh_from + "," + gc_thresh_to + ",'" + raw_cov_table + "','" + qc_cov_table + "')\"; \ +docker run --rm -v /tmp:/tmp -w=\"/tmp\" biodatageeks/cnv-opt-target-qc Rscript -e \"library(\'TARGET.QC\');run_TARGET.QC(" + mapp_thresh + "," + cov_thresh_from + "," + cov_thresh_to + "," + length_thresh_from + "," + length_thresh_to + "," + gc_thresh_from + "," + gc_thresh_to + ",'" + raw_cov_table + "','" + qc_cov_table + "','" + raw_bed + "','" + qc_bed + "')\"; \ docker pull biodatageeks/cnv-opt-reference-sample-set-selector; \ -docker run --rm -v /tmp:/tmp -w=\"/tmp\" biodatageeks/cnv-opt-reference-sample-set-selector Rscript -e \"library(\'REFERENCE.SAMPLE.SET.SELECTOR\');run_REFERENCE.SAMPLE.SET.SELECTOR('" + select_method + "'," + num_refs + ",'" + qc_cov_table + "','" + reference_sample_set_file + "')\"; \ +docker run --rm -v /tmp:/tmp -w=\"/tmp\" biodatageeks/cnv-opt-reference-sample-set-selector Rscript -e \"library(\'REFERENCE.SAMPLE.SET.SELECTOR\');run_REFERENCE.SAMPLE.SET.SELECTOR('" + select_method + "'," + num_refs + ",'" + qc_cov_table + "','" + qc_bed + "','" + reference_sample_set_file + "')\"; \ +docker pull biodatageeks/cnv-opt-canoescov; \ +docker run --rm -v /tmp:/tmp -w=\"/tmp\" biodatageeks/cnv-opt-canoescov Rscript -e \"library(\'CANOESCOV\');run_CANOESCOV('" + qc_cov_table + "','" + qc_bed + "','" + reference_sample_set_file + "','" + output_calls_file + "')\"; \ " run_canoes_caller_task= BashOperator ( diff --git a/airflow/dags/codex.py b/airflow/dags/codex.py index 1da827f..210b3bb 100755 --- a/airflow/dags/codex.py +++ b/airflow/dags/codex.py @@ -28,8 +28,10 @@ length_thresh_to = '2000' gc_thresh_from = '20' gc_thresh_to = '80' -raw_cov_table = 'input_cov_table.csv' -qc_cov_table = 'output_cov_table.csv' +raw_cov_table = 'raw_cov_table.csv' +qc_cov_table = 'qc_cov_table.csv' +raw_bed = 'raw_bed.bed' +qc_bed = 'qc_bed.bed' ### select reference sample set parameters select_method = 'exomedepth' # "canoes", "codex" or "exomedepth" @@ -44,11 +46,11 @@ run_codex_caller_cmd= " \ docker pull biodatageeks/cnv-opt-target-qc; \ -docker run --rm -v /tmp:/tmp -w=\"/tmp\" biodatageeks/cnv-opt-target-qc Rscript -e \"library(\'TARGET.QC\');run_TARGET.QC(" + mapp_thresh + "," + cov_thresh_from + "," + cov_thresh_to + "," + length_thresh_from + "," + length_thresh_to + "," + gc_thresh_from + "," + gc_thresh_to + ",'" + raw_cov_table + "','" + qc_cov_table + "')\"; \ +docker run --rm -v /tmp:/tmp -w=\"/tmp\" biodatageeks/cnv-opt-target-qc Rscript -e \"library(\'TARGET.QC\');run_TARGET.QC(" + mapp_thresh + "," + cov_thresh_from + "," + cov_thresh_to + "," + length_thresh_from + "," + length_thresh_to + "," + gc_thresh_from + "," + gc_thresh_to + ",'" + raw_cov_table + "','" + qc_cov_table + "','" + raw_bed + "','" + qc_bed + "')\"; \ docker pull biodatageeks/cnv-opt-reference-sample-set-selector; \ -docker run --rm -v /tmp:/tmp -w=\"/tmp\" biodatageeks/cnv-opt-reference-sample-set-selector Rscript -e \"library(\'REFERENCE.SAMPLE.SET.SELECTOR\');run_REFERENCE.SAMPLE.SET.SELECTOR('" + select_method + "'," + num_refs + ",'" + qc_cov_table + "','" + reference_sample_set_file + "')\"; \ +docker run --rm -v /tmp:/tmp -w=\"/tmp\" biodatageeks/cnv-opt-reference-sample-set-selector Rscript -e \"library(\'REFERENCE.SAMPLE.SET.SELECTOR\');run_REFERENCE.SAMPLE.SET.SELECTOR('" + select_method + "'," + num_refs + ",'" + qc_cov_table + "','" + qc_bed + "','" + reference_sample_set_file + "')\"; \ docker pull biodatageeks/cnv-opt-codexcov; \ -docker run --rm -v /tmp:/tmp -w=\"/tmp\" biodatageeks/cnv-opt-codexcov Rscript -e \"library(\'CODEXCOV\');run_CODEXCOV(" + k_from + "," + k_to + "," + lmax + ",'" + qc_cov_table + "','" + reference_sample_set_file + "','" + output_calls_file + "')\"; \ +docker run --rm -v /tmp:/tmp -w=\"/tmp\" biodatageeks/cnv-opt-codexcov Rscript -e \"library(\'CODEXCOV\');run_CODEXCOV(" + k_from + "," + k_to + "," + lmax + ",'" + qc_cov_table + "','" + qc_bed + "','" + reference_sample_set_file + "','" + output_calls_file + "')\"; \ " run_codex_caller_task= BashOperator ( diff --git a/airflow/dags/exomedepth.py b/airflow/dags/exomedepth.py index e8629be..7952c2b 100755 --- a/airflow/dags/exomedepth.py +++ b/airflow/dags/exomedepth.py @@ -28,8 +28,10 @@ length_thresh_to = '2000' gc_thresh_from = '20' gc_thresh_to = '80' -raw_cov_table = 'input_cov_table.csv' -qc_cov_table = 'output_cov_table.csv' +raw_cov_table = 'raw_cov_table.csv' +qc_cov_table = 'qc_cov_table.csv' +raw_bed = 'raw_bed.bed' +qc_bed = 'qc_bed.bed' ### select reference sample set parameters select_method = 'exomedepth' # "canoes", "codex" or "exomedepth" @@ -41,11 +43,11 @@ run_exomedepth_caller_cmd= " \ docker pull biodatageeks/cnv-opt-target-qc; \ -docker run --rm -v /tmp:/tmp -w=\"/tmp\" biodatageeks/cnv-opt-target-qc Rscript -e \"library(\'TARGET.QC\');run_TARGET.QC(" + mapp_thresh + "," + cov_thresh_from + "," + cov_thresh_to + "," + length_thresh_from + "," + length_thresh_to + "," + gc_thresh_from + "," + gc_thresh_to + ",'" + raw_cov_table + "','" + qc_cov_table + "')\"; \ +docker run --rm -v /tmp:/tmp -w=\"/tmp\" biodatageeks/cnv-opt-target-qc Rscript -e \"library(\'TARGET.QC\');run_TARGET.QC(" + mapp_thresh + "," + cov_thresh_from + "," + cov_thresh_to + "," + length_thresh_from + "," + length_thresh_to + "," + gc_thresh_from + "," + gc_thresh_to + ",'" + raw_cov_table + "','" + qc_cov_table + "','" + raw_bed + "','" + qc_bed + "')\"; \ docker pull biodatageeks/cnv-opt-reference-sample-set-selector; \ -docker run --rm -v /tmp:/tmp -w=\"/tmp\" biodatageeks/cnv-opt-reference-sample-set-selector Rscript -e \"library(\'REFERENCE.SAMPLE.SET.SELECTOR\');run_REFERENCE.SAMPLE.SET.SELECTOR('" + select_method + "'," + num_refs + ",'" + qc_cov_table + "','" + reference_sample_set_file + "')\"; \ +docker run --rm -v /tmp:/tmp -w=\"/tmp\" biodatageeks/cnv-opt-reference-sample-set-selector Rscript -e \"library(\'REFERENCE.SAMPLE.SET.SELECTOR\');run_REFERENCE.SAMPLE.SET.SELECTOR('" + select_method + "'," + num_refs + ",'" + qc_cov_table + "','" + qc_bed + "','" + reference_sample_set_file + "')\"; \ docker pull biodatageeks/cnv-opt-exomedepthcov; \ -docker run --rm -v /tmp:/tmp -w=\"/tmp\" biodatageeks/cnv-opt-exomedepthcov Rscript -e \"library(\'EXOMEDEPTHCOV\');run_EXOMEDEPTHCOV('" + qc_cov_table + "','" + reference_sample_set_file + "','" + output_calls_file + "')\"; \ +docker run --rm -v /tmp:/tmp -w=\"/tmp\" biodatageeks/cnv-opt-exomedepthcov Rscript -e \"library(\'EXOMEDEPTHCOV\');run_EXOMEDEPTHCOV('" + qc_cov_table + "','" + qc_bed + "','" + reference_sample_set_file + "','" + output_calls_file + "')\"; \ " run_exomedepth_caller_task= BashOperator ( From ab5bd6d7fbeb5f1586d47e52e5637af9d0822707 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Wiktor=20Ku=C5=9Bmirek?= Date: Fri, 6 Apr 2018 19:45:36 +0200 Subject: [PATCH 065/114] init version of EXOMECOPYCOV package --- Docker/cnv-opt-exomecopy/Dockerfile | 14 +++ Docker/cnv-opt-exomecopycov/Dockerfile | 5 + Jenkinsfile | 1 + R/EXOMECOPYCOV/DESCRIPTION | 22 ++++ R/EXOMECOPYCOV/NAMESPACE | 2 + R/EXOMECOPYCOV/R/functions_EXOMECOPYCOV.R | 20 ++++ R/EXOMECOPYCOV/R/run_EXOMECOPYCOV.R | 134 ++++++++++++++++++++++ 7 files changed, 198 insertions(+) create mode 100644 Docker/cnv-opt-exomecopy/Dockerfile create mode 100644 Docker/cnv-opt-exomecopycov/Dockerfile create mode 100644 R/EXOMECOPYCOV/DESCRIPTION create mode 100644 R/EXOMECOPYCOV/NAMESPACE create mode 100644 R/EXOMECOPYCOV/R/functions_EXOMECOPYCOV.R create mode 100644 R/EXOMECOPYCOV/R/run_EXOMECOPYCOV.R diff --git a/Docker/cnv-opt-exomecopy/Dockerfile b/Docker/cnv-opt-exomecopy/Dockerfile new file mode 100644 index 0000000..777cc0d --- /dev/null +++ b/Docker/cnv-opt-exomecopy/Dockerfile @@ -0,0 +1,14 @@ +FROM ubuntu:xenial +MAINTAINER biodatageeks + +RUN apt-get update +RUN apt-get install -y software-properties-common +RUN apt-key adv --keyserver keyserver.ubuntu.com --recv-keys E298A3A825C0D65DFD57CBB651716619E084DAB9 +RUN add-apt-repository 'deb [arch=amd64,i386] https://cran.rstudio.com/bin/linux/ubuntu xenial/' +RUN apt-get install -y apt-transport-https + +RUN apt-get update && \ + apt-get upgrade -y && \ + apt-get install -y r-base libssl-dev libssh2-1-dev libxml2-dev libcurl4-openssl-dev libpq-dev + +RUN Rscript -e "source('https://bioconductor.org/biocLite.R');biocLite('IRanges');biocLite('exomeCopy')" diff --git a/Docker/cnv-opt-exomecopycov/Dockerfile b/Docker/cnv-opt-exomecopycov/Dockerfile new file mode 100644 index 0000000..11b58f5 --- /dev/null +++ b/Docker/cnv-opt-exomecopycov/Dockerfile @@ -0,0 +1,5 @@ +FROM biodatageeks/cnv-opt-exomecopycov +MAINTAINER biodatageeks + +RUN Rscript -e "install.packages('EXOMECOPYCOV', repos = 'http://zsibio.ii.pw.edu.pl/nexus/repository/r-all')" + diff --git a/Jenkinsfile b/Jenkinsfile index fc11b6f..70682e1 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -21,6 +21,7 @@ pipeline { sh "cd R && R CMD build REFERENCE.SAMPLE.SET.SELECTOR/ && curl -v --user ${NEXUS_USER}:${NEXUS_PASS} --upload-file REFERENCE.SAMPLE.SET.SELECTOR_0.0.1.tar.gz http://zsibio.ii.pw.edu.pl/nexus/repository/r-zsibio/src/contrib/REFERENCE.SAMPLE.SET.SELECTOR_0.0.1.tar.gz" sh "cd R && R CMD build CODEXCOV/ && curl -v --user ${NEXUS_USER}:${NEXUS_PASS} --upload-file CODEXCOV_0.0.1.tar.gz http://zsibio.ii.pw.edu.pl/nexus/repository/r-zsibio/src/contrib/CODEXCOV_0.0.1.tar.gz" sh "cd R && R CMD build EXOMEDEPTHCOV/ && curl -v --user ${NEXUS_USER}:${NEXUS_PASS} --upload-file EXOMEDEPTHCOV_0.0.1.tar.gz http://zsibio.ii.pw.edu.pl/nexus/repository/r-zsibio/src/contrib/EXOMEDEPTHCOV_0.0.1.tar.gz" + sh "cd R && R CMD build EXOMECOPYCOV/ && curl -v --user ${NEXUS_USER}:${NEXUS_PASS} --upload-file EXOMECOPYCOV_0.0.1.tar.gz http://zsibio.ii.pw.edu.pl/nexus/repository/r-zsibio/src/contrib/EXOMECOPYCOV_0.0.1.tar.gz" sh "cd R && R CMD build CANOESCOV/ && curl -v --user ${NEXUS_USER}:${NEXUS_PASS} --upload-file CANOESCOV_0.0.1.tar.gz http://zsibio.ii.pw.edu.pl/nexus/repository/r-zsibio/src/contrib/CANOESCOV_0.0.1.tar.gz" sh "cd R && R CMD build CANOES/ && curl -v --user ${NEXUS_USER}:${NEXUS_PASS} --upload-file CANOES_0.0.1.tar.gz http://zsibio.ii.pw.edu.pl/nexus/repository/r-zsibio/src/contrib/CANOES_0.0.1.tar.gz" sh "cd R && R CMD build CNVCALLER.RUNNER/ && curl -v --user ${NEXUS_USER}:${NEXUS_PASS} --upload-file CNVCALLER.RUNNER_0.0.1.tar.gz http://zsibio.ii.pw.edu.pl/nexus/repository/r-zsibio/src/contrib/CNVCALLER.RUNNER_0.0.1.tar.gz" diff --git a/R/EXOMECOPYCOV/DESCRIPTION b/R/EXOMECOPYCOV/DESCRIPTION new file mode 100644 index 0000000..c33386d --- /dev/null +++ b/R/EXOMECOPYCOV/DESCRIPTION @@ -0,0 +1,22 @@ +Package: EXOMECOPYCOV +Title: EXOMECOPY Package With Interface To External Coverage File +Version: 0.0.1 +Authors@R: c( + person("Tomasz", "Gambin", email = "tgambin@gmail.com", role = c("aut", "cre")), + person("Marek", "Wiewiórka", email = "marek.wiewiorka@gmail.com", role = c("aut")), + person("Wiktor", "Kuśmirek", email = "kusmirekwiktor@gmail.com", role = c("aut"))) +Description: An extended implementation of the exomeCopy package in R. It extends + original implementation by using external coverage file, which should + speed up calculations for running application with multiple sets of input + parameters. +Depends: + R (>= 3.2.3), + devtools (>= 1.13.2), + DBI (== 0.8), + optparse (== 1.4.4), + IRanges (>= 2.0.0), + exomeCopy (== 1.22) +License: GPL-3 +Encoding: UTF-8 +LazyData: true +RoxygenNote: 6.0.1.9000 diff --git a/R/EXOMECOPYCOV/NAMESPACE b/R/EXOMECOPYCOV/NAMESPACE new file mode 100644 index 0000000..884a631 --- /dev/null +++ b/R/EXOMECOPYCOV/NAMESPACE @@ -0,0 +1,2 @@ +# Generated by roxygen2: fake comment so roxygen2 overwrites silently. +exportPattern("^[^\\.]") diff --git a/R/EXOMECOPYCOV/R/functions_EXOMECOPYCOV.R b/R/EXOMECOPYCOV/R/functions_EXOMECOPYCOV.R new file mode 100644 index 0000000..e1799d8 --- /dev/null +++ b/R/EXOMECOPYCOV/R/functions_EXOMECOPYCOV.R @@ -0,0 +1,20 @@ + +# from CODEX package +getgc <- function(chr, ref) { + library(GenomeInfoDb) + library(BSgenome.Hsapiens.UCSC.hg19) + if (chr == "X" | chr == "x" | chr == "chrX" | chr == "chrx") { + chrtemp <- 23 + } else if (chr == "Y" | chr == "y" | chr == "chrY" | chr == "chry") { + chrtemp <- 24 + } else { + chrtemp <- as.numeric(mapSeqlevels(as.character(chr), "NCBI")[1]) + } + if (length(chrtemp) == 0) + message("Chromosome cannot be found in NCBI Homo sapiens database!") + chrm <- unmasked(Hsapiens[[chrtemp]]) + seqs <- Views(chrm, ref) + af <- alphabetFrequency(seqs, baseOnly = TRUE, as.prob = TRUE) + gc <- round((af[, "G"] + af[, "C"]) * 100,2) + gc +} diff --git a/R/EXOMECOPYCOV/R/run_EXOMECOPYCOV.R b/R/EXOMECOPYCOV/R/run_EXOMECOPYCOV.R new file mode 100644 index 0000000..a361fc3 --- /dev/null +++ b/R/EXOMECOPYCOV/R/run_EXOMECOPYCOV.R @@ -0,0 +1,134 @@ +library(methods) +library(exomeCopy) + +run_EXOMECOPYCOV <- function(input_cov_table, + input_bed, + reference_sample_set_file, + output_calls_file){ + + con <- file(reference_sample_set_file, open='r') + reference_sample_set <- readLines(con) + Y <- read.csv(input_cov_table) + sampname <- colnames(Y) + targets <- read.delim(input_bed) + rownames(Y) <- 1:nrow(Y) + rownames(targets) <- 1:nrow(targets) + chr <- targets[1,'chr'] + ref <- IRanges(start = targets[,"st_bp"], end = targets[,"ed_bp"]) + if (length(ref) == 0) { # 0 elements for specified chromosome in bed + next() + } + sample.names <- sampname[,1] + target <- GRanges(seqname = chr, IRanges(start = start(ref) + 1, end = end(ref))) + gc <- getgc(chr, ref) + + rdata <- RangedData(IRanges(start=start(ref), end=end(ref)), space=rep(chr,nrow(Y)), universe="hg19", gc=gc, gc.sq=gc^2) + + for(sample.name in sample.names) { + rdata[[sample.name]] <- Y[,sample.name] + } + + rdata[["bg"]] <- generateBackground(sample.names, rdata, median) + rdata[["log.bg"]] <- log(rdata$bg + .1) + rdata[["bg.sd"]] <- generateBackground(sample.names, rdata, sd) + + fit.list <- lapply(sample.names, function(sample.name) { + lapply(seqlevels(target), function(seq.name) { + exomeCopy(rdata, sample.name, X.names = c("log.bg", "gc", "gc.sq"), S = 0:4, d = 2) + }) + }) + compiled.segments <- compileCopyCountSegments(fit.list) + print(compiled.segments) + + + + + + + + + + + + + + + + #calls <- data.frame(matrix(nrow=0, ncol=13)) + #chr <- targets[1,'chr'] + #ref <- IRanges(start = targets[,"st_bp"], end = targets[,"ed_bp"]) + #if (length(ref) == 0) { # 0 elements for specified chromosome in bed + # next() + #} + #Y <- cbind(rep(chr, nrow(Y)), start(ref), end(ref), Y) + #target_length <- c() + #for (i in 1:nrow(Y)) { + # target_length <- c(target_length, width(ref[i])) + #} + + # TODO better transformation + #write.table(Y, file=paste('cov_', chr, '.tsv', sep=""), quote=FALSE, sep="\t", col.names = F, row.names = F) + #canoes.reads <- read.table(paste('cov_', chr, '.tsv', sep="")) + + #gc <- getgc(chr, ref) + #target <- seq(1, nrow(Y)) + #canoes.reads <- cbind(target, gc, canoes.reads) + #sampname <- as.vector(sampname) + #names(canoes.reads) <- c("target", "gc", "chromosome", "start", "end", sampname) + #colnames(canoes.reads) <- c("target", "gc", "chromosome", "start", "end", sampname) + #write.table(as.data.frame(canoes.reads),file="canoes.reads.csv", quote=F, sep=",",row.names=T,col.names=T) + #xcnv.list <- vector('list', length(sampname)) + #for (i in 1:length(reference_sample_set)) { + # if (reference_sample_set[[i]] == '') { + # next() + # } + # samples <- unlist(strsplit(reference_sample_set[[i]], ',')) + # actual_sample <- samples[1] + # reference_samples <- samples[-1] + # xcnv.list[[i]] <- CANOESCOV::CallCNVs(sample.name=actual_sample, + # reference.samples=reference_samples, + # counts=canoes.reads) + #} + #xcnvs <- do.call('rbind', xcnv.list) + #if (nrow(calls)==0){calls <- matrix(nrow=0, ncol=ncol(xcnvs))} + #calls <- rbind(calls, xcnvs) + + # unify results format + #if (nrow(calls) != 0) { + # calls[colnames(calls) == 'CNV'] <- as.character(unlist(calls[colnames(calls) == 'CNV'])) + # calls[calls == 'DEL'] <- 'del' + # calls[calls == 'DUP'] <- 'dup' + #} + #colnames(calls)[colnames(calls) == 'SAMPLE'] <- 'sample_name' + #targets <- as.vector(calls[colnames(calls) == 'TARGETS']) + #targets <- as.character(unlist(targets)) + #splitted_targets <- do.call(rbind, strsplit(targets, '..', fixed = TRUE)) + #calls <- cbind(calls, splitted_targets) + #colnames(calls)[colnames(calls) == '1'] <- 'st_exon' + #colnames(calls)[colnames(calls) == '2'] <- 'ed_exon' + #intervals <- as.vector(calls[colnames(calls) == 'INTERVAL']) + #intervals <- as.character(unlist(intervals)) + #splitted_intervals <- do.call(rbind, strsplit(intervals, c(':'), fixed = TRUE)) + #intervals <- as.vector(splitted_intervals[,2]) + #intervals <- as.character(unlist(intervals)) + #splitted_intervals <- do.call(rbind, strsplit(intervals, c('-'), fixed = TRUE)) + #calls <- cbind(calls, splitted_intervals) + #colnames(calls)[colnames(calls) == '1'] <- 'st_bp' + #colnames(calls)[colnames(calls) == '2'] <- 'ed_bp' + #colnames(calls)[colnames(calls) == 'CNV'] <- 'cnv' + #calls <- calls[,-which(names(calls) %in% c('KB', 'MID_BP', 'NUM_TARG', 'Q_SOME', 'TARGETS', 'INTERVAL'))] + #colnames(calls)[colnames(calls) == 'CHR'] <- 'chr' + #colnames(calls)[colnames(calls) == 'MLCN'] <- 'copy_no' + #calls[colnames(calls) == 'sample_name'] <- as.character(unlist(calls[colnames(calls) == 'sample_name'])) + #calls[colnames(calls) == 'st_bp'] <- as.character(unlist(calls[colnames(calls) == 'st_bp'])) + #calls[colnames(calls) == 'ed_bp'] <- as.character(unlist(calls[colnames(calls) == 'ed_bp'])) + #calls[colnames(calls) == 'st_exon'] <- as.character(unlist(calls[colnames(calls) == 'st_exon'])) + #calls[colnames(calls) == 'ed_exon'] <- as.character(unlist(calls[colnames(calls) == 'ed_exon'])) + #write.csv(calls, output_calls_file, row.names=F) +} + +# SAMPLE CNV INTERVAL KB CHR MID_BP TARGETS NUM_TARG MLCN Q_SOME +#1 S2 DEL 22:25713988-25756059 42.071 22 25735024 1132..1137 6 1 99 +#2 S3 DEL 22:24373138-24384231 11.093 22 24378684 936..942 7 0 77 + + From d7c1a6a9b07b2945b4075dfa85c6b28b747f83f5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Wiktor=20Ku=C5=9Bmirek?= Date: Fri, 6 Apr 2018 19:57:37 +0200 Subject: [PATCH 066/114] bugfix in Dockerfile --- Docker/cnv-opt-exomecopycov/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Docker/cnv-opt-exomecopycov/Dockerfile b/Docker/cnv-opt-exomecopycov/Dockerfile index 11b58f5..d810948 100644 --- a/Docker/cnv-opt-exomecopycov/Dockerfile +++ b/Docker/cnv-opt-exomecopycov/Dockerfile @@ -1,4 +1,4 @@ -FROM biodatageeks/cnv-opt-exomecopycov +FROM biodatageeks/cnv-opt-exomecopy MAINTAINER biodatageeks RUN Rscript -e "install.packages('EXOMECOPYCOV', repos = 'http://zsibio.ii.pw.edu.pl/nexus/repository/r-all')" From 63f76e7d4fdd48cbb58b1a2fa3ac40d92ecc7532 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Wiktor=20Ku=C5=9Bmirek?= Date: Sat, 7 Apr 2018 12:02:51 +0200 Subject: [PATCH 067/114] bugfix in exomeCopy version --- R/EXOMECOPYCOV/DESCRIPTION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/EXOMECOPYCOV/DESCRIPTION b/R/EXOMECOPYCOV/DESCRIPTION index c33386d..54ec59c 100644 --- a/R/EXOMECOPYCOV/DESCRIPTION +++ b/R/EXOMECOPYCOV/DESCRIPTION @@ -15,7 +15,7 @@ Depends: DBI (== 0.8), optparse (== 1.4.4), IRanges (>= 2.0.0), - exomeCopy (== 1.22) + exomeCopy (== 1.24) License: GPL-3 Encoding: UTF-8 LazyData: true From ae2061ea2ada9062f44656a7a64398eb43be64d5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Wiktor=20Ku=C5=9Bmirek?= Date: Sat, 7 Apr 2018 12:07:16 +0200 Subject: [PATCH 068/114] force to reload EXOMECOPYCOV package --- build.sh | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/build.sh b/build.sh index d9b6ccf..907919c 100755 --- a/build.sh +++ b/build.sh @@ -34,14 +34,14 @@ do diffTs=`echo "$(date +%s) - $(git log -n 1 --pretty=format:%at ${dir})" | bc` if [ $diffTs -lt $MAX_COMMIT_TS_DIFF ]; then cd $dir - #if [[ ${image} == "biodatageeks/cnv-opt-codexcov" ]]; then - # echo "Rebuild of ${image} image forced..." - # docker build --no-cache --build-arg CACHE_DATE=$(date +%Y-%m-%d:%H:%M:%S) -t $image:$version . - # docker build --no-cache --build-arg CACHE_DATE=$(date +%Y-%m-%d:%H:%M:%S) -t $image:latest . - #else + if [[ ${image} == "biodatageeks/cnv-opt-exomecopycov" ]]; then + echo "Rebuild of ${image} image forced..." + docker build --no-cache --build-arg CACHE_DATE=$(date +%Y-%m-%d:%H:%M:%S) -t $image:$version . + docker build --no-cache --build-arg CACHE_DATE=$(date +%Y-%m-%d:%H:%M:%S) -t $image:latest . + else docker build -t $image:$version . docker build -t $image:latest . - #fi + fi if [[ ${BUILD_MODE} != "local" ]]; then docker push docker.io/$image:latest docker push docker.io/$image:$version From f21852fe89653bda19e0d2965885543d50ef3c29 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Wiktor=20Ku=C5=9Bmirek?= Date: Sat, 7 Apr 2018 12:11:16 +0200 Subject: [PATCH 069/114] bugfix in forcing reloading package --- Docker/cnv-opt-exomecopycov/Dockerfile | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Docker/cnv-opt-exomecopycov/Dockerfile b/Docker/cnv-opt-exomecopycov/Dockerfile index d810948..c157714 100644 --- a/Docker/cnv-opt-exomecopycov/Dockerfile +++ b/Docker/cnv-opt-exomecopycov/Dockerfile @@ -1,5 +1,7 @@ FROM biodatageeks/cnv-opt-exomecopy MAINTAINER biodatageeks +ARG CACHE_DATE=unknown + RUN Rscript -e "install.packages('EXOMECOPYCOV', repos = 'http://zsibio.ii.pw.edu.pl/nexus/repository/r-all')" From 5933e4e796d313041656ae08b130325c772d237c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Wiktor=20Ku=C5=9Bmirek?= Date: Sat, 7 Apr 2018 12:18:52 +0200 Subject: [PATCH 070/114] bugfix in sampname type --- R/EXOMECOPYCOV/R/run_EXOMECOPYCOV.R | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/R/EXOMECOPYCOV/R/run_EXOMECOPYCOV.R b/R/EXOMECOPYCOV/R/run_EXOMECOPYCOV.R index a361fc3..79ce779 100644 --- a/R/EXOMECOPYCOV/R/run_EXOMECOPYCOV.R +++ b/R/EXOMECOPYCOV/R/run_EXOMECOPYCOV.R @@ -9,7 +9,7 @@ run_EXOMECOPYCOV <- function(input_cov_table, con <- file(reference_sample_set_file, open='r') reference_sample_set <- readLines(con) Y <- read.csv(input_cov_table) - sampname <- colnames(Y) + sample.names <- colnames(Y) targets <- read.delim(input_bed) rownames(Y) <- 1:nrow(Y) rownames(targets) <- 1:nrow(targets) @@ -18,7 +18,6 @@ run_EXOMECOPYCOV <- function(input_cov_table, if (length(ref) == 0) { # 0 elements for specified chromosome in bed next() } - sample.names <- sampname[,1] target <- GRanges(seqname = chr, IRanges(start = start(ref) + 1, end = end(ref))) gc <- getgc(chr, ref) From 9acf2004b2125d52d2785d5833156700e1e10077 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Wiktor=20Ku=C5=9Bmirek?= Date: Sat, 7 Apr 2018 12:28:09 +0200 Subject: [PATCH 071/114] missing libraries --- Docker/cnv-opt-exomecopy/Dockerfile | 3 ++- Docker/cnv-opt-exomecopycov/Dockerfile | 3 +++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/Docker/cnv-opt-exomecopy/Dockerfile b/Docker/cnv-opt-exomecopy/Dockerfile index 777cc0d..4c14ac5 100644 --- a/Docker/cnv-opt-exomecopy/Dockerfile +++ b/Docker/cnv-opt-exomecopy/Dockerfile @@ -11,4 +11,5 @@ RUN apt-get update && \ apt-get upgrade -y && \ apt-get install -y r-base libssl-dev libssh2-1-dev libxml2-dev libcurl4-openssl-dev libpq-dev -RUN Rscript -e "source('https://bioconductor.org/biocLite.R');biocLite('IRanges');biocLite('exomeCopy')" +RUN Rscript -e "source('https://bioconductor.org/biocLite.R');biocLite('IRanges')" +RUN Rscript -e "source('https://bioconductor.org/biocLite.R');biocLite('exomeCopy')" diff --git a/Docker/cnv-opt-exomecopycov/Dockerfile b/Docker/cnv-opt-exomecopycov/Dockerfile index c157714..94bba90 100644 --- a/Docker/cnv-opt-exomecopycov/Dockerfile +++ b/Docker/cnv-opt-exomecopycov/Dockerfile @@ -3,5 +3,8 @@ MAINTAINER biodatageeks ARG CACHE_DATE=unknown +RUN Rscript -e "source('https://bioconductor.org/biocLite.R');biocLite('GenomeInfoDb')" +RUN Rscript -e "source('https://bioconductor.org/biocLite.R');biocLite('BSgenome.Hsapiens.UCSC.hg19')" + RUN Rscript -e "install.packages('EXOMECOPYCOV', repos = 'http://zsibio.ii.pw.edu.pl/nexus/repository/r-all')" From e2f5d9a52888f8b0faa543bbd0c12683e147a63c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Wiktor=20Ku=C5=9Bmirek?= Date: Sat, 7 Apr 2018 15:26:24 +0200 Subject: [PATCH 072/114] exomeCopy in 1.22 version --- Docker/cnv-opt-exomecopy/Dockerfile | 6 +++++- R/EXOMECOPYCOV/DESCRIPTION | 2 +- R/EXOMECOPYCOV/R/run_EXOMECOPYCOV.R | 7 ++++--- 3 files changed, 10 insertions(+), 5 deletions(-) diff --git a/Docker/cnv-opt-exomecopy/Dockerfile b/Docker/cnv-opt-exomecopy/Dockerfile index 4c14ac5..2b10782 100644 --- a/Docker/cnv-opt-exomecopy/Dockerfile +++ b/Docker/cnv-opt-exomecopy/Dockerfile @@ -9,7 +9,11 @@ RUN apt-get install -y apt-transport-https RUN apt-get update && \ apt-get upgrade -y && \ - apt-get install -y r-base libssl-dev libssh2-1-dev libxml2-dev libcurl4-openssl-dev libpq-dev + apt-get install -y r-base libssl-dev libssh2-1-dev libxml2-dev libcurl4-openssl-dev libpq-dev wget RUN Rscript -e "source('https://bioconductor.org/biocLite.R');biocLite('IRanges')" + +RUN wget http://bioconductor.org/packages/3.5/bioc/src/contrib/exomeCopy_1.22.0.tar.gz +RUN Rscript -e "install.packages('exomeCopy_1.22.0.tar.gz', repos = NULL, type='source')" + RUN Rscript -e "source('https://bioconductor.org/biocLite.R');biocLite('exomeCopy')" diff --git a/R/EXOMECOPYCOV/DESCRIPTION b/R/EXOMECOPYCOV/DESCRIPTION index 54ec59c..c33386d 100644 --- a/R/EXOMECOPYCOV/DESCRIPTION +++ b/R/EXOMECOPYCOV/DESCRIPTION @@ -15,7 +15,7 @@ Depends: DBI (== 0.8), optparse (== 1.4.4), IRanges (>= 2.0.0), - exomeCopy (== 1.24) + exomeCopy (== 1.22) License: GPL-3 Encoding: UTF-8 LazyData: true diff --git a/R/EXOMECOPYCOV/R/run_EXOMECOPYCOV.R b/R/EXOMECOPYCOV/R/run_EXOMECOPYCOV.R index 79ce779..a7e9178 100644 --- a/R/EXOMECOPYCOV/R/run_EXOMECOPYCOV.R +++ b/R/EXOMECOPYCOV/R/run_EXOMECOPYCOV.R @@ -27,13 +27,14 @@ run_EXOMECOPYCOV <- function(input_cov_table, rdata[[sample.name]] <- Y[,sample.name] } - rdata[["bg"]] <- generateBackground(sample.names, rdata, median) + rdata[["bg"]] <- width(ref) # generateBackground(sample.names, rdata, median) rdata[["log.bg"]] <- log(rdata$bg + .1) - rdata[["bg.sd"]] <- generateBackground(sample.names, rdata, sd) + rdata[["width"]] <- width(ref) fit.list <- lapply(sample.names, function(sample.name) { lapply(seqlevels(target), function(seq.name) { - exomeCopy(rdata, sample.name, X.names = c("log.bg", "gc", "gc.sq"), S = 0:4, d = 2) + print(paste("Processing sample: ", sample.name, sep="")) + exomeCopy(rdata, sample.name, X.names = c("log.bg", "gc", "gc.sq", "width"), S = 0:4, d = 2) }) }) compiled.segments <- compileCopyCountSegments(fit.list) From 1412d072aa5a3a68fde9b1057ddc1807a76b70d7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Wiktor=20Ku=C5=9Bmirek?= Date: Sat, 7 Apr 2018 15:52:29 +0200 Subject: [PATCH 073/114] missing libraries --- Docker/cnv-opt-exomecopy/Dockerfile | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/Docker/cnv-opt-exomecopy/Dockerfile b/Docker/cnv-opt-exomecopy/Dockerfile index 2b10782..0333434 100644 --- a/Docker/cnv-opt-exomecopy/Dockerfile +++ b/Docker/cnv-opt-exomecopy/Dockerfile @@ -12,8 +12,9 @@ RUN apt-get update && \ apt-get install -y r-base libssl-dev libssh2-1-dev libxml2-dev libcurl4-openssl-dev libpq-dev wget RUN Rscript -e "source('https://bioconductor.org/biocLite.R');biocLite('IRanges')" +RUN Rscript -e "source('https://bioconductor.org/biocLite.R');biocLite('GenomicRanges')" +RUN Rscript -e "source('https://bioconductor.org/biocLite.R');biocLite('Rsamtools')" +RUN Rscript -e "source('https://bioconductor.org/biocLite.R');biocLite('GenomeInfoDb')" RUN wget http://bioconductor.org/packages/3.5/bioc/src/contrib/exomeCopy_1.22.0.tar.gz RUN Rscript -e "install.packages('exomeCopy_1.22.0.tar.gz', repos = NULL, type='source')" - -RUN Rscript -e "source('https://bioconductor.org/biocLite.R');biocLite('exomeCopy')" From 0e8fa237cac6c19ea46313f37096e4d96b192341 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Wiktor=20Ku=C5=9Bmirek?= Date: Sun, 8 Apr 2018 18:34:31 +0200 Subject: [PATCH 074/114] writing detected CNVs to output file --- R/EXOMECOPYCOV/R/functions_EXOMECOPYCOV.R | 14 ++++ R/EXOMECOPYCOV/R/run_EXOMECOPYCOV.R | 97 +---------------------- 2 files changed, 17 insertions(+), 94 deletions(-) diff --git a/R/EXOMECOPYCOV/R/functions_EXOMECOPYCOV.R b/R/EXOMECOPYCOV/R/functions_EXOMECOPYCOV.R index e1799d8..bcfdd0d 100644 --- a/R/EXOMECOPYCOV/R/functions_EXOMECOPYCOV.R +++ b/R/EXOMECOPYCOV/R/functions_EXOMECOPYCOV.R @@ -18,3 +18,17 @@ getgc <- function(chr, ref) { gc <- round((af[, "G"] + af[, "C"]) * 100,2) gc } + +unify_calls_format <- function(compiled.segments, chr){ + calls <- matrix(nrow=length(compiled.segments$sample.name), ncol=7) + colnames(calls) <- c('sample_name', 'chr', 'st_bp', 'ed_bp', 'cnv', 'copy_no', 'log_odds') + calls[,'sample_name'] <- compiled.segments$sample.name + calls[,'chr'] <- rep(chr, nrow(calls)) + calls[,'st_bp'] <- unlist(start(ranges(compiled.segments))) + calls[,'ed_bp'] <- unlist(end(ranges(compiled.segments))) + calls[,'copy_no'] <- compiled.segments$copy.count + calls[,'cnv'] <- ifelse(calls[,'copy_no'] > 2, 'dup', 'del') + calls[,'log_odds'] <- compiled.segments$log.odds + calls <- subset(calls, calls[,'copy_no'] != "2") + return(list(calls=calls)) +} diff --git a/R/EXOMECOPYCOV/R/run_EXOMECOPYCOV.R b/R/EXOMECOPYCOV/R/run_EXOMECOPYCOV.R index a7e9178..1578feb 100644 --- a/R/EXOMECOPYCOV/R/run_EXOMECOPYCOV.R +++ b/R/EXOMECOPYCOV/R/run_EXOMECOPYCOV.R @@ -27,7 +27,7 @@ run_EXOMECOPYCOV <- function(input_cov_table, rdata[[sample.name]] <- Y[,sample.name] } - rdata[["bg"]] <- width(ref) # generateBackground(sample.names, rdata, median) + rdata[["bg"]] <- generateBackground(sample.names, rdata, median) rdata[["log.bg"]] <- log(rdata$bg + .1) rdata[["width"]] <- width(ref) @@ -38,97 +38,6 @@ run_EXOMECOPYCOV <- function(input_cov_table, }) }) compiled.segments <- compileCopyCountSegments(fit.list) - print(compiled.segments) - - - - - - - - - - - - - - - - #calls <- data.frame(matrix(nrow=0, ncol=13)) - #chr <- targets[1,'chr'] - #ref <- IRanges(start = targets[,"st_bp"], end = targets[,"ed_bp"]) - #if (length(ref) == 0) { # 0 elements for specified chromosome in bed - # next() - #} - #Y <- cbind(rep(chr, nrow(Y)), start(ref), end(ref), Y) - #target_length <- c() - #for (i in 1:nrow(Y)) { - # target_length <- c(target_length, width(ref[i])) - #} - - # TODO better transformation - #write.table(Y, file=paste('cov_', chr, '.tsv', sep=""), quote=FALSE, sep="\t", col.names = F, row.names = F) - #canoes.reads <- read.table(paste('cov_', chr, '.tsv', sep="")) - - #gc <- getgc(chr, ref) - #target <- seq(1, nrow(Y)) - #canoes.reads <- cbind(target, gc, canoes.reads) - #sampname <- as.vector(sampname) - #names(canoes.reads) <- c("target", "gc", "chromosome", "start", "end", sampname) - #colnames(canoes.reads) <- c("target", "gc", "chromosome", "start", "end", sampname) - #write.table(as.data.frame(canoes.reads),file="canoes.reads.csv", quote=F, sep=",",row.names=T,col.names=T) - #xcnv.list <- vector('list', length(sampname)) - #for (i in 1:length(reference_sample_set)) { - # if (reference_sample_set[[i]] == '') { - # next() - # } - # samples <- unlist(strsplit(reference_sample_set[[i]], ',')) - # actual_sample <- samples[1] - # reference_samples <- samples[-1] - # xcnv.list[[i]] <- CANOESCOV::CallCNVs(sample.name=actual_sample, - # reference.samples=reference_samples, - # counts=canoes.reads) - #} - #xcnvs <- do.call('rbind', xcnv.list) - #if (nrow(calls)==0){calls <- matrix(nrow=0, ncol=ncol(xcnvs))} - #calls <- rbind(calls, xcnvs) - - # unify results format - #if (nrow(calls) != 0) { - # calls[colnames(calls) == 'CNV'] <- as.character(unlist(calls[colnames(calls) == 'CNV'])) - # calls[calls == 'DEL'] <- 'del' - # calls[calls == 'DUP'] <- 'dup' - #} - #colnames(calls)[colnames(calls) == 'SAMPLE'] <- 'sample_name' - #targets <- as.vector(calls[colnames(calls) == 'TARGETS']) - #targets <- as.character(unlist(targets)) - #splitted_targets <- do.call(rbind, strsplit(targets, '..', fixed = TRUE)) - #calls <- cbind(calls, splitted_targets) - #colnames(calls)[colnames(calls) == '1'] <- 'st_exon' - #colnames(calls)[colnames(calls) == '2'] <- 'ed_exon' - #intervals <- as.vector(calls[colnames(calls) == 'INTERVAL']) - #intervals <- as.character(unlist(intervals)) - #splitted_intervals <- do.call(rbind, strsplit(intervals, c(':'), fixed = TRUE)) - #intervals <- as.vector(splitted_intervals[,2]) - #intervals <- as.character(unlist(intervals)) - #splitted_intervals <- do.call(rbind, strsplit(intervals, c('-'), fixed = TRUE)) - #calls <- cbind(calls, splitted_intervals) - #colnames(calls)[colnames(calls) == '1'] <- 'st_bp' - #colnames(calls)[colnames(calls) == '2'] <- 'ed_bp' - #colnames(calls)[colnames(calls) == 'CNV'] <- 'cnv' - #calls <- calls[,-which(names(calls) %in% c('KB', 'MID_BP', 'NUM_TARG', 'Q_SOME', 'TARGETS', 'INTERVAL'))] - #colnames(calls)[colnames(calls) == 'CHR'] <- 'chr' - #colnames(calls)[colnames(calls) == 'MLCN'] <- 'copy_no' - #calls[colnames(calls) == 'sample_name'] <- as.character(unlist(calls[colnames(calls) == 'sample_name'])) - #calls[colnames(calls) == 'st_bp'] <- as.character(unlist(calls[colnames(calls) == 'st_bp'])) - #calls[colnames(calls) == 'ed_bp'] <- as.character(unlist(calls[colnames(calls) == 'ed_bp'])) - #calls[colnames(calls) == 'st_exon'] <- as.character(unlist(calls[colnames(calls) == 'st_exon'])) - #calls[colnames(calls) == 'ed_exon'] <- as.character(unlist(calls[colnames(calls) == 'ed_exon'])) - #write.csv(calls, output_calls_file, row.names=F) + calls <- unify_calls_format(compiled.segments, chr)$calls + write.csv(calls, output_calls_file, row.names=F) } - -# SAMPLE CNV INTERVAL KB CHR MID_BP TARGETS NUM_TARG MLCN Q_SOME -#1 S2 DEL 22:25713988-25756059 42.071 22 25735024 1132..1137 6 1 99 -#2 S3 DEL 22:24373138-24384231 11.093 22 24378684 936..942 7 0 77 - - From c446c1f0804d7a3797fa6a064ea0271274937aa5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Wiktor=20Ku=C5=9Bmirek?= Date: Mon, 30 Apr 2018 12:30:28 +0200 Subject: [PATCH 075/114] random method of selecting reference sample set --- .../R/functions_REFERENCE.SAMPLE.SET.SELECTOR.R | 7 +++++++ .../R/run_REFERENCE.SAMPLE.SET.SELECTOR.R | 3 +++ 2 files changed, 10 insertions(+) diff --git a/R/REFERENCE.SAMPLE.SET.SELECTOR/R/functions_REFERENCE.SAMPLE.SET.SELECTOR.R b/R/REFERENCE.SAMPLE.SET.SELECTOR/R/functions_REFERENCE.SAMPLE.SET.SELECTOR.R index d92bf29..7bc6826 100644 --- a/R/REFERENCE.SAMPLE.SET.SELECTOR/R/functions_REFERENCE.SAMPLE.SET.SELECTOR.R +++ b/R/REFERENCE.SAMPLE.SET.SELECTOR/R/functions_REFERENCE.SAMPLE.SET.SELECTOR.R @@ -30,3 +30,10 @@ exomedepth_method <- function(investigated_sample, Y, num_refs, target_length){ } return(list(reference_samples=reference_samples)) } + +random_method <- function(investigated_sample, Y, num_refs){ + samples <- colnames(Y) + reference_samples <- setdiff(samples, investigated_sample) + reference_samples <- reference_samples[sample(1:length(reference_samples), num_refs, replace=F)] + return(list(reference_samples=reference_samples)) +} diff --git a/R/REFERENCE.SAMPLE.SET.SELECTOR/R/run_REFERENCE.SAMPLE.SET.SELECTOR.R b/R/REFERENCE.SAMPLE.SET.SELECTOR/R/run_REFERENCE.SAMPLE.SET.SELECTOR.R index bed1ec8..08bec95 100644 --- a/R/REFERENCE.SAMPLE.SET.SELECTOR/R/run_REFERENCE.SAMPLE.SET.SELECTOR.R +++ b/R/REFERENCE.SAMPLE.SET.SELECTOR/R/run_REFERENCE.SAMPLE.SET.SELECTOR.R @@ -25,6 +25,9 @@ run_REFERENCE.SAMPLE.SET.SELECTOR <- function(select_method, } else if(select_method == "clamms") { #reference_samples_for_investigated_sample <- clamms_method(investigated_sample, Y, num_refs)$reference_samples #reference_samples[[i]] <- c(investigated_sample, reference_samples_for_investigated_sample) + } else if(select_method == "random") { + reference_samples_for_investigated_sample <- random_method(investigated_sample, Y, num_refs, target_length)$reference_samples + reference_samples[[i]] <- c(investigated_sample, reference_samples_for_investigated_sample) } } resultant_string <- '' From 45e10ba018796ba0599f8cbd5f03be83818d9c8c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Wiktor=20Ku=C5=9Bmirek?= Date: Mon, 30 Apr 2018 12:36:24 +0200 Subject: [PATCH 076/114] reload package for selecting reference sample set --- build.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/build.sh b/build.sh index 907919c..b7313d0 100755 --- a/build.sh +++ b/build.sh @@ -34,7 +34,7 @@ do diffTs=`echo "$(date +%s) - $(git log -n 1 --pretty=format:%at ${dir})" | bc` if [ $diffTs -lt $MAX_COMMIT_TS_DIFF ]; then cd $dir - if [[ ${image} == "biodatageeks/cnv-opt-exomecopycov" ]]; then + if [[ ${image} == "biodatageeks/cnv-opt-reference-sample-set-selector" ]]; then echo "Rebuild of ${image} image forced..." docker build --no-cache --build-arg CACHE_DATE=$(date +%Y-%m-%d:%H:%M:%S) -t $image:$version . docker build --no-cache --build-arg CACHE_DATE=$(date +%Y-%m-%d:%H:%M:%S) -t $image:latest . From 2856a8db1f2db49927be3bc94fff362f9fbe81d2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Wiktor=20Ku=C5=9Bmirek?= Date: Mon, 30 Apr 2018 12:39:30 +0200 Subject: [PATCH 077/114] bugfix --- Docker/cnv-opt-reference-sample-set-selector/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Docker/cnv-opt-reference-sample-set-selector/Dockerfile b/Docker/cnv-opt-reference-sample-set-selector/Dockerfile index 716a5f4..d854276 100644 --- a/Docker/cnv-opt-reference-sample-set-selector/Dockerfile +++ b/Docker/cnv-opt-reference-sample-set-selector/Dockerfile @@ -1,7 +1,7 @@ FROM biodatageeks/cnv-opt-codex MAINTAINER biodatageeks -ARG CACHE_DATE=not_a_date +ARG CACHE_DATE=not_a_specified_date RUN Rscript -e "install.packages('ExomeDepth', repos = 'http://cran.us.r-project.org')" From 2989118dc87cd584a5672e99e47db53f6f8fcd06 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Wiktor=20Ku=C5=9Bmirek?= Date: Mon, 30 Apr 2018 12:45:19 +0200 Subject: [PATCH 078/114] bugfix --- .../R/run_REFERENCE.SAMPLE.SET.SELECTOR.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/REFERENCE.SAMPLE.SET.SELECTOR/R/run_REFERENCE.SAMPLE.SET.SELECTOR.R b/R/REFERENCE.SAMPLE.SET.SELECTOR/R/run_REFERENCE.SAMPLE.SET.SELECTOR.R index 08bec95..12e94c2 100644 --- a/R/REFERENCE.SAMPLE.SET.SELECTOR/R/run_REFERENCE.SAMPLE.SET.SELECTOR.R +++ b/R/REFERENCE.SAMPLE.SET.SELECTOR/R/run_REFERENCE.SAMPLE.SET.SELECTOR.R @@ -26,7 +26,7 @@ run_REFERENCE.SAMPLE.SET.SELECTOR <- function(select_method, #reference_samples_for_investigated_sample <- clamms_method(investigated_sample, Y, num_refs)$reference_samples #reference_samples[[i]] <- c(investigated_sample, reference_samples_for_investigated_sample) } else if(select_method == "random") { - reference_samples_for_investigated_sample <- random_method(investigated_sample, Y, num_refs, target_length)$reference_samples + reference_samples_for_investigated_sample <- random_method(investigated_sample, Y, num_refs)$reference_samples reference_samples[[i]] <- c(investigated_sample, reference_samples_for_investigated_sample) } } From 7bb4e4b9c2885f3adf7f5800e2bda6427cb2f4a0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Wiktor=20Ku=C5=9Bmirek?= Date: Mon, 30 Apr 2018 13:48:29 +0200 Subject: [PATCH 079/114] select reference set based on CANOES method with cov threshold --- .../R/functions_REFERENCE.SAMPLE.SET.SELECTOR.R | 11 +++++++++++ .../R/run_REFERENCE.SAMPLE.SET.SELECTOR.R | 5 +++++ 2 files changed, 16 insertions(+) diff --git a/R/REFERENCE.SAMPLE.SET.SELECTOR/R/functions_REFERENCE.SAMPLE.SET.SELECTOR.R b/R/REFERENCE.SAMPLE.SET.SELECTOR/R/functions_REFERENCE.SAMPLE.SET.SELECTOR.R index 7bc6826..417758b 100644 --- a/R/REFERENCE.SAMPLE.SET.SELECTOR/R/functions_REFERENCE.SAMPLE.SET.SELECTOR.R +++ b/R/REFERENCE.SAMPLE.SET.SELECTOR/R/functions_REFERENCE.SAMPLE.SET.SELECTOR.R @@ -12,6 +12,17 @@ canoes_method <- function(investigated_sample, Y, num_refs){ return(list(reference_samples=reference_samples)) } +canoes_cov_thresh_method <- function(investigated_sample, Y, cov_thresh){ + samples <- colnames(Y) + cov <- cor(Y[, samples], Y[, samples]) + reference_samples <- setdiff(samples, investigated_sample) + covariances <- cov[investigated_sample, reference_samples] + num_refs <- sum(covariances > cov_thresh) + reference_samples <- names(sort(covariances, + decreasing=T)[1:num_refs]) + return(list(reference_samples=reference_samples)) +} + exomedepth_method <- function(investigated_sample, Y, num_refs, target_length){ library(ExomeDepth) samples <- colnames(Y) diff --git a/R/REFERENCE.SAMPLE.SET.SELECTOR/R/run_REFERENCE.SAMPLE.SET.SELECTOR.R b/R/REFERENCE.SAMPLE.SET.SELECTOR/R/run_REFERENCE.SAMPLE.SET.SELECTOR.R index 12e94c2..02254b3 100644 --- a/R/REFERENCE.SAMPLE.SET.SELECTOR/R/run_REFERENCE.SAMPLE.SET.SELECTOR.R +++ b/R/REFERENCE.SAMPLE.SET.SELECTOR/R/run_REFERENCE.SAMPLE.SET.SELECTOR.R @@ -1,5 +1,6 @@ run_REFERENCE.SAMPLE.SET.SELECTOR <- function(select_method, num_refs, + cov_thresh, input_cov_table, input_bed, output_reference_file){ @@ -29,6 +30,10 @@ run_REFERENCE.SAMPLE.SET.SELECTOR <- function(select_method, reference_samples_for_investigated_sample <- random_method(investigated_sample, Y, num_refs)$reference_samples reference_samples[[i]] <- c(investigated_sample, reference_samples_for_investigated_sample) } + } else if(select_method == "canoes_cov_thresh") { + reference_samples_for_investigated_sample <- canoes_cov_thresh_method(investigated_sample, Y, cov_thresh)$reference_samples + reference_samples[[i]] <- c(investigated_sample, reference_samples_for_investigated_sample) + } } resultant_string <- '' for(i in 1:length(reference_samples)) { From c54d4361ae0fc296ad31ee2ee2f5cdc09e0ab66e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Wiktor=20Ku=C5=9Bmirek?= Date: Mon, 30 Apr 2018 13:54:01 +0200 Subject: [PATCH 080/114] bugfix --- .../R/run_REFERENCE.SAMPLE.SET.SELECTOR.R | 1 - 1 file changed, 1 deletion(-) diff --git a/R/REFERENCE.SAMPLE.SET.SELECTOR/R/run_REFERENCE.SAMPLE.SET.SELECTOR.R b/R/REFERENCE.SAMPLE.SET.SELECTOR/R/run_REFERENCE.SAMPLE.SET.SELECTOR.R index 02254b3..44ff820 100644 --- a/R/REFERENCE.SAMPLE.SET.SELECTOR/R/run_REFERENCE.SAMPLE.SET.SELECTOR.R +++ b/R/REFERENCE.SAMPLE.SET.SELECTOR/R/run_REFERENCE.SAMPLE.SET.SELECTOR.R @@ -29,7 +29,6 @@ run_REFERENCE.SAMPLE.SET.SELECTOR <- function(select_method, } else if(select_method == "random") { reference_samples_for_investigated_sample <- random_method(investigated_sample, Y, num_refs)$reference_samples reference_samples[[i]] <- c(investigated_sample, reference_samples_for_investigated_sample) - } } else if(select_method == "canoes_cov_thresh") { reference_samples_for_investigated_sample <- canoes_cov_thresh_method(investigated_sample, Y, cov_thresh)$reference_samples reference_samples[[i]] <- c(investigated_sample, reference_samples_for_investigated_sample) From 7ed767f0a8a0c6d76256478e4506ad1011fa1476 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Wiktor=20Ku=C5=9Bmirek?= Date: Wed, 16 May 2018 12:05:59 +0200 Subject: [PATCH 081/114] investigated reference sample set in EXOMECOPYCOV package (first version) --- R/EXOMECOPYCOV/R/run_EXOMECOPYCOV.R | 36 ++++++++++++++++++----------- 1 file changed, 23 insertions(+), 13 deletions(-) diff --git a/R/EXOMECOPYCOV/R/run_EXOMECOPYCOV.R b/R/EXOMECOPYCOV/R/run_EXOMECOPYCOV.R index 1578feb..735e972 100644 --- a/R/EXOMECOPYCOV/R/run_EXOMECOPYCOV.R +++ b/R/EXOMECOPYCOV/R/run_EXOMECOPYCOV.R @@ -9,7 +9,6 @@ run_EXOMECOPYCOV <- function(input_cov_table, con <- file(reference_sample_set_file, open='r') reference_sample_set <- readLines(con) Y <- read.csv(input_cov_table) - sample.names <- colnames(Y) targets <- read.delim(input_bed) rownames(Y) <- 1:nrow(Y) rownames(targets) <- 1:nrow(targets) @@ -21,23 +20,34 @@ run_EXOMECOPYCOV <- function(input_cov_table, target <- GRanges(seqname = chr, IRanges(start = start(ref) + 1, end = end(ref))) gc <- getgc(chr, ref) - rdata <- RangedData(IRanges(start=start(ref), end=end(ref)), space=rep(chr,nrow(Y)), universe="hg19", gc=gc, gc.sq=gc^2) + rdata_org <- RangedData(IRanges(start=start(ref), end=end(ref)), space=rep(chr,nrow(Y)), universe="hg19", gc=gc, gc.sq=gc^2) - for(sample.name in sample.names) { - rdata[[sample.name]] <- Y[,sample.name] - } + for (i in 1:length(reference_sample_set)) { + if (reference_sample_set[[i]] == '') { + next() + } + samples <- unlist(strsplit(reference_sample_set[[i]], ',')) + actual_sample <- samples[1] + reference_samples <- samples[-1] + samples <- sort(samples) + rdata <- rdata_org + + for(sample.name in samples) { + rdata[[sample.name]] <- Y[,sample.name] + } - rdata[["bg"]] <- generateBackground(sample.names, rdata, median) - rdata[["log.bg"]] <- log(rdata$bg + .1) - rdata[["width"]] <- width(ref) + rdata[["bg"]] <- generateBackground(samples, rdata, median) + rdata[["log.bg"]] <- log(rdata$bg + .1) + rdata[["width"]] <- width(ref) - fit.list <- lapply(sample.names, function(sample.name) { lapply(seqlevels(target), function(seq.name) { - print(paste("Processing sample: ", sample.name, sep="")) - exomeCopy(rdata, sample.name, X.names = c("log.bg", "gc", "gc.sq", "width"), S = 0:4, d = 2) + print(paste("Processing sample: ", actual_sample, sep="")) + exomeCopy(rdata, actual_sample, X.names = c("log.bg", "gc", "gc.sq", "width"), S = 0:4, d = 2) }) - }) - compiled.segments <- compileCopyCountSegments(fit.list) + compiled.segments <- compileCopyCountSegments(fit.list) + print(compiled.segments) + + } calls <- unify_calls_format(compiled.segments, chr)$calls write.csv(calls, output_calls_file, row.names=F) } From 43ead0d6ac68c39051605022c7c2fc2bbc7410ba Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Wiktor=20Ku=C5=9Bmirek?= Date: Wed, 16 May 2018 12:14:28 +0200 Subject: [PATCH 082/114] reload package --- build.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/build.sh b/build.sh index b7313d0..907919c 100755 --- a/build.sh +++ b/build.sh @@ -34,7 +34,7 @@ do diffTs=`echo "$(date +%s) - $(git log -n 1 --pretty=format:%at ${dir})" | bc` if [ $diffTs -lt $MAX_COMMIT_TS_DIFF ]; then cd $dir - if [[ ${image} == "biodatageeks/cnv-opt-reference-sample-set-selector" ]]; then + if [[ ${image} == "biodatageeks/cnv-opt-exomecopycov" ]]; then echo "Rebuild of ${image} image forced..." docker build --no-cache --build-arg CACHE_DATE=$(date +%Y-%m-%d:%H:%M:%S) -t $image:$version . docker build --no-cache --build-arg CACHE_DATE=$(date +%Y-%m-%d:%H:%M:%S) -t $image:latest . From ca70a8135c2db398f26917095a61b48c02d72f1b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Wiktor=20Ku=C5=9Bmirek?= Date: Wed, 16 May 2018 12:16:38 +0200 Subject: [PATCH 083/114] manual forcing package reload --- Docker/cnv-opt-exomecopycov/Dockerfile | 2 -- build.sh | 2 +- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/Docker/cnv-opt-exomecopycov/Dockerfile b/Docker/cnv-opt-exomecopycov/Dockerfile index 94bba90..c12fa2f 100644 --- a/Docker/cnv-opt-exomecopycov/Dockerfile +++ b/Docker/cnv-opt-exomecopycov/Dockerfile @@ -1,8 +1,6 @@ FROM biodatageeks/cnv-opt-exomecopy MAINTAINER biodatageeks -ARG CACHE_DATE=unknown - RUN Rscript -e "source('https://bioconductor.org/biocLite.R');biocLite('GenomeInfoDb')" RUN Rscript -e "source('https://bioconductor.org/biocLite.R');biocLite('BSgenome.Hsapiens.UCSC.hg19')" diff --git a/build.sh b/build.sh index 907919c..b7313d0 100755 --- a/build.sh +++ b/build.sh @@ -34,7 +34,7 @@ do diffTs=`echo "$(date +%s) - $(git log -n 1 --pretty=format:%at ${dir})" | bc` if [ $diffTs -lt $MAX_COMMIT_TS_DIFF ]; then cd $dir - if [[ ${image} == "biodatageeks/cnv-opt-exomecopycov" ]]; then + if [[ ${image} == "biodatageeks/cnv-opt-reference-sample-set-selector" ]]; then echo "Rebuild of ${image} image forced..." docker build --no-cache --build-arg CACHE_DATE=$(date +%Y-%m-%d:%H:%M:%S) -t $image:$version . docker build --no-cache --build-arg CACHE_DATE=$(date +%Y-%m-%d:%H:%M:%S) -t $image:latest . From 407109435f45a749e1d2ffb9796160d10adfe8a1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Wiktor=20Ku=C5=9Bmirek?= Date: Wed, 16 May 2018 12:32:00 +0200 Subject: [PATCH 084/114] DBI package greater or equal to 0.8 --- R/CANOESCOV/DESCRIPTION | 2 +- R/CNVCALLER.EVALUATOR/DESCRIPTION | 2 +- R/CNVCALLER.RUNNER/DESCRIPTION | 2 +- R/CODEXCOV/DESCRIPTION | 2 +- R/EXOMECOPYCOV/DESCRIPTION | 2 +- R/EXOMEDEPTHCOV/DESCRIPTION | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) diff --git a/R/CANOESCOV/DESCRIPTION b/R/CANOESCOV/DESCRIPTION index 0fa3115..27c73ca 100644 --- a/R/CANOESCOV/DESCRIPTION +++ b/R/CANOESCOV/DESCRIPTION @@ -12,7 +12,7 @@ Description: An extended implementation of the CANOES package in R. It extends Depends: R (>= 3.2.3), devtools (>= 1.13.2), - DBI (== 0.8), + DBI (>= 0.8), optparse (== 1.4.4), IRanges (>= 2.0.0), plyr (>= 1.8.4), diff --git a/R/CNVCALLER.EVALUATOR/DESCRIPTION b/R/CNVCALLER.EVALUATOR/DESCRIPTION index f16487e..5dda4e9 100644 --- a/R/CNVCALLER.EVALUATOR/DESCRIPTION +++ b/R/CNVCALLER.EVALUATOR/DESCRIPTION @@ -9,7 +9,7 @@ Description: A package to evaluate CNV callers results. Depends: R (>= 3.2.3), devtools (>= 1.13.2), - DBI (== 0.8), + DBI (>= 0.8), optparse (== 1.4.4) License: GPL-3 Encoding: UTF-8 diff --git a/R/CNVCALLER.RUNNER/DESCRIPTION b/R/CNVCALLER.RUNNER/DESCRIPTION index aa62f1f..1f68fdc 100644 --- a/R/CNVCALLER.RUNNER/DESCRIPTION +++ b/R/CNVCALLER.RUNNER/DESCRIPTION @@ -13,7 +13,7 @@ Depends: EXOMEDEPTHCOV (>= 0.0.1), CANOESCOV (>= 0.0.1), devtools (>= 1.13.2), - DBI (== 0.8), + DBI (>= 0.8), optparse (== 1.4.4) License: GPL-3 Encoding: UTF-8 diff --git a/R/CODEXCOV/DESCRIPTION b/R/CODEXCOV/DESCRIPTION index f6516a3..aacb351 100755 --- a/R/CODEXCOV/DESCRIPTION +++ b/R/CODEXCOV/DESCRIPTION @@ -12,7 +12,7 @@ Description: An extended implementation of the CODEX package in R. It extends Depends: R (>= 3.2.3), devtools (>= 1.13.2), - DBI (== 0.8), + DBI (>= 0.8), optparse (== 1.4.4), CODEX (>= 1.8.0) License: GPL-3 diff --git a/R/EXOMECOPYCOV/DESCRIPTION b/R/EXOMECOPYCOV/DESCRIPTION index c33386d..1c27c1d 100644 --- a/R/EXOMECOPYCOV/DESCRIPTION +++ b/R/EXOMECOPYCOV/DESCRIPTION @@ -12,7 +12,7 @@ Description: An extended implementation of the exomeCopy package in R. It extend Depends: R (>= 3.2.3), devtools (>= 1.13.2), - DBI (== 0.8), + DBI (>= 0.8), optparse (== 1.4.4), IRanges (>= 2.0.0), exomeCopy (== 1.22) diff --git a/R/EXOMEDEPTHCOV/DESCRIPTION b/R/EXOMEDEPTHCOV/DESCRIPTION index 8305596..4627623 100644 --- a/R/EXOMEDEPTHCOV/DESCRIPTION +++ b/R/EXOMEDEPTHCOV/DESCRIPTION @@ -12,7 +12,7 @@ Description: An extended implementation of the ExomeDepth package in R. It exten Depends: R (>= 3.2.3), devtools (>= 1.13.2), - DBI (== 0.8), + DBI (>= 0.8), optparse (== 1.4.4), IRanges (>= 2.0.0), ExomeDepth (>= 1.1.10), From 6bcb7f7c39a175603419b5c1ca1e9e6e570d4031 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Wiktor=20Ku=C5=9Bmirek?= Date: Wed, 16 May 2018 12:36:24 +0200 Subject: [PATCH 085/114] force to reload package --- Docker/cnv-opt-exomecopycov/Dockerfile | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Docker/cnv-opt-exomecopycov/Dockerfile b/Docker/cnv-opt-exomecopycov/Dockerfile index c12fa2f..8e189ed 100644 --- a/Docker/cnv-opt-exomecopycov/Dockerfile +++ b/Docker/cnv-opt-exomecopycov/Dockerfile @@ -4,5 +4,7 @@ MAINTAINER biodatageeks RUN Rscript -e "source('https://bioconductor.org/biocLite.R');biocLite('GenomeInfoDb')" RUN Rscript -e "source('https://bioconductor.org/biocLite.R');biocLite('BSgenome.Hsapiens.UCSC.hg19')" +RUN pwd + RUN Rscript -e "install.packages('EXOMECOPYCOV', repos = 'http://zsibio.ii.pw.edu.pl/nexus/repository/r-all')" From fc5fbd2f6c6e1782ff6973a3864e1477ca584dcd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Wiktor=20Ku=C5=9Bmirek?= Date: Wed, 16 May 2018 12:41:44 +0200 Subject: [PATCH 086/114] bugfix --- Docker/cnv-opt-exomecopycov/Dockerfile | 2 -- R/EXOMECOPYCOV/R/run_EXOMECOPYCOV.R | 2 +- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/Docker/cnv-opt-exomecopycov/Dockerfile b/Docker/cnv-opt-exomecopycov/Dockerfile index 8e189ed..c12fa2f 100644 --- a/Docker/cnv-opt-exomecopycov/Dockerfile +++ b/Docker/cnv-opt-exomecopycov/Dockerfile @@ -4,7 +4,5 @@ MAINTAINER biodatageeks RUN Rscript -e "source('https://bioconductor.org/biocLite.R');biocLite('GenomeInfoDb')" RUN Rscript -e "source('https://bioconductor.org/biocLite.R');biocLite('BSgenome.Hsapiens.UCSC.hg19')" -RUN pwd - RUN Rscript -e "install.packages('EXOMECOPYCOV', repos = 'http://zsibio.ii.pw.edu.pl/nexus/repository/r-all')" diff --git a/R/EXOMECOPYCOV/R/run_EXOMECOPYCOV.R b/R/EXOMECOPYCOV/R/run_EXOMECOPYCOV.R index 735e972..0ff48d0 100644 --- a/R/EXOMECOPYCOV/R/run_EXOMECOPYCOV.R +++ b/R/EXOMECOPYCOV/R/run_EXOMECOPYCOV.R @@ -40,7 +40,7 @@ run_EXOMECOPYCOV <- function(input_cov_table, rdata[["log.bg"]] <- log(rdata$bg + .1) rdata[["width"]] <- width(ref) - lapply(seqlevels(target), function(seq.name) { + fit.list <- lapply(seqlevels(target), function(seq.name) { print(paste("Processing sample: ", actual_sample, sep="")) exomeCopy(rdata, actual_sample, X.names = c("log.bg", "gc", "gc.sq", "width"), S = 0:4, d = 2) }) From 0662e415f3da46a84e69c6151849f46c1fd53208 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Wiktor=20Ku=C5=9Bmirek?= Date: Wed, 16 May 2018 12:45:22 +0200 Subject: [PATCH 087/114] reload package in docker container --- Docker/cnv-opt-exomecopycov/Dockerfile | 3 +++ 1 file changed, 3 insertions(+) diff --git a/Docker/cnv-opt-exomecopycov/Dockerfile b/Docker/cnv-opt-exomecopycov/Dockerfile index c12fa2f..8aa0833 100644 --- a/Docker/cnv-opt-exomecopycov/Dockerfile +++ b/Docker/cnv-opt-exomecopycov/Dockerfile @@ -4,5 +4,8 @@ MAINTAINER biodatageeks RUN Rscript -e "source('https://bioconductor.org/biocLite.R');biocLite('GenomeInfoDb')" RUN Rscript -e "source('https://bioconductor.org/biocLite.R');biocLite('BSgenome.Hsapiens.UCSC.hg19')" +RUN pwd +RUN pwd + RUN Rscript -e "install.packages('EXOMECOPYCOV', repos = 'http://zsibio.ii.pw.edu.pl/nexus/repository/r-all')" From bf7ac7c5927eee27c72426c33b24569f26f42d0a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Wiktor=20Ku=C5=9Bmirek?= Date: Wed, 16 May 2018 12:51:29 +0200 Subject: [PATCH 088/114] bugfix --- R/EXOMECOPYCOV/R/run_EXOMECOPYCOV.R | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/R/EXOMECOPYCOV/R/run_EXOMECOPYCOV.R b/R/EXOMECOPYCOV/R/run_EXOMECOPYCOV.R index 0ff48d0..dd20bbb 100644 --- a/R/EXOMECOPYCOV/R/run_EXOMECOPYCOV.R +++ b/R/EXOMECOPYCOV/R/run_EXOMECOPYCOV.R @@ -40,9 +40,12 @@ run_EXOMECOPYCOV <- function(input_cov_table, rdata[["log.bg"]] <- log(rdata$bg + .1) rdata[["width"]] <- width(ref) - fit.list <- lapply(seqlevels(target), function(seq.name) { - print(paste("Processing sample: ", actual_sample, sep="")) - exomeCopy(rdata, actual_sample, X.names = c("log.bg", "gc", "gc.sq", "width"), S = 0:4, d = 2) + sample.name <- c(actual_sample) + fit.list <- lapply(samples, function(sample.name) { + lapply(seqlevels(target), function(seq.name) { + print(paste("Processing sample: ", sample.name, sep="")) + exomeCopy(rdata, sample.name, X.names = c("log.bg", "gc", "gc.sq", "width"), S = 0:4, d = 2) + }) }) compiled.segments <- compileCopyCountSegments(fit.list) print(compiled.segments) From 6c771b4264a3de7bd474b0723364558fa4f8e3f2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Wiktor=20Ku=C5=9Bmirek?= Date: Wed, 16 May 2018 12:53:58 +0200 Subject: [PATCH 089/114] reload package --- Docker/cnv-opt-exomecopycov/Dockerfile | 1 + 1 file changed, 1 insertion(+) diff --git a/Docker/cnv-opt-exomecopycov/Dockerfile b/Docker/cnv-opt-exomecopycov/Dockerfile index 8aa0833..4391d8f 100644 --- a/Docker/cnv-opt-exomecopycov/Dockerfile +++ b/Docker/cnv-opt-exomecopycov/Dockerfile @@ -4,6 +4,7 @@ MAINTAINER biodatageeks RUN Rscript -e "source('https://bioconductor.org/biocLite.R');biocLite('GenomeInfoDb')" RUN Rscript -e "source('https://bioconductor.org/biocLite.R');biocLite('BSgenome.Hsapiens.UCSC.hg19')" +RUN pwd RUN pwd RUN pwd From 0b0d37a916828a91bfed2c39b10651c42badad4a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Wiktor=20Ku=C5=9Bmirek?= Date: Wed, 16 May 2018 12:59:41 +0200 Subject: [PATCH 090/114] bugfix one more time --- Docker/cnv-opt-exomecopycov/Dockerfile | 1 + R/EXOMECOPYCOV/R/run_EXOMECOPYCOV.R | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/Docker/cnv-opt-exomecopycov/Dockerfile b/Docker/cnv-opt-exomecopycov/Dockerfile index 4391d8f..611a814 100644 --- a/Docker/cnv-opt-exomecopycov/Dockerfile +++ b/Docker/cnv-opt-exomecopycov/Dockerfile @@ -7,6 +7,7 @@ RUN Rscript -e "source('https://bioconductor.org/biocLite.R');biocLite('BSgenome RUN pwd RUN pwd RUN pwd +RUN pwd RUN Rscript -e "install.packages('EXOMECOPYCOV', repos = 'http://zsibio.ii.pw.edu.pl/nexus/repository/r-all')" diff --git a/R/EXOMECOPYCOV/R/run_EXOMECOPYCOV.R b/R/EXOMECOPYCOV/R/run_EXOMECOPYCOV.R index dd20bbb..0fea56d 100644 --- a/R/EXOMECOPYCOV/R/run_EXOMECOPYCOV.R +++ b/R/EXOMECOPYCOV/R/run_EXOMECOPYCOV.R @@ -40,7 +40,7 @@ run_EXOMECOPYCOV <- function(input_cov_table, rdata[["log.bg"]] <- log(rdata$bg + .1) rdata[["width"]] <- width(ref) - sample.name <- c(actual_sample) + samples <- c(actual_sample) fit.list <- lapply(samples, function(sample.name) { lapply(seqlevels(target), function(seq.name) { print(paste("Processing sample: ", sample.name, sep="")) From a88e5dbfbb2481a22d365b9d00024a7c3f573032 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Wiktor=20Ku=C5=9Bmirek?= Date: Wed, 16 May 2018 13:12:08 +0200 Subject: [PATCH 091/114] merging results --- Docker/cnv-opt-exomecopycov/Dockerfile | 1 + R/EXOMECOPYCOV/R/run_EXOMECOPYCOV.R | 11 +++++++---- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/Docker/cnv-opt-exomecopycov/Dockerfile b/Docker/cnv-opt-exomecopycov/Dockerfile index 611a814..a89564d 100644 --- a/Docker/cnv-opt-exomecopycov/Dockerfile +++ b/Docker/cnv-opt-exomecopycov/Dockerfile @@ -8,6 +8,7 @@ RUN pwd RUN pwd RUN pwd RUN pwd +RUN pwd RUN Rscript -e "install.packages('EXOMECOPYCOV', repos = 'http://zsibio.ii.pw.edu.pl/nexus/repository/r-all')" diff --git a/R/EXOMECOPYCOV/R/run_EXOMECOPYCOV.R b/R/EXOMECOPYCOV/R/run_EXOMECOPYCOV.R index 0fea56d..af45307 100644 --- a/R/EXOMECOPYCOV/R/run_EXOMECOPYCOV.R +++ b/R/EXOMECOPYCOV/R/run_EXOMECOPYCOV.R @@ -20,7 +20,8 @@ run_EXOMECOPYCOV <- function(input_cov_table, target <- GRanges(seqname = chr, IRanges(start = start(ref) + 1, end = end(ref))) gc <- getgc(chr, ref) - rdata_org <- RangedData(IRanges(start=start(ref), end=end(ref)), space=rep(chr,nrow(Y)), universe="hg19", gc=gc, gc.sq=gc^2) + rdata_org <- RangedData(IRanges(start=start(ref), end=end(ref)), space=rep(chr,nrow(Y)), universe="hg19", gc=gc, gc.sq=gc^2) + finalcall <- matrix(nrow=0, ncol=13) for (i in 1:length(reference_sample_set)) { if (reference_sample_set[[i]] == '') { @@ -48,9 +49,11 @@ run_EXOMECOPYCOV <- function(input_cov_table, }) }) compiled.segments <- compileCopyCountSegments(fit.list) - print(compiled.segments) + finalcallIt <- unify_calls_format(compiled.segments, chr)$calls + if (nrow(finalcall)==0){finalcall <- matrix(nrow=0, ncol=ncol(finalcallIt))} + finalcall <- rbind(finalcall, finalcallIt) + print(finalcallIt) } - calls <- unify_calls_format(compiled.segments, chr)$calls - write.csv(calls, output_calls_file, row.names=F) + write.csv(finalcall, output_calls_file, row.names=F) } From 5d62b84b6c29d3f67880c24fb9dfdc98d338f013 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Wiktor=20Ku=C5=9Bmirek?= Date: Wed, 16 May 2018 13:19:49 +0200 Subject: [PATCH 092/114] clean up Dockerfile --- Docker/cnv-opt-exomecopycov/Dockerfile | 6 ------ 1 file changed, 6 deletions(-) diff --git a/Docker/cnv-opt-exomecopycov/Dockerfile b/Docker/cnv-opt-exomecopycov/Dockerfile index a89564d..c12fa2f 100644 --- a/Docker/cnv-opt-exomecopycov/Dockerfile +++ b/Docker/cnv-opt-exomecopycov/Dockerfile @@ -4,11 +4,5 @@ MAINTAINER biodatageeks RUN Rscript -e "source('https://bioconductor.org/biocLite.R');biocLite('GenomeInfoDb')" RUN Rscript -e "source('https://bioconductor.org/biocLite.R');biocLite('BSgenome.Hsapiens.UCSC.hg19')" -RUN pwd -RUN pwd -RUN pwd -RUN pwd -RUN pwd - RUN Rscript -e "install.packages('EXOMECOPYCOV', repos = 'http://zsibio.ii.pw.edu.pl/nexus/repository/r-all')" From 53f94dc7d59292dcfc9ea713309d03a3ddc23202 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Wiktor=20Ku=C5=9Bmirek?= Date: Mon, 21 May 2018 16:49:15 +0200 Subject: [PATCH 093/114] kmeans in reference sample set selector - first version --- .../functions_REFERENCE.SAMPLE.SET.SELECTOR.R | 30 +++++++++++++++++++ .../R/run_REFERENCE.SAMPLE.SET.SELECTOR.R | 3 ++ 2 files changed, 33 insertions(+) diff --git a/R/REFERENCE.SAMPLE.SET.SELECTOR/R/functions_REFERENCE.SAMPLE.SET.SELECTOR.R b/R/REFERENCE.SAMPLE.SET.SELECTOR/R/functions_REFERENCE.SAMPLE.SET.SELECTOR.R index 417758b..ef5d0d6 100644 --- a/R/REFERENCE.SAMPLE.SET.SELECTOR/R/functions_REFERENCE.SAMPLE.SET.SELECTOR.R +++ b/R/REFERENCE.SAMPLE.SET.SELECTOR/R/functions_REFERENCE.SAMPLE.SET.SELECTOR.R @@ -48,3 +48,33 @@ random_method <- function(investigated_sample, Y, num_refs){ reference_samples <- reference_samples[sample(1:length(reference_samples), num_refs, replace=F)] return(list(reference_samples=reference_samples)) } + +kmeans_method <- function(investigated_sample, Y, number_of_clusters){ + samples <- colnames(Y) + cov <- cor(Y[, samples], Y[, samples]) + d <- cov + for(i in 1:nrow(d)) { + d[i,] <- cov[samples[i], samples] + } + d <- 1-d + c <- c() + for(i in 1:ncol(d)-1) { + c <- c(c, d[(i+1):nrow(d),i]) + } + d <- dist(d) + for(i in 1:length(d)) { + d[i] <- c[i] + } + km1 <- kmeans(d, number_of_clusters, nstart=100) + cluster_id <- km1$cluster[investigated_sample] + reference_samples <- c() + list_index <- 1 + for(i in km1$cluster) { + if(i == cluster_id) { + reference_samples <- c(reference_samples, sampname_qc[list_index]) + } + list_index <- list_index + 1 + } + reference_samples <- setdiff(reference_samples, investigated_sample) + return(list(reference_samples=reference_samples)) +} diff --git a/R/REFERENCE.SAMPLE.SET.SELECTOR/R/run_REFERENCE.SAMPLE.SET.SELECTOR.R b/R/REFERENCE.SAMPLE.SET.SELECTOR/R/run_REFERENCE.SAMPLE.SET.SELECTOR.R index 44ff820..4677b37 100644 --- a/R/REFERENCE.SAMPLE.SET.SELECTOR/R/run_REFERENCE.SAMPLE.SET.SELECTOR.R +++ b/R/REFERENCE.SAMPLE.SET.SELECTOR/R/run_REFERENCE.SAMPLE.SET.SELECTOR.R @@ -32,6 +32,9 @@ run_REFERENCE.SAMPLE.SET.SELECTOR <- function(select_method, } else if(select_method == "canoes_cov_thresh") { reference_samples_for_investigated_sample <- canoes_cov_thresh_method(investigated_sample, Y, cov_thresh)$reference_samples reference_samples[[i]] <- c(investigated_sample, reference_samples_for_investigated_sample) + } else if(select_method == "kmeans") { + reference_samples_for_investigated_sample <- kmeans_method(investigated_sample, Y, num_refs)$reference_samples + reference_samples[[i]] <- c(investigated_sample, reference_samples_for_investigated_sample) } } resultant_string <- '' From 5878140830bb2fbd43f323a86070de82335330be Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Wiktor=20Ku=C5=9Bmirek?= Date: Mon, 21 May 2018 16:51:30 +0200 Subject: [PATCH 094/114] force to reload Docker image --- Docker/cnv-opt-reference-sample-set-selector/Dockerfile | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Docker/cnv-opt-reference-sample-set-selector/Dockerfile b/Docker/cnv-opt-reference-sample-set-selector/Dockerfile index d854276..7775656 100644 --- a/Docker/cnv-opt-reference-sample-set-selector/Dockerfile +++ b/Docker/cnv-opt-reference-sample-set-selector/Dockerfile @@ -5,4 +5,6 @@ ARG CACHE_DATE=not_a_specified_date RUN Rscript -e "install.packages('ExomeDepth', repos = 'http://cran.us.r-project.org')" +RUN pwd + RUN Rscript -e "install.packages('REFERENCE.SAMPLE.SET.SELECTOR', repos = 'http://zsibio.ii.pw.edu.pl/nexus/repository/r-all')" From ef8dace968b01f61d60f11cffefbd279c78246af Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Wiktor=20Ku=C5=9Bmirek?= Date: Mon, 21 May 2018 17:11:03 +0200 Subject: [PATCH 095/114] bugfix --- Docker/cnv-opt-reference-sample-set-selector/Dockerfile | 1 + .../R/functions_REFERENCE.SAMPLE.SET.SELECTOR.R | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/Docker/cnv-opt-reference-sample-set-selector/Dockerfile b/Docker/cnv-opt-reference-sample-set-selector/Dockerfile index 7775656..8b6f712 100644 --- a/Docker/cnv-opt-reference-sample-set-selector/Dockerfile +++ b/Docker/cnv-opt-reference-sample-set-selector/Dockerfile @@ -5,6 +5,7 @@ ARG CACHE_DATE=not_a_specified_date RUN Rscript -e "install.packages('ExomeDepth', repos = 'http://cran.us.r-project.org')" +RUN pwd RUN pwd RUN Rscript -e "install.packages('REFERENCE.SAMPLE.SET.SELECTOR', repos = 'http://zsibio.ii.pw.edu.pl/nexus/repository/r-all')" diff --git a/R/REFERENCE.SAMPLE.SET.SELECTOR/R/functions_REFERENCE.SAMPLE.SET.SELECTOR.R b/R/REFERENCE.SAMPLE.SET.SELECTOR/R/functions_REFERENCE.SAMPLE.SET.SELECTOR.R index ef5d0d6..d5150ed 100644 --- a/R/REFERENCE.SAMPLE.SET.SELECTOR/R/functions_REFERENCE.SAMPLE.SET.SELECTOR.R +++ b/R/REFERENCE.SAMPLE.SET.SELECTOR/R/functions_REFERENCE.SAMPLE.SET.SELECTOR.R @@ -71,7 +71,7 @@ kmeans_method <- function(investigated_sample, Y, number_of_clusters){ list_index <- 1 for(i in km1$cluster) { if(i == cluster_id) { - reference_samples <- c(reference_samples, sampname_qc[list_index]) + reference_samples <- c(reference_samples, samples[list_index]) } list_index <- list_index + 1 } From ab4c0e11457ab0e280f75a75e3e464c8389f8e7a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Wiktor=20Ku=C5=9Bmirek?= Date: Mon, 21 May 2018 17:22:41 +0200 Subject: [PATCH 096/114] speed up kmeans --- .../cnv-opt-reference-sample-set-selector/Dockerfile | 1 + .../R/functions_REFERENCE.SAMPLE.SET.SELECTOR.R | 11 ++++++++--- .../R/run_REFERENCE.SAMPLE.SET.SELECTOR.R | 5 ++++- 3 files changed, 13 insertions(+), 4 deletions(-) diff --git a/Docker/cnv-opt-reference-sample-set-selector/Dockerfile b/Docker/cnv-opt-reference-sample-set-selector/Dockerfile index 8b6f712..2945c1d 100644 --- a/Docker/cnv-opt-reference-sample-set-selector/Dockerfile +++ b/Docker/cnv-opt-reference-sample-set-selector/Dockerfile @@ -5,6 +5,7 @@ ARG CACHE_DATE=not_a_specified_date RUN Rscript -e "install.packages('ExomeDepth', repos = 'http://cran.us.r-project.org')" +RUN pwd RUN pwd RUN pwd diff --git a/R/REFERENCE.SAMPLE.SET.SELECTOR/R/functions_REFERENCE.SAMPLE.SET.SELECTOR.R b/R/REFERENCE.SAMPLE.SET.SELECTOR/R/functions_REFERENCE.SAMPLE.SET.SELECTOR.R index d5150ed..6ef310b 100644 --- a/R/REFERENCE.SAMPLE.SET.SELECTOR/R/functions_REFERENCE.SAMPLE.SET.SELECTOR.R +++ b/R/REFERENCE.SAMPLE.SET.SELECTOR/R/functions_REFERENCE.SAMPLE.SET.SELECTOR.R @@ -49,7 +49,7 @@ random_method <- function(investigated_sample, Y, num_refs){ return(list(reference_samples=reference_samples)) } -kmeans_method <- function(investigated_sample, Y, number_of_clusters){ +kmeans_select_groups <- function(Y, number_of_clusters){ samples <- colnames(Y) cov <- cor(Y[, samples], Y[, samples]) d <- cov @@ -66,10 +66,15 @@ kmeans_method <- function(investigated_sample, Y, number_of_clusters){ d[i] <- c[i] } km1 <- kmeans(d, number_of_clusters, nstart=100) - cluster_id <- km1$cluster[investigated_sample] + return(list(clusters=km1$cluster)) +} + +kmeans_method <- function(investigated_sample, Y, kmeans_clusters){ + samples <- colnames(Y) + cluster_id <- kmeans_clusters[investigated_sample] reference_samples <- c() list_index <- 1 - for(i in km1$cluster) { + for(i in kmeans_clusters) { if(i == cluster_id) { reference_samples <- c(reference_samples, samples[list_index]) } diff --git a/R/REFERENCE.SAMPLE.SET.SELECTOR/R/run_REFERENCE.SAMPLE.SET.SELECTOR.R b/R/REFERENCE.SAMPLE.SET.SELECTOR/R/run_REFERENCE.SAMPLE.SET.SELECTOR.R index 4677b37..9cc24f2 100644 --- a/R/REFERENCE.SAMPLE.SET.SELECTOR/R/run_REFERENCE.SAMPLE.SET.SELECTOR.R +++ b/R/REFERENCE.SAMPLE.SET.SELECTOR/R/run_REFERENCE.SAMPLE.SET.SELECTOR.R @@ -10,6 +10,9 @@ run_REFERENCE.SAMPLE.SET.SELECTOR <- function(select_method, targets <- read.delim(input_bed) target_length <- targets[,"st_bp"] - targets[,"ed_bp"] reference_samples <- list() + if() { + kmeans_clusters <- kmeans_select_groups(Y, num_refs) + } for(i in 1:length(sampname)) { investigated_sample <- as.character(sampname[i]) @@ -33,7 +36,7 @@ run_REFERENCE.SAMPLE.SET.SELECTOR <- function(select_method, reference_samples_for_investigated_sample <- canoes_cov_thresh_method(investigated_sample, Y, cov_thresh)$reference_samples reference_samples[[i]] <- c(investigated_sample, reference_samples_for_investigated_sample) } else if(select_method == "kmeans") { - reference_samples_for_investigated_sample <- kmeans_method(investigated_sample, Y, num_refs)$reference_samples + reference_samples_for_investigated_sample <- kmeans_method(investigated_sample, Y, kmeans_clusters)$reference_samples reference_samples[[i]] <- c(investigated_sample, reference_samples_for_investigated_sample) } } From 769805ac2ee48497624fd9c7d358559a98560742 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Wiktor=20Ku=C5=9Bmirek?= Date: Mon, 21 May 2018 17:29:51 +0200 Subject: [PATCH 097/114] bugfix --- Docker/cnv-opt-reference-sample-set-selector/Dockerfile | 1 + .../R/run_REFERENCE.SAMPLE.SET.SELECTOR.R | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/Docker/cnv-opt-reference-sample-set-selector/Dockerfile b/Docker/cnv-opt-reference-sample-set-selector/Dockerfile index 2945c1d..92b7bcc 100644 --- a/Docker/cnv-opt-reference-sample-set-selector/Dockerfile +++ b/Docker/cnv-opt-reference-sample-set-selector/Dockerfile @@ -8,5 +8,6 @@ RUN Rscript -e "install.packages('ExomeDepth', repos = 'http://cran.us.r-project RUN pwd RUN pwd RUN pwd +RUN pwd RUN Rscript -e "install.packages('REFERENCE.SAMPLE.SET.SELECTOR', repos = 'http://zsibio.ii.pw.edu.pl/nexus/repository/r-all')" diff --git a/R/REFERENCE.SAMPLE.SET.SELECTOR/R/run_REFERENCE.SAMPLE.SET.SELECTOR.R b/R/REFERENCE.SAMPLE.SET.SELECTOR/R/run_REFERENCE.SAMPLE.SET.SELECTOR.R index 9cc24f2..12053e5 100644 --- a/R/REFERENCE.SAMPLE.SET.SELECTOR/R/run_REFERENCE.SAMPLE.SET.SELECTOR.R +++ b/R/REFERENCE.SAMPLE.SET.SELECTOR/R/run_REFERENCE.SAMPLE.SET.SELECTOR.R @@ -10,7 +10,7 @@ run_REFERENCE.SAMPLE.SET.SELECTOR <- function(select_method, targets <- read.delim(input_bed) target_length <- targets[,"st_bp"] - targets[,"ed_bp"] reference_samples <- list() - if() { + if(select_method == "kmeans") { kmeans_clusters <- kmeans_select_groups(Y, num_refs) } From 57b9604c6f495d73971832933763d59dbf03df3f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Wiktor=20Ku=C5=9Bmirek?= Date: Mon, 21 May 2018 17:37:37 +0200 Subject: [PATCH 098/114] changes in order to find a bug --- .../R/functions_REFERENCE.SAMPLE.SET.SELECTOR.R | 3 +++ 1 file changed, 3 insertions(+) diff --git a/R/REFERENCE.SAMPLE.SET.SELECTOR/R/functions_REFERENCE.SAMPLE.SET.SELECTOR.R b/R/REFERENCE.SAMPLE.SET.SELECTOR/R/functions_REFERENCE.SAMPLE.SET.SELECTOR.R index 6ef310b..995acbf 100644 --- a/R/REFERENCE.SAMPLE.SET.SELECTOR/R/functions_REFERENCE.SAMPLE.SET.SELECTOR.R +++ b/R/REFERENCE.SAMPLE.SET.SELECTOR/R/functions_REFERENCE.SAMPLE.SET.SELECTOR.R @@ -70,8 +70,11 @@ kmeans_select_groups <- function(Y, number_of_clusters){ } kmeans_method <- function(investigated_sample, Y, kmeans_clusters){ + print(kmeans_clusters) samples <- colnames(Y) + print(samples) cluster_id <- kmeans_clusters[investigated_sample] + print(cluster_id) reference_samples <- c() list_index <- 1 for(i in kmeans_clusters) { From 85c550ef713f9058721c80969b7b9684030b031c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Wiktor=20Ku=C5=9Bmirek?= Date: Mon, 21 May 2018 17:52:40 +0200 Subject: [PATCH 099/114] bugfix --- .../R/functions_REFERENCE.SAMPLE.SET.SELECTOR.R | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/R/REFERENCE.SAMPLE.SET.SELECTOR/R/functions_REFERENCE.SAMPLE.SET.SELECTOR.R b/R/REFERENCE.SAMPLE.SET.SELECTOR/R/functions_REFERENCE.SAMPLE.SET.SELECTOR.R index 995acbf..e9b7379 100644 --- a/R/REFERENCE.SAMPLE.SET.SELECTOR/R/functions_REFERENCE.SAMPLE.SET.SELECTOR.R +++ b/R/REFERENCE.SAMPLE.SET.SELECTOR/R/functions_REFERENCE.SAMPLE.SET.SELECTOR.R @@ -66,18 +66,18 @@ kmeans_select_groups <- function(Y, number_of_clusters){ d[i] <- c[i] } km1 <- kmeans(d, number_of_clusters, nstart=100) - return(list(clusters=km1$cluster)) + return(list(clusters=km1)) } kmeans_method <- function(investigated_sample, Y, kmeans_clusters){ print(kmeans_clusters) samples <- colnames(Y) print(samples) - cluster_id <- kmeans_clusters[investigated_sample] + cluster_id <- kmeans_clusters$cluster[investigated_sample] print(cluster_id) reference_samples <- c() list_index <- 1 - for(i in kmeans_clusters) { + for(i in kmeans_clusters$cluster) { if(i == cluster_id) { reference_samples <- c(reference_samples, samples[list_index]) } From bc2a00201b039be19f4966eedf7d99c165cb5fda Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Wiktor=20Ku=C5=9Bmirek?= Date: Mon, 21 May 2018 18:15:09 +0200 Subject: [PATCH 100/114] changes in order to detect bug --- .../R/functions_REFERENCE.SAMPLE.SET.SELECTOR.R | 1 + 1 file changed, 1 insertion(+) diff --git a/R/REFERENCE.SAMPLE.SET.SELECTOR/R/functions_REFERENCE.SAMPLE.SET.SELECTOR.R b/R/REFERENCE.SAMPLE.SET.SELECTOR/R/functions_REFERENCE.SAMPLE.SET.SELECTOR.R index e9b7379..f0d64f5 100644 --- a/R/REFERENCE.SAMPLE.SET.SELECTOR/R/functions_REFERENCE.SAMPLE.SET.SELECTOR.R +++ b/R/REFERENCE.SAMPLE.SET.SELECTOR/R/functions_REFERENCE.SAMPLE.SET.SELECTOR.R @@ -73,6 +73,7 @@ kmeans_method <- function(investigated_sample, Y, kmeans_clusters){ print(kmeans_clusters) samples <- colnames(Y) print(samples) + print(kmeans_clusters$cluster) cluster_id <- kmeans_clusters$cluster[investigated_sample] print(cluster_id) reference_samples <- c() From e408aeaf99d006bebe268be6e080f0d01c484e52 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Wiktor=20Ku=C5=9Bmirek?= Date: Mon, 21 May 2018 18:48:23 +0200 Subject: [PATCH 101/114] code clean up, bugfix --- Docker/cnv-opt-reference-sample-set-selector/Dockerfile | 5 ----- .../R/functions_REFERENCE.SAMPLE.SET.SELECTOR.R | 4 ---- .../R/run_REFERENCE.SAMPLE.SET.SELECTOR.R | 2 +- 3 files changed, 1 insertion(+), 10 deletions(-) diff --git a/Docker/cnv-opt-reference-sample-set-selector/Dockerfile b/Docker/cnv-opt-reference-sample-set-selector/Dockerfile index 92b7bcc..d854276 100644 --- a/Docker/cnv-opt-reference-sample-set-selector/Dockerfile +++ b/Docker/cnv-opt-reference-sample-set-selector/Dockerfile @@ -5,9 +5,4 @@ ARG CACHE_DATE=not_a_specified_date RUN Rscript -e "install.packages('ExomeDepth', repos = 'http://cran.us.r-project.org')" -RUN pwd -RUN pwd -RUN pwd -RUN pwd - RUN Rscript -e "install.packages('REFERENCE.SAMPLE.SET.SELECTOR', repos = 'http://zsibio.ii.pw.edu.pl/nexus/repository/r-all')" diff --git a/R/REFERENCE.SAMPLE.SET.SELECTOR/R/functions_REFERENCE.SAMPLE.SET.SELECTOR.R b/R/REFERENCE.SAMPLE.SET.SELECTOR/R/functions_REFERENCE.SAMPLE.SET.SELECTOR.R index f0d64f5..4863a15 100644 --- a/R/REFERENCE.SAMPLE.SET.SELECTOR/R/functions_REFERENCE.SAMPLE.SET.SELECTOR.R +++ b/R/REFERENCE.SAMPLE.SET.SELECTOR/R/functions_REFERENCE.SAMPLE.SET.SELECTOR.R @@ -70,12 +70,8 @@ kmeans_select_groups <- function(Y, number_of_clusters){ } kmeans_method <- function(investigated_sample, Y, kmeans_clusters){ - print(kmeans_clusters) samples <- colnames(Y) - print(samples) - print(kmeans_clusters$cluster) cluster_id <- kmeans_clusters$cluster[investigated_sample] - print(cluster_id) reference_samples <- c() list_index <- 1 for(i in kmeans_clusters$cluster) { diff --git a/R/REFERENCE.SAMPLE.SET.SELECTOR/R/run_REFERENCE.SAMPLE.SET.SELECTOR.R b/R/REFERENCE.SAMPLE.SET.SELECTOR/R/run_REFERENCE.SAMPLE.SET.SELECTOR.R index 12053e5..202c284 100644 --- a/R/REFERENCE.SAMPLE.SET.SELECTOR/R/run_REFERENCE.SAMPLE.SET.SELECTOR.R +++ b/R/REFERENCE.SAMPLE.SET.SELECTOR/R/run_REFERENCE.SAMPLE.SET.SELECTOR.R @@ -11,7 +11,7 @@ run_REFERENCE.SAMPLE.SET.SELECTOR <- function(select_method, target_length <- targets[,"st_bp"] - targets[,"ed_bp"] reference_samples <- list() if(select_method == "kmeans") { - kmeans_clusters <- kmeans_select_groups(Y, num_refs) + kmeans_clusters <- kmeans_select_groups(Y, num_refs)$clusters } for(i in 1:length(sampname)) { From a7b6352455119e828291421d58a125119569ea48 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Wiktor=20Ku=C5=9Bmirek?= Date: Thu, 11 Oct 2018 09:02:38 +0200 Subject: [PATCH 102/114] CNV.SIMULATOR package init --- Docker/cnv-opt-cnv-simulator/Dockerfile | 6 + R/CNV.SIMULATOR/DESCRIPTION | 18 + R/CNV.SIMULATOR/NAMESPACE | 2 + R/CNV.SIMULATOR/R/functions_CNV.SIMULATOR.R | 685 ++++++++++++++++++++ R/CNV.SIMULATOR/R/run_CNV.SIMULATOR.R | 32 + 5 files changed, 743 insertions(+) create mode 100644 Docker/cnv-opt-cnv-simulator/Dockerfile create mode 100644 R/CNV.SIMULATOR/DESCRIPTION create mode 100644 R/CNV.SIMULATOR/NAMESPACE create mode 100644 R/CNV.SIMULATOR/R/functions_CNV.SIMULATOR.R create mode 100644 R/CNV.SIMULATOR/R/run_CNV.SIMULATOR.R diff --git a/Docker/cnv-opt-cnv-simulator/Dockerfile b/Docker/cnv-opt-cnv-simulator/Dockerfile new file mode 100644 index 0000000..eec7fc9 --- /dev/null +++ b/Docker/cnv-opt-cnv-simulator/Dockerfile @@ -0,0 +1,6 @@ +FROM biodatageeks/cnv-opt-codex +MAINTAINER biodatageeks + +ARG CACHE_DATE=not_a_specified_date + +RUN Rscript -e "install.packages('CNV.SIMULATOR', repos = 'http://zsibio.ii.pw.edu.pl/nexus/repository/r-all')" diff --git a/R/CNV.SIMULATOR/DESCRIPTION b/R/CNV.SIMULATOR/DESCRIPTION new file mode 100644 index 0000000..0824435 --- /dev/null +++ b/R/CNV.SIMULATOR/DESCRIPTION @@ -0,0 +1,18 @@ +Package: CANOES +Title: CANOES Package +Version: 0.0.1 +Authors@R: c( + person("Tomasz", "Gambin", email = "tgambin@gmail.com", role = c("aut", "cre")), + person("Marek", "Wiewiórka", email = "marek.wiewiorka@gmail.com", role = c("aut")), + person("Wiktor", "Kuśmirek", email = "kusmirekwiktor@gmail.com", role = c("aut"))) +Description: An implementation of the CANOES package in R. +Depends: + R (>= 3.2.3), + plyr (>= 1.8.4), + nnls (>= 1.4.0), + Hmisc (>= 4.0.0), + mgcv (>= 1.8.0) +License: GPL-3 +Encoding: UTF-8 +LazyData: true +RoxygenNote: 6.0.1.9000 diff --git a/R/CNV.SIMULATOR/NAMESPACE b/R/CNV.SIMULATOR/NAMESPACE new file mode 100644 index 0000000..884a631 --- /dev/null +++ b/R/CNV.SIMULATOR/NAMESPACE @@ -0,0 +1,2 @@ +# Generated by roxygen2: fake comment so roxygen2 overwrites silently. +exportPattern("^[^\\.]") diff --git a/R/CNV.SIMULATOR/R/functions_CNV.SIMULATOR.R b/R/CNV.SIMULATOR/R/functions_CNV.SIMULATOR.R new file mode 100644 index 0000000..b3077b4 --- /dev/null +++ b/R/CNV.SIMULATOR/R/functions_CNV.SIMULATOR.R @@ -0,0 +1,685 @@ +# Constants +NUM.ABNORMAL.STATES=2 +NUM.STATES=3 +DELETION=1 +NORMAL=2 +DUPLICATION=3 + +# PlotCNV +# Plots count data for targets of interest +# highlights sample of interest in red, +# highlights area of interest with a black line +# highlights probe locations with black dots +# Arguments: +# counts: +# count matrix, with column "target" with target numbers +# and sample data in columns 6:end +# sample.name: +# sample of interest (will be highlighted in red in figure) +# (should correspond to a column in counts) +# targets: +# targets of interest in the form start.target..end.target +# offset: +# number of targets to add on either end (default=1) +# Returns: +# returns nothing +PlotCNV <- function(counts, sample.name, targets, offset=1){ + sample.name <- as.character(sample.name) + if (!sample.name %in% names(counts)){stop("No column for sample ", sample.name, " in counts matrix")} + if (length(setdiff("target", names(counts)[1:5]) > 0)){ + stop("counts matrix must have column named target") + } + t <- as.character(targets) + start.target <- as.numeric(unlist(strsplit(t, "..", fixed=T))[1]) + end.target <- as.numeric(unlist(strsplit(t, "..", fixed=T))[2]) + if (!start.target %in% counts$target){ + stop("no data for start.target in counts matrix") + } + if (!end.target %in% counts$target){ + stop("no data for end.target in counts matrix") + } + if ((start.target - offset) %in% counts$target){ + start.target <- start.target - offset + } + if ((end.target + offset) %in% counts$target){ + end.target <- end.target + offset + } + ref.sample.names <- setdiff(as.character(names(counts)[-seq(1,5)]), + sample.name) + data <- subset(counts, target >= start.target & target <= end.target) + sample.data <- data[, sample.name] + means <- apply(data[, ref.sample.names], 1, mean) + sd <- sqrt(apply(data[, ref.sample.names], 1, var)) + refs.z.scores <- matrix(NA, nrow(data), length(ref.sample.names)) + sample.z.score <- numeric(length = nrow(data)) + for (i in seq(1, dim(data)[1])){ + refs.z.scores[i, ] <- as.numeric((data[i, ref.sample.names] - means[i]) / + max(0.000001, sd[i])) + sample.z.score[i] <- (sample.data[i] - means[i]) / max(0.000001, sd[i]) + } + ylim <- max(abs(refs.z.scores), abs(sample.z.score)) + plot(seq(-6, 6), seq(-6, 6), + xlim=c(data[1, "start"], data[dim(data)[1], "start"]), + ylim=c(-ylim - 0.1, ylim + 0.1), type="n", xlab="", ylab="Z-score") + for (i in seq(1, length(ref.sample.names))){ + lines(data[, "start"], refs.z.scores[, i], col="#2f4f4f85") + } + lines(data[, "start"], sample.z.score, col="red", lwd=3) + points(data[, "start"], rep(-ylim - 0.05, length(data[, "start"])), pch=20) + lines( c(data[1 + offset, "start"], data[nrow(data) - offset, "end"]) , + c(ylim+0.2, ylim+0.2), lwd=2) + title(main=paste("Sample ", sample.name, ", ", + counts$chromosome[start.target], ":", + data$start[1], "-", data$end[nrow(data)], sep="")) +} + +# CallCNVs +# Calls CNVs in sample of interest +# Arguments: +# sample.name: +# sample to call CNVs in (should correspond to a column in counts) +# counts: +# count matrix, first five columns should be +# target: consecutive numbers for targets (integer) +# chromosome: chromosome number (integer-valued) +# (support for sex chromosomes to come) +# start: start position of probe (integer) +# end: end position of probe (integer) +# gc: gc content (real between 0 and 1) +# subsequent columns should include counts for each probe for samples +# p: +# average rate of occurrence of CNVs (real) default is 1e-08 +# D: +# expected distance between targets in a CNV (integer) default is 70,000 +# Tnum: +# expected number of targets in a CNV (integer) default is 6 +# numrefs +# maximum number of reference samples to use (integer) default is 30 +# the weighted variance calculations will take a long time if too +# many reference samples are used +# Returns: +# data frame with the following columns: +# SAMPLE: name of sample +# CNV: DEL of DUP +# INTERVAL: CNV coordinates in the form chr:start-stop +# KB: length of CNV in kilobases +# CHR: chromosome +# MID_BP: middle base pair of CNV +# TARGETS: target numbers of CNV in the form start..stop +# NUM_TARG: how many targets are in the CNV +# Q_SOME: a Phred-scaled quality score for the CNV +CallCNVs <- function(sample.name, counts, p=1e-08, Tnum=6, D=70000, numrefs=30, get.dfs=F, homdel.mean=0.2){ + if (!sample.name %in% names(counts)){stop("No column for sample ", sample.name, " in counts matrix")} + if (length(setdiff(names(counts)[1:5], c("target", "chromosome", "start", "end", "gc"))) > 0){ + stop("First five columns of counts matrix must be target, chromosome, start, end, gc") + } + if (length(setdiff(unique(counts$chromosome), seq(1:22))) > 0) { + # remove sex chromosomes + cat("Trying to remove sex chromosomes and 'chr' prefixes\n") + counts <- subset(counts, !chromosome %in% c("chrX", "chrY", "X", "Y")) + if (sum(grepl("chr", counts$chromosome))==length(counts$chromosome)){ + counts$chromosome <- gsub("chr", "", counts$chromosome) + } + counts$chromosome <- as.numeric(counts$chromosome) + if (length(setdiff(unique(counts$chromosome), seq(1:22))) > 0) + stop("chromosome must take value in range 1-22 (support for sex chromosomes to come)") + } + library(plyr) + counts <- arrange(counts, chromosome, start) + if (p <= 0){ + stop("parameter p must be positive") + } + if (Tnum <= 0){ + stop("parameter Tnum must be positive") + } + if (D <= 0){ + stop("parameter D must be positive") + } + if (numrefs <= 0){ + stop("parameter numrefs must be positive") + } + sample.names <- colnames(counts)[-seq(1,5)] + # find mean coverage of probes + mean.counts <- mean(apply(counts[, sample.names], 2, mean)) + # normalize counts; round so we can use negative binomial + counts[, sample.names] <- apply(counts[, sample.names], 2, + function(x, mean.counts) + round(x * mean.counts / mean(x)), mean.counts) + # calculate covariance of read count across samples + cov <- cor(counts[, sample.names], counts[, sample.names]) + reference.samples <- setdiff(sample.names, sample.name) + covariances <- cov[sample.name, reference.samples] + reference.samples <- names(sort(covariances, + decreasing=T)[1:min(numrefs, length(covariances))]) + sample.mean.counts <- mean(counts[, sample.name]) + sample.sumcounts <- apply(counts[, reference.samples], 2, sum) + # normalize reference samples to sample of interest + counts[, reference.samples] <- apply(counts[, reference.samples], 2, + function(x, sample.mean.counts) + round(x * sample.mean.counts / + mean(x)), sample.mean.counts) + # select reference samples and weightings using non-negative least squares + b <- counts[, sample.name] + A <- as.matrix(counts[, reference.samples]) + library(nnls) + all <- nnls(A, b)$x + est <- matrix(0, nrow=50, ncol=length(reference.samples)) + set.seed(1) + for (i in 1:50){ + d <- sample(nrow(A), min(500, nrow(A))) + est[i, ] <- nnls(A[d, ], b[d])$x + } + weights <- colMeans(est) + sample.weights <- weights / sum(weights) + library(Hmisc) + # calculate weighted mean of read count + # this is used to calculate emission probabilities + counts$mean <- apply(counts[, reference.samples], + 1, wtd.mean, sample.weights) + targets <- counts$target + # exclude probes with all zero counts + nonzero.rows <- counts$mean > 0 + nonzero.rows.df <- data.frame(target=counts$target, + nonzero.rows=nonzero.rows) + + counts <- counts[nonzero.rows, ] + # get the distances between consecutive probes + distances <- GetDistances(counts) + # estimate the read count variance at each probe + var.estimate <- EstimateVariance(counts, reference.samples, + sample.weights) + emission.probs <- EmissionProbs(counts[, sample.name], + counts$mean, var.estimate$var.estimate, + counts[, "target"]) + if (get.dfs){ + return(list(emission.probs=emission.probs, distances=distances)) + } + # call CNVs with the Viterbi algorithm + viterbi.state <- Viterbi(emission.probs, distances, p, Tnum, D) + # format the CNVs + cnvs <- PrintCNVs(sample.name, viterbi.state, + counts) + # if there aren't too many CNVs, calculate the Q_SOME + if (nrow(cnvs) > 0 & nrow(cnvs) <= 50){ + qualities <- GenotypeCNVs(cnvs, sample.name, counts, p, Tnum, D, numrefs, + emission.probs=emission.probs, + distances=distances) + for (i in 1:nrow(cnvs)){ + cnvs$Q_SOME[i] <- ifelse(cnvs$CNV[i]=="DEL", qualities[i, "SQDel"], + qualities[i, "SQDup"]) + } + } + data <- as.data.frame(cbind(counts$target, counts$mean, var.estimate$var.estimate, counts[, sample.name])) + names(data) <- c("target", "countsmean", "varestimate", "sample") + if (nrow(cnvs) > 0){ + cnvs <- CalcCopyNumber(data, cnvs, homdel.mean) + } + return(cnvs) +} + +# GenotypeCNVs +# Genotype CNVs in sample of interest +# Arguments: +# xcnv +# data frame with the following columns, and one row for each +# CNV to genotype +# INTERVAL: CNV coordinates in the form chr:start-stop +# TARGETS: target numbers of CNV in the form start..stop +# these should correspond to the target numbers in counts +# sample.name: +# sample to genotype CNVs in (should correspond to a column in counts) +# counts: +# count matrix, first five columns should be +# target: consecutive numbers for targets (integer) +# chromosome: chromosome number (integer-valued) +# (support for sex chromosomes to come) +# start: start position of probe (integer) +# end: end position of probe (integer) +# gc: gc content (real between 0 and 1) +# subsequent columns should include counts for each probe for samples +# p: +# average rate of occurrence of CNVs (real) default is 1e-08 +# D: +# expected distance between targets in a CNV (integer) default is 70,000 +# Tnum: +# expected number of targets in a CNV (integer) default is 6 +# numrefs +# maximum number of reference samples to use (integer) default is 30 +# the weighted variance calculations will take a long time if too +# many reference samples are used +# emission.probs and distances are for internal use only +# Returns: +# data frame with the following columns and one row for each genotyped CNV: +# INTERVAL: CNV coordinates in the form chr:start-stop +# NQDEL: a Phred-scaled quality score that sample.name has no deletion +# in the interval +# SQDEL: a Phred-scaled quality score that sample.name has a deletion +# in the interval +# NQDUP and SQDUP: same, but for a duplication +GenotypeCNVs <- function(xcnvs, sample.name, counts, p=1e-08, Tnum=6, + D=70000, numrefs=30, + emission.probs=NULL, + distances=NULL){ + if (!sample.name %in% names(counts)){stop("No column for sample ", sample.name, " in counts matrix")} + if (length(setdiff(names(counts)[1:5], c("target", "chromosome", "start", "end", "gc"))) > 0){ + stop("First five columns of counts matrix must be target, chromosome, start, end, gc") + } + if (length(setdiff(unique(counts$chromosome), seq(1:22))) > 0) { + # remove sex chromosomes + cat("Trying to remove sex chromosomes and 'chr' prefixes\n") + counts <- subset(counts, !chromosome %in% c("chrX", "chrY", "X", "Y")) + if (sum(grepl("chr", counts$chromosome))==length(counts$chromosome)){ + counts$chromosome <- gsub("chr", "", counts$chromosome) + } + counts$chromosome <- as.numeric(counts$chromosome) + if (length(setdiff(unique(counts$chromosome), seq(1:22))) > 0) + stop("chromosome must take value in range 1-22 (support for sex chromosomes to come)") + } + library(plyr) + counts <- arrange(counts, chromosome, start) + if (p <= 0){ + stop("parameter p must be positive") + } + if (Tnum <= 0){ + stop("parameter Tnum must be positive") + } + if (D <= 0){ + stop("parameter D must be positive") + } + if (numrefs <= 0){ + stop("parameter numrefs must be positive") + } + num.cnvs <- nrow(xcnvs) + cnv.intervals <- as.character(xcnvs$INTERVAL) + # if no emission probs matrix is passed in, generate a new one + if (is.null(emission.probs)){ + l <- CallCNVs(sample.name, counts, p, Tnum=6, D=70000, numrefs=30, get.dfs=T) + emission.probs <- l[['emission.probs']] + distances <- l[['distances']] + } + forward.m <- GetForwardMatrix(emission.probs, distances, p, Tnum, D) + backward.m <- GetBackwardMatrix(emission.probs, distances, p, Tnum, D) + qualities <- matrix(0, nrow=num.cnvs, ncol=5, + dimnames=list(cnv.intervals, + c("INTERVAL", "NQDel", "SQDel", "NQDup", "SQDup"))) + for (i in 1:num.cnvs){ + interval <- as.character(xcnvs[i, "INTERVAL"]) + targets <- as.numeric(strsplit(as.character(xcnvs[i, "TARGETS"]), ".", fixed=T)[[1]][c(1,3)]) + left.target <- targets[1] + right.target <- targets[2] + likelihoods <- GetModifiedLikelihood(forward.m, backward.m, + emission.probs, distances, + left.target, right.target, + c(DUPLICATION, DELETION), p, Tnum, D) + modified.likelihood <- likelihoods[1]; + unmodified.likelihood <- likelihoods[2] + Prob.All.Normal <- exp(modified.likelihood - unmodified.likelihood) + likelihoods <- GetModifiedLikelihood(forward.m, backward.m, + emission.probs, distances, + left.target, right.target, DELETION, p, Tnum, D) + modified.likelihood <- likelihoods[1]; + unmodified.likelihood <- likelihoods[2] + Prob.No.Deletion <- exp(modified.likelihood - unmodified.likelihood) + likelihoods <- GetModifiedLikelihood(forward.m, backward.m, + emission.probs, distances, + left.target, right.target, DUPLICATION, p, Tnum, D) + modified.likelihood <- likelihoods[1]; + unmodified.likelihood <- likelihoods[2] + Prob.No.Duplication <- exp(modified.likelihood - unmodified.likelihood) + # Check if probabilities greater than 1 are numerical error or bug + Phred <- function(prob){ + return(round(min(99, -10 * log10(1 - prob)))) + } + qualities[i, "NQDel"] <- Phred(Prob.No.Deletion) + qualities[i, "SQDel"] <- Phred(Prob.No.Duplication - Prob.All.Normal) + qualities[i, "NQDup"] <- Phred(Prob.No.Duplication) + qualities[i, "SQDup"] <- Phred(Prob.No.Deletion - Prob.All.Normal) + qualities[i, "INTERVAL"] <- interval + } + qualities <- as.data.frame(qualities, stringsAsFactors=F) + qualities$NQDel <- as.integer(qualities$NQDel) + qualities$NQDup <- as.integer(qualities$NQDup) + qualities$SQDel <- as.integer(qualities$SQDel) + qualities$SQDup <- as.integer(qualities$SQDup) + return(qualities) +} + +# returns data frame with distance to each target from the previous target +# (0 in the case of the first target on chromosome 1, a very big number +# for the first target on each other chromosome--this resets the HMM +# for each chromosome) +GetDistances <- function(counts){ + chromosome <- counts[, "chromosome"] + startbase <- counts[, "start"] + num.nonzero.exons <- length(startbase) + distances <- c(0, startbase[2:num.nonzero.exons] - + startbase[1:(num.nonzero.exons - 1)] + + 1000000000000 * (chromosome[2:num.nonzero.exons] - + chromosome[1:(num.nonzero.exons - 1)])) + return(data.frame(target=counts[, "target"], distance=distances)) +} + +EstimateVariance <- function(counts, ref.sample.names, sample.weights){ + library(Hmisc) + counts$var <- apply(counts[, ref.sample.names], 1, wtd.var, sample.weights, normwt=T) + set.seed(1) + counts.subset <- counts[sample(nrow(counts), min(36000, nrow(counts))), ] + library(mgcv) + # can't do gamma regression with negative + counts.subset$var[counts.subset$var==0] <- 0.1 + fit <- gam(var ~ s(mean) + s(gc), family=Gamma(link=log), data=counts.subset) + # we don't want variance less than Poisson + # we take maximum of genome-wide estimate, method of moments estimate + # and Poisson variance + v.estimate <- pmax(predict(fit, counts, type="response"), counts$var, + counts$mean * 1.01) + return(data.frame(target=counts$target, var.estimate=v.estimate)) +} + +EmissionProbs <- function(test.counts, target.means, + var.estimate, targets){ + num.targets <- length(test.counts) + # calculate the means for the deletion, normal and duplication states + state.target.means <- t(apply(data.frame(x=target.means), 1, function(x) c(x*1/2, x, x*3/2))) + # calculate the expected size (given the predicted variance) + size <- target.means ^ 2 / (var.estimate - target.means) + emission.probs <- matrix(NA, num.targets, 4) + colnames(emission.probs) <- c("target", "delprob", "normalprob", "dupprob") + # calculate the emission probabilities given the read count + size.del <- size + size.dup <- size + size.del <- size / 2 + size.dup <- size * 3 / 2 + emission.probs[, "delprob"] <- dnbinom( + test.counts, + mu=state.target.means[, 1], + size=size.del, log=T) + emission.probs[, "normalprob"] <- dnbinom( + test.counts, + mu=state.target.means[, 2], + size=size, log=T) + emission.probs[, "dupprob"] <- dnbinom( + test.counts, + mu=state.target.means[, 3], + size=size.dup, log=T) + emission.probs[, "target"] <- targets + # some values may be infinite as a result of extreme read count + row.all.inf <- which(apply(emission.probs, 1, function(x){all(is.infinite(x))})) + if (length(row.all.inf) > 0){ + for (i in row.all.inf){ + if (test.counts[i] >= state.target.means[i, 3]){ + emission.probs[i, 2:4] <- c(-Inf, -Inf, -0.01) + } + else if (test.counts[i] <= state.target.means[i, 1]){ + emission.probs[i, 2:4] <- c(-0.01, -Inf, -Inf) + } + else emission.probs[i, 2:4] <- c(-Inf, -0.01, -Inf) + } + } + return(emission.probs) +} + +# Viterbi algorithm +Viterbi <- function(emission.probs.matrix, distances, p, Tnum, D){ + targets <- emission.probs.matrix[, 1] + emission.probs.matrix <- as.matrix(emission.probs.matrix[, 2:4]) + num.exons <- dim(emission.probs.matrix)[1] + viterbi.matrix <- matrix(NA, nrow=num.exons, ncol=NUM.STATES) + viterbi.pointers <- matrix(NA, nrow=num.exons, ncol=NUM.STATES) + initial.state <- log(c(0.0075 / NUM.ABNORMAL.STATES, 1 - 0.0075, 0.0075 / NUM.ABNORMAL.STATES)) + viterbi.matrix[1, ] <- initial.state + emission.probs.matrix[1,] + for (i in 2:num.exons) { + temp.matrix <- viterbi.matrix[i - 1, ] + GetTransitionMatrix(distances$distance[i], p, Tnum, D) + viterbi.matrix[i, ] <- apply(temp.matrix, 2, max) + emission.probs <- c(emission.probs.matrix[i,]) + dim(emission.probs) <- c(NUM.STATES, 1) + viterbi.matrix[i, ] <- viterbi.matrix[i, ] + emission.probs + viterbi.pointers[i, ] <- apply(temp.matrix, 2, which.max) + } + viterbi.states = vector(length = num.exons) + viterbi.states[num.exons] = which.max(viterbi.matrix[num.exons, ]) + for (i in (num.exons - 1):1) { + viterbi.states[i] <- viterbi.pointers[i + 1, viterbi.states[i + 1]] + } + return(data.frame(target=targets, viterbi.state=viterbi.states)) +} + +# returns a transition matrix +# to state +# deletion normal duplication +# deletion +#from state normal +# duplication +GetTransitionMatrix <- function(distance, p, Tnum, D){ + q <- 1 / Tnum + f = exp(-distance/D) + prob.abnormal.abnormal <- f * (1 - q) + (1 - f) * p + prob.abnormal.normal <- f * q + (1 - f) * (1 - 2 * p) + prob.abnormal.diff.abnormal <- (1 - f) * p + prob.normal.normal <- 1 - 2 * p + prob.normal.abnormal <- p + transition.probs <- + c(prob.abnormal.abnormal, prob.abnormal.normal, prob.abnormal.diff.abnormal, + prob.normal.abnormal, prob.normal.normal, prob.normal.abnormal, + prob.abnormal.diff.abnormal, prob.abnormal.normal, prob.abnormal.abnormal) + transition.m = log(matrix(transition.probs, NUM.STATES, NUM.STATES, byrow=TRUE)) + return(transition.m) +} + +# adds two log-space probabilities using the identity +# log (p1 + p2) = log p1 + log(1 + exp(log p2 - log p1)) +AddTwoProbabilities <- function(x, y){ + if (is.infinite(x)) return (y) + if (is.infinite(y)) return (x) + sum.probs <- max(x, y) + log1p(exp(-abs(x - y))) +} + +# adds multiple log-space probabilities +SumProbabilities <- function(x){ + sum.probs <- x[1] + for (i in 2:length(x)){ + sum.probs <- AddTwoProbabilities(sum.probs, x[i]) + } + return(sum.probs) +} + +# finds the data likelihood by summing the product of the corresponding +# forward and backward probabilities at any token (should give the same value +# regardless of the token) +GetLikelihood <- function(forward.matrix, backward.matrix, x){ + SumProbabilities(forward.matrix[x, ] + backward.matrix[x, ]) +} + +# get the forward probabilities +GetForwardMatrix <- function(emission.probs.matrix, distances, p, Tnum, D){ + emission.probs.matrix <- as.matrix(emission.probs.matrix[, 2:4]) + num.exons <- dim(emission.probs.matrix)[1] + forward.matrix <- matrix(NA, nrow=num.exons, ncol=NUM.STATES) # matrix to hold forward probabilities + initial.state <- log(c(0.0075 / NUM.ABNORMAL.STATES, 1 - 0.0075, 0.0075 / NUM.ABNORMAL.STATES)) + forward.matrix[1, ] <- initial.state + emission.probs.matrix[1, ] + for (i in 2:num.exons){ + # compute matrix with probability we were in state j and are now in state i + # in temp.matrix[j, i] (ignoring emission of current token) + temp.matrix <- forward.matrix[i - 1, ] + GetTransitionMatrix(distances$distance[i], p, Tnum, D) + # find the probability that we are in each of the three states + sum.probs <- apply(temp.matrix, 2, SumProbabilities) + forward.matrix[i, ] <- sum.probs + emission.probs.matrix[i, ] + } + return(forward.matrix) +} + +# get the backward probabilities +GetBackwardMatrix <- function(emission.probs.matrix, distances, + p, Tnum, D){ + emission.probs.matrix <- as.matrix(emission.probs.matrix[, 2:4]) + num.exons <- dim(emission.probs.matrix)[1] + backward.matrix <- matrix(NA, nrow=num.exons, ncol=NUM.STATES) # matrix to hold backward probabilities + initial.state <- log(c(0.0075 / NUM.ABNORMAL.STATES, 1 - 0.0075, 0.0075 / NUM.ABNORMAL.STATES)) + backward.matrix[num.exons, ] <- rep(0, NUM.STATES) + for (i in (num.exons - 1):1){ + temp.matrix <- GetTransitionMatrix(distances$distance[i+1], p, Tnum, D) + + matrix(backward.matrix[i + 1, ], 3, 3, byrow=T) + + matrix(emission.probs.matrix[i+1, ], 3, 3, byrow=T) + backward.matrix[i, ] <- apply(temp.matrix, 1, SumProbabilities) + } + final.prob <- backward.matrix[1, ] + emission.probs.matrix[1, ] + initial.state + return(backward.matrix) +} + +# find the likelihood of the data given that certain states are disallowed +# between start target and end target +GetModifiedLikelihood <- function(forward.matrix, backward.matrix, emission.probs.matrix, distances, + start.target, end.target, disallowed.states, p, Tnum, D){ + targets <- emission.probs.matrix[, 1] + emission.probs.matrix <- as.matrix(emission.probs.matrix[, 2:4]) + # there may be missing targets in this sample, we genotype the largest stretch of + # targets that lie in the CNV + left.target <- min(which(targets >= start.target)) + right.target <- max(which(targets <= end.target)) + num.exons <- dim(emission.probs.matrix)[1] + unmodified.likelihood <- GetLikelihood(forward.matrix, + backward.matrix, min(right.target + 1, num.exons)) + #right.target or left.target may be empty + + #if (right.target >= left.target) return(c(NA, unmodified.likelihood)) + stopifnot(right.target >= left.target) + modified.emission.probs.matrix <- emission.probs.matrix + modified.emission.probs.matrix[left.target:right.target, + disallowed.states] <- -Inf + + # if the start target is the first target we need to recalculate the + # forward probabilities + # for that target, using the modified emission probabilities + if (left.target == 1){ + initial.state <- log(c(0.0075 / NUM.ABNORMAL.STATES, 1 - 0.0075, 0.0075 / NUM.ABNORMAL.STATES)) + forward.matrix[1, ] <- initial.state + modified.emission.probs.matrix[1, ] + left.target <- left.target + 1 + } + for (i in seq(left.target, min(right.target + 1, num.exons))){ + # compute matrix with probability we were in state j and are now in state i + # in temp.matrix[j, i] (ignoring emission of current token) + temp.matrix <- forward.matrix[i - 1, ] + GetTransitionMatrix(distances$distance[i], p, Tnum, D) + # find the probability that we are in each of the three states + sum.probs <- apply(temp.matrix, 2, SumProbabilities) + if (!i == (right.target + 1)){ + forward.matrix[i, ] <- sum.probs + modified.emission.probs.matrix[i, ] + } else{ + forward.matrix[i, ] <- sum.probs + emission.probs.matrix[i, ] + } + } + # find the modified likelihood of the sequence + modified.likelihood <- GetLikelihood(forward.matrix, backward.matrix, min(right.target + 1, num.exons)) + return(c(modified.likelihood, unmodified.likelihood)) +} + +SummarizeCNVs <- function(cnv.targets, counts, sample.name, state){ + sample.name <- sample.name + cnv.type <- ifelse(state==3, "DUP", "DEL") + cnv.start <- min(cnv.targets$target) + cnv.end <- max(cnv.targets$target) + cnv.chromosome <- counts[cnv.start, "chromosome"] + cnv.start.base <- counts[cnv.start, "start"] + cnv.start.target <- counts[cnv.start, "target"] + cnv.end.base <- counts[cnv.end, "end"] + cnv.end.target <- counts[cnv.end, "target"] + cnv.kbs <- (cnv.end.base - cnv.start.base) / 1000 + cnv.midbp <- round((cnv.end.base - cnv.start.base) / 2) + cnv.start.base + cnv.targets <- paste(cnv.start.target, "..", cnv.end.target, sep="") + cnv.interval <- paste(cnv.chromosome, ":", cnv.start.base, "-", cnv.end.base, sep="") + num.targets <- cnv.end.target - cnv.start.target + 1 + return(data.frame(sample.name=sample.name, cnv.type=cnv.type, cnv.interval=cnv.interval, + cnv.kbs=cnv.kbs, cnv.chromosome=cnv.chromosome, + cnv.midbp=cnv.midbp, cnv.targets=cnv.targets, num.targets=num.targets)) +} + +PrintCNVs <- function(test.sample.name, viterbi.state, + nonzero.counts){ + consecutiveGroups <- function(sequence){ + num <- length(sequence) + group <- 1 + groups <- rep(0, num) + groups[1] <- group + if (num > 1){ + for (i in 2:num){ + if (!sequence[i] == (sequence[i - 1] + 1)) group <- group + 1 + groups[i] <- group + } + } + return(groups) + } + num.duplications <- 0 + num.deletions <- 0 + for (state in c(1, 3)){ + cnv.targets <- which(viterbi.state$viterbi.state == state) + if (!length(cnv.targets) == 0){ + groups <- consecutiveGroups(cnv.targets) + library(plyr) + cnvs.temp.df <- ddply(data.frame(target=cnv.targets, group=groups), + "group", SummarizeCNVs, nonzero.counts, test.sample.name, + state) + if (state == 1){ + deletions.df <- cnvs.temp.df + if (!is.null(dim(deletions.df))){ + num.deletions <- dim(deletions.df)[1] + } + } else { + duplications.df <- cnvs.temp.df + if (!is.null(dim(duplications.df))){ + num.duplications <- dim(duplications.df)[1] + } + } + } + } + num.calls <- num.deletions + num.duplications + cat(num.calls, "CNVs called in sample", test.sample.name, "\n") + if (num.deletions == 0 & num.duplications == 0){ + df <- data.frame(SAMPLE=character(0), CNV=character(0), INTERVAL=character(0), + KB=numeric(0), CHR=character(0), + MID_BP=numeric(), TARGETS=character(0), NUM_TARG=numeric(0), Q_SOME=numeric(0), MLCN=numeric(0)) + return(df) + } + if (num.deletions > 0 & num.duplications > 0){ + cnvs.df <- rbind(deletions.df, duplications.df) + } else { + ifelse(num.deletions > 0, + cnvs.df <- deletions.df, cnvs.df <- duplications.df) + } + xcnv <- cbind(cnvs.df[, c("sample.name", "cnv.type", "cnv.interval", + "cnv.kbs", "cnv.chromosome", "cnv.midbp", + "cnv.targets", "num.targets")], 0) + colnames(xcnv) <- c("SAMPLE", "CNV", "INTERVAL", "KB", "CHR", "MID_BP", "TARGETS", + "NUM_TARG", "MLCN") + xcnv$Q_SOME <- NA + return(xcnv) +} + +CalcCopyNumber <- function(data, cnvs, homdel.mean){ + for (i in 1:nrow(cnvs)){ + cnv <- cnvs[i, ] + targets <- as.numeric(unlist(strsplit(as.character(cnv$TARGETS), "..", fixed=T))) + cnv.data <- subset(data, target >= targets[1] & target <= targets[2]) + state.target.means <- t(apply(data.frame(x=cnv.data$countsmean), 1, + function(x) c(C1=x*1/2, C2=x, C3=x*3/2, + C4=x * 2, C5=x * 5/2, C6=x*6/2))) + # calculate the expected size (given the predicted variance) + size <- cnv.data$countsmean ^ 2 / (cnv.data$varestimate - cnv.data$countsmean) + emission.probs <- matrix(NA, nrow(cnv.data), 7) + colnames(emission.probs) <- c("C0", "C1", "C2", "C3", "C4", "C5", "C6") + #colnames(emission.probs) <- c("target", "delprob", "normalprob", "dupprob") + # calculate the emission probabilities given the read count + emission.probs[, 1] <- dpois(cnv.data$sample, homdel.mean, log=T) + for (s in 1:6){ + size.state <- size * s/2 + emission.probs[, s+1] <- dnbinom(cnv.data$sample, mu=state.target.means[, s], + size=size.state, log=T) + } + cs <- colSums(emission.probs) + ml.state <- which.max(cs) - 1 + if (ml.state==2){ + ml.state <- ifelse(cnv$CNV=="DEL", 1, 3) + } + cnvs$MLCN[i] <- ml.state + } + return(cnvs) +} + diff --git a/R/CNV.SIMULATOR/R/run_CNV.SIMULATOR.R b/R/CNV.SIMULATOR/R/run_CNV.SIMULATOR.R new file mode 100644 index 0000000..65ddb36 --- /dev/null +++ b/R/CNV.SIMULATOR/R/run_CNV.SIMULATOR.R @@ -0,0 +1,32 @@ +Test <- function(){ + # read in the data + gc <- read.table("gc.txt")$V2 + canoes.reads <- read.table("canoes.reads.txt") + # rename the columns of canoes.reads + sample.names <- paste("S", seq(1:26), sep="") + names(canoes.reads) <- c("chromosome", "start", "end", sample.names) + # create a vector of consecutive target ids + target <- seq(1, nrow(canoes.reads)) + # combine the data into one data frame + canoes.reads <- cbind(target, gc, canoes.reads) + # call CNVs in each sample + # create a vector to hold the results for each sample + xcnv.list <- vector('list', length(sample.names)) + for (i in 1:length(sample.names)){ + xcnv.list[[i]] <- CallCNVs(sample.names[i], canoes.reads) + } + # combine the results into one data frame + xcnvs <- do.call('rbind', xcnv.list) + # inspect the first two CNV calls + print(head(xcnvs, 2)) + # plot all the CNV calls to a pdf + pdf("CNVplots.pdf") + for (i in 1:nrow(xcnvs)){ + PlotCNV(canoes.reads, xcnvs[i, "SAMPLE"], xcnvs[i, "TARGETS"]) + } + dev.off() + # genotype all the CNVs calls made above in sample S2 + genotyping.S2 <- GenotypeCNVs(xcnvs, "S2", canoes.reads) + # inspect the genotype scores for the first two CNV calls + print(head(genotyping.S2, 2)) +} From 6dfcaa802870124cc601e5e076dc7731378e9e28 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Wiktor=20Ku=C5=9Bmirek?= Date: Thu, 11 Oct 2018 09:03:29 +0200 Subject: [PATCH 103/114] CNV.SIMULATOR package --- Jenkinsfile | 1 + R/CNV.SIMULATOR/DESCRIPTION | 6 +- R/CNV.SIMULATOR/R/functions_CNV.SIMULATOR.R | 803 +++----------------- R/CNV.SIMULATOR/R/run_CNV.SIMULATOR.R | 67 +- 4 files changed, 159 insertions(+), 718 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index 70682e1..cc0bf7e 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -26,6 +26,7 @@ pipeline { sh "cd R && R CMD build CANOES/ && curl -v --user ${NEXUS_USER}:${NEXUS_PASS} --upload-file CANOES_0.0.1.tar.gz http://zsibio.ii.pw.edu.pl/nexus/repository/r-zsibio/src/contrib/CANOES_0.0.1.tar.gz" sh "cd R && R CMD build CNVCALLER.RUNNER/ && curl -v --user ${NEXUS_USER}:${NEXUS_PASS} --upload-file CNVCALLER.RUNNER_0.0.1.tar.gz http://zsibio.ii.pw.edu.pl/nexus/repository/r-zsibio/src/contrib/CNVCALLER.RUNNER_0.0.1.tar.gz" sh "cd R && R CMD build CNVCALLER.EVALUATOR/ && curl -v --user ${NEXUS_USER}:${NEXUS_PASS} --upload-file CNVCALLER.EVALUATOR_0.0.1.tar.gz http://zsibio.ii.pw.edu.pl/nexus/repository/r-zsibio/src/contrib/CNVCALLER.EVALUATOR_0.0.1.tar.gz" + sh "cd R && R CMD build CNV.SIMULATOR/ && curl -v --user ${NEXUS_USER}:${NEXUS_PASS} --upload-file CNV.SIMULATOR_0.0.1.tar.gz http://zsibio.ii.pw.edu.pl/nexus/repository/r-zsibio/src/contrib/CNV.SIMULATOR_0.0.1.tar.gz" } } diff --git a/R/CNV.SIMULATOR/DESCRIPTION b/R/CNV.SIMULATOR/DESCRIPTION index 0824435..0acb0de 100644 --- a/R/CNV.SIMULATOR/DESCRIPTION +++ b/R/CNV.SIMULATOR/DESCRIPTION @@ -1,11 +1,11 @@ -Package: CANOES -Title: CANOES Package +Package: CNV.SIMULATOR +Title: CNV.SIMULATOR A Package To Generate Artificial CNVs Version: 0.0.1 Authors@R: c( person("Tomasz", "Gambin", email = "tgambin@gmail.com", role = c("aut", "cre")), person("Marek", "Wiewiórka", email = "marek.wiewiorka@gmail.com", role = c("aut")), person("Wiktor", "Kuśmirek", email = "kusmirekwiktor@gmail.com", role = c("aut"))) -Description: An implementation of the CANOES package in R. +Description: An package to generate artificial CNVs. Depends: R (>= 3.2.3), plyr (>= 1.8.4), diff --git a/R/CNV.SIMULATOR/R/functions_CNV.SIMULATOR.R b/R/CNV.SIMULATOR/R/functions_CNV.SIMULATOR.R index b3077b4..779ba29 100644 --- a/R/CNV.SIMULATOR/R/functions_CNV.SIMULATOR.R +++ b/R/CNV.SIMULATOR/R/functions_CNV.SIMULATOR.R @@ -1,685 +1,120 @@ -# Constants -NUM.ABNORMAL.STATES=2 -NUM.STATES=3 -DELETION=1 -NORMAL=2 -DUPLICATION=3 - -# PlotCNV -# Plots count data for targets of interest -# highlights sample of interest in red, -# highlights area of interest with a black line -# highlights probe locations with black dots -# Arguments: -# counts: -# count matrix, with column "target" with target numbers -# and sample data in columns 6:end -# sample.name: -# sample of interest (will be highlighted in red in figure) -# (should correspond to a column in counts) -# targets: -# targets of interest in the form start.target..end.target -# offset: -# number of targets to add on either end (default=1) -# Returns: -# returns nothing -PlotCNV <- function(counts, sample.name, targets, offset=1){ - sample.name <- as.character(sample.name) - if (!sample.name %in% names(counts)){stop("No column for sample ", sample.name, " in counts matrix")} - if (length(setdiff("target", names(counts)[1:5]) > 0)){ - stop("counts matrix must have column named target") - } - t <- as.character(targets) - start.target <- as.numeric(unlist(strsplit(t, "..", fixed=T))[1]) - end.target <- as.numeric(unlist(strsplit(t, "..", fixed=T))[2]) - if (!start.target %in% counts$target){ - stop("no data for start.target in counts matrix") - } - if (!end.target %in% counts$target){ - stop("no data for end.target in counts matrix") - } - if ((start.target - offset) %in% counts$target){ - start.target <- start.target - offset - } - if ((end.target + offset) %in% counts$target){ - end.target <- end.target + offset - } - ref.sample.names <- setdiff(as.character(names(counts)[-seq(1,5)]), - sample.name) - data <- subset(counts, target >= start.target & target <= end.target) - sample.data <- data[, sample.name] - means <- apply(data[, ref.sample.names], 1, mean) - sd <- sqrt(apply(data[, ref.sample.names], 1, var)) - refs.z.scores <- matrix(NA, nrow(data), length(ref.sample.names)) - sample.z.score <- numeric(length = nrow(data)) - for (i in seq(1, dim(data)[1])){ - refs.z.scores[i, ] <- as.numeric((data[i, ref.sample.names] - means[i]) / - max(0.000001, sd[i])) - sample.z.score[i] <- (sample.data[i] - means[i]) / max(0.000001, sd[i]) - } - ylim <- max(abs(refs.z.scores), abs(sample.z.score)) - plot(seq(-6, 6), seq(-6, 6), - xlim=c(data[1, "start"], data[dim(data)[1], "start"]), - ylim=c(-ylim - 0.1, ylim + 0.1), type="n", xlab="", ylab="Z-score") - for (i in seq(1, length(ref.sample.names))){ - lines(data[, "start"], refs.z.scores[, i], col="#2f4f4f85") - } - lines(data[, "start"], sample.z.score, col="red", lwd=3) - points(data[, "start"], rep(-ylim - 0.05, length(data[, "start"])), pch=20) - lines( c(data[1 + offset, "start"], data[nrow(data) - offset, "end"]) , - c(ylim+0.2, ylim+0.2), lwd=2) - title(main=paste("Sample ", sample.name, ", ", - counts$chromosome[start.target], ":", - data$start[1], "-", data$end[nrow(data)], sep="")) -} - -# CallCNVs -# Calls CNVs in sample of interest -# Arguments: -# sample.name: -# sample to call CNVs in (should correspond to a column in counts) -# counts: -# count matrix, first five columns should be -# target: consecutive numbers for targets (integer) -# chromosome: chromosome number (integer-valued) -# (support for sex chromosomes to come) -# start: start position of probe (integer) -# end: end position of probe (integer) -# gc: gc content (real between 0 and 1) -# subsequent columns should include counts for each probe for samples -# p: -# average rate of occurrence of CNVs (real) default is 1e-08 -# D: -# expected distance between targets in a CNV (integer) default is 70,000 -# Tnum: -# expected number of targets in a CNV (integer) default is 6 -# numrefs -# maximum number of reference samples to use (integer) default is 30 -# the weighted variance calculations will take a long time if too -# many reference samples are used -# Returns: -# data frame with the following columns: -# SAMPLE: name of sample -# CNV: DEL of DUP -# INTERVAL: CNV coordinates in the form chr:start-stop -# KB: length of CNV in kilobases -# CHR: chromosome -# MID_BP: middle base pair of CNV -# TARGETS: target numbers of CNV in the form start..stop -# NUM_TARG: how many targets are in the CNV -# Q_SOME: a Phred-scaled quality score for the CNV -CallCNVs <- function(sample.name, counts, p=1e-08, Tnum=6, D=70000, numrefs=30, get.dfs=F, homdel.mean=0.2){ - if (!sample.name %in% names(counts)){stop("No column for sample ", sample.name, " in counts matrix")} - if (length(setdiff(names(counts)[1:5], c("target", "chromosome", "start", "end", "gc"))) > 0){ - stop("First five columns of counts matrix must be target, chromosome, start, end, gc") - } - if (length(setdiff(unique(counts$chromosome), seq(1:22))) > 0) { - # remove sex chromosomes - cat("Trying to remove sex chromosomes and 'chr' prefixes\n") - counts <- subset(counts, !chromosome %in% c("chrX", "chrY", "X", "Y")) - if (sum(grepl("chr", counts$chromosome))==length(counts$chromosome)){ - counts$chromosome <- gsub("chr", "", counts$chromosome) - } - counts$chromosome <- as.numeric(counts$chromosome) - if (length(setdiff(unique(counts$chromosome), seq(1:22))) > 0) - stop("chromosome must take value in range 1-22 (support for sex chromosomes to come)") - } - library(plyr) - counts <- arrange(counts, chromosome, start) - if (p <= 0){ - stop("parameter p must be positive") - } - if (Tnum <= 0){ - stop("parameter Tnum must be positive") - } - if (D <= 0){ - stop("parameter D must be positive") - } - if (numrefs <= 0){ - stop("parameter numrefs must be positive") - } - sample.names <- colnames(counts)[-seq(1,5)] - # find mean coverage of probes - mean.counts <- mean(apply(counts[, sample.names], 2, mean)) - # normalize counts; round so we can use negative binomial - counts[, sample.names] <- apply(counts[, sample.names], 2, - function(x, mean.counts) - round(x * mean.counts / mean(x)), mean.counts) - # calculate covariance of read count across samples - cov <- cor(counts[, sample.names], counts[, sample.names]) - reference.samples <- setdiff(sample.names, sample.name) - covariances <- cov[sample.name, reference.samples] - reference.samples <- names(sort(covariances, - decreasing=T)[1:min(numrefs, length(covariances))]) - sample.mean.counts <- mean(counts[, sample.name]) - sample.sumcounts <- apply(counts[, reference.samples], 2, sum) - # normalize reference samples to sample of interest - counts[, reference.samples] <- apply(counts[, reference.samples], 2, - function(x, sample.mean.counts) - round(x * sample.mean.counts / - mean(x)), sample.mean.counts) - # select reference samples and weightings using non-negative least squares - b <- counts[, sample.name] - A <- as.matrix(counts[, reference.samples]) - library(nnls) - all <- nnls(A, b)$x - est <- matrix(0, nrow=50, ncol=length(reference.samples)) - set.seed(1) - for (i in 1:50){ - d <- sample(nrow(A), min(500, nrow(A))) - est[i, ] <- nnls(A[d, ], b[d])$x - } - weights <- colMeans(est) - sample.weights <- weights / sum(weights) - library(Hmisc) - # calculate weighted mean of read count - # this is used to calculate emission probabilities - counts$mean <- apply(counts[, reference.samples], - 1, wtd.mean, sample.weights) - targets <- counts$target - # exclude probes with all zero counts - nonzero.rows <- counts$mean > 0 - nonzero.rows.df <- data.frame(target=counts$target, - nonzero.rows=nonzero.rows) - - counts <- counts[nonzero.rows, ] - # get the distances between consecutive probes - distances <- GetDistances(counts) - # estimate the read count variance at each probe - var.estimate <- EstimateVariance(counts, reference.samples, - sample.weights) - emission.probs <- EmissionProbs(counts[, sample.name], - counts$mean, var.estimate$var.estimate, - counts[, "target"]) - if (get.dfs){ - return(list(emission.probs=emission.probs, distances=distances)) - } - # call CNVs with the Viterbi algorithm - viterbi.state <- Viterbi(emission.probs, distances, p, Tnum, D) - # format the CNVs - cnvs <- PrintCNVs(sample.name, viterbi.state, - counts) - # if there aren't too many CNVs, calculate the Q_SOME - if (nrow(cnvs) > 0 & nrow(cnvs) <= 50){ - qualities <- GenotypeCNVs(cnvs, sample.name, counts, p, Tnum, D, numrefs, - emission.probs=emission.probs, - distances=distances) - for (i in 1:nrow(cnvs)){ - cnvs$Q_SOME[i] <- ifelse(cnvs$CNV[i]=="DEL", qualities[i, "SQDel"], - qualities[i, "SQDup"]) - } - } - data <- as.data.frame(cbind(counts$target, counts$mean, var.estimate$var.estimate, counts[, sample.name])) - names(data) <- c("target", "countsmean", "varestimate", "sample") - if (nrow(cnvs) > 0){ - cnvs <- CalcCopyNumber(data, cnvs, homdel.mean) - } - return(cnvs) -} - -# GenotypeCNVs -# Genotype CNVs in sample of interest -# Arguments: -# xcnv -# data frame with the following columns, and one row for each -# CNV to genotype -# INTERVAL: CNV coordinates in the form chr:start-stop -# TARGETS: target numbers of CNV in the form start..stop -# these should correspond to the target numbers in counts -# sample.name: -# sample to genotype CNVs in (should correspond to a column in counts) -# counts: -# count matrix, first five columns should be -# target: consecutive numbers for targets (integer) -# chromosome: chromosome number (integer-valued) -# (support for sex chromosomes to come) -# start: start position of probe (integer) -# end: end position of probe (integer) -# gc: gc content (real between 0 and 1) -# subsequent columns should include counts for each probe for samples -# p: -# average rate of occurrence of CNVs (real) default is 1e-08 -# D: -# expected distance between targets in a CNV (integer) default is 70,000 -# Tnum: -# expected number of targets in a CNV (integer) default is 6 -# numrefs -# maximum number of reference samples to use (integer) default is 30 -# the weighted variance calculations will take a long time if too -# many reference samples are used -# emission.probs and distances are for internal use only -# Returns: -# data frame with the following columns and one row for each genotyped CNV: -# INTERVAL: CNV coordinates in the form chr:start-stop -# NQDEL: a Phred-scaled quality score that sample.name has no deletion -# in the interval -# SQDEL: a Phred-scaled quality score that sample.name has a deletion -# in the interval -# NQDUP and SQDUP: same, but for a duplication -GenotypeCNVs <- function(xcnvs, sample.name, counts, p=1e-08, Tnum=6, - D=70000, numrefs=30, - emission.probs=NULL, - distances=NULL){ - if (!sample.name %in% names(counts)){stop("No column for sample ", sample.name, " in counts matrix")} - if (length(setdiff(names(counts)[1:5], c("target", "chromosome", "start", "end", "gc"))) > 0){ - stop("First five columns of counts matrix must be target, chromosome, start, end, gc") - } - if (length(setdiff(unique(counts$chromosome), seq(1:22))) > 0) { - # remove sex chromosomes - cat("Trying to remove sex chromosomes and 'chr' prefixes\n") - counts <- subset(counts, !chromosome %in% c("chrX", "chrY", "X", "Y")) - if (sum(grepl("chr", counts$chromosome))==length(counts$chromosome)){ - counts$chromosome <- gsub("chr", "", counts$chromosome) - } - counts$chromosome <- as.numeric(counts$chromosome) - if (length(setdiff(unique(counts$chromosome), seq(1:22))) > 0) - stop("chromosome must take value in range 1-22 (support for sex chromosomes to come)") - } - library(plyr) - counts <- arrange(counts, chromosome, start) - if (p <= 0){ - stop("parameter p must be positive") - } - if (Tnum <= 0){ - stop("parameter Tnum must be positive") - } - if (D <= 0){ - stop("parameter D must be positive") - } - if (numrefs <= 0){ - stop("parameter numrefs must be positive") - } - num.cnvs <- nrow(xcnvs) - cnv.intervals <- as.character(xcnvs$INTERVAL) - # if no emission probs matrix is passed in, generate a new one - if (is.null(emission.probs)){ - l <- CallCNVs(sample.name, counts, p, Tnum=6, D=70000, numrefs=30, get.dfs=T) - emission.probs <- l[['emission.probs']] - distances <- l[['distances']] - } - forward.m <- GetForwardMatrix(emission.probs, distances, p, Tnum, D) - backward.m <- GetBackwardMatrix(emission.probs, distances, p, Tnum, D) - qualities <- matrix(0, nrow=num.cnvs, ncol=5, - dimnames=list(cnv.intervals, - c("INTERVAL", "NQDel", "SQDel", "NQDup", "SQDup"))) - for (i in 1:num.cnvs){ - interval <- as.character(xcnvs[i, "INTERVAL"]) - targets <- as.numeric(strsplit(as.character(xcnvs[i, "TARGETS"]), ".", fixed=T)[[1]][c(1,3)]) - left.target <- targets[1] - right.target <- targets[2] - likelihoods <- GetModifiedLikelihood(forward.m, backward.m, - emission.probs, distances, - left.target, right.target, - c(DUPLICATION, DELETION), p, Tnum, D) - modified.likelihood <- likelihoods[1]; - unmodified.likelihood <- likelihoods[2] - Prob.All.Normal <- exp(modified.likelihood - unmodified.likelihood) - likelihoods <- GetModifiedLikelihood(forward.m, backward.m, - emission.probs, distances, - left.target, right.target, DELETION, p, Tnum, D) - modified.likelihood <- likelihoods[1]; - unmodified.likelihood <- likelihoods[2] - Prob.No.Deletion <- exp(modified.likelihood - unmodified.likelihood) - likelihoods <- GetModifiedLikelihood(forward.m, backward.m, - emission.probs, distances, - left.target, right.target, DUPLICATION, p, Tnum, D) - modified.likelihood <- likelihoods[1]; - unmodified.likelihood <- likelihoods[2] - Prob.No.Duplication <- exp(modified.likelihood - unmodified.likelihood) - # Check if probabilities greater than 1 are numerical error or bug - Phred <- function(prob){ - return(round(min(99, -10 * log10(1 - prob)))) - } - qualities[i, "NQDel"] <- Phred(Prob.No.Deletion) - qualities[i, "SQDel"] <- Phred(Prob.No.Duplication - Prob.All.Normal) - qualities[i, "NQDup"] <- Phred(Prob.No.Duplication) - qualities[i, "SQDup"] <- Phred(Prob.No.Deletion - Prob.All.Normal) - qualities[i, "INTERVAL"] <- interval - } - qualities <- as.data.frame(qualities, stringsAsFactors=F) - qualities$NQDel <- as.integer(qualities$NQDel) - qualities$NQDup <- as.integer(qualities$NQDup) - qualities$SQDel <- as.integer(qualities$SQDel) - qualities$SQDup <- as.integer(qualities$SQDup) - return(qualities) -} - -# returns data frame with distance to each target from the previous target -# (0 in the case of the first target on chromosome 1, a very big number -# for the first target on each other chromosome--this resets the HMM -# for each chromosome) -GetDistances <- function(counts){ - chromosome <- counts[, "chromosome"] - startbase <- counts[, "start"] - num.nonzero.exons <- length(startbase) - distances <- c(0, startbase[2:num.nonzero.exons] - - startbase[1:(num.nonzero.exons - 1)] + - 1000000000000 * (chromosome[2:num.nonzero.exons] - - chromosome[1:(num.nonzero.exons - 1)])) - return(data.frame(target=counts[, "target"], distance=distances)) -} - -EstimateVariance <- function(counts, ref.sample.names, sample.weights){ - library(Hmisc) - counts$var <- apply(counts[, ref.sample.names], 1, wtd.var, sample.weights, normwt=T) - set.seed(1) - counts.subset <- counts[sample(nrow(counts), min(36000, nrow(counts))), ] - library(mgcv) - # can't do gamma regression with negative - counts.subset$var[counts.subset$var==0] <- 0.1 - fit <- gam(var ~ s(mean) + s(gc), family=Gamma(link=log), data=counts.subset) - # we don't want variance less than Poisson - # we take maximum of genome-wide estimate, method of moments estimate - # and Poisson variance - v.estimate <- pmax(predict(fit, counts, type="response"), counts$var, - counts$mean * 1.01) - return(data.frame(target=counts$target, var.estimate=v.estimate)) -} - -EmissionProbs <- function(test.counts, target.means, - var.estimate, targets){ - num.targets <- length(test.counts) - # calculate the means for the deletion, normal and duplication states - state.target.means <- t(apply(data.frame(x=target.means), 1, function(x) c(x*1/2, x, x*3/2))) - # calculate the expected size (given the predicted variance) - size <- target.means ^ 2 / (var.estimate - target.means) - emission.probs <- matrix(NA, num.targets, 4) - colnames(emission.probs) <- c("target", "delprob", "normalprob", "dupprob") - # calculate the emission probabilities given the read count - size.del <- size - size.dup <- size - size.del <- size / 2 - size.dup <- size * 3 / 2 - emission.probs[, "delprob"] <- dnbinom( - test.counts, - mu=state.target.means[, 1], - size=size.del, log=T) - emission.probs[, "normalprob"] <- dnbinom( - test.counts, - mu=state.target.means[, 2], - size=size, log=T) - emission.probs[, "dupprob"] <- dnbinom( - test.counts, - mu=state.target.means[, 3], - size=size.dup, log=T) - emission.probs[, "target"] <- targets - # some values may be infinite as a result of extreme read count - row.all.inf <- which(apply(emission.probs, 1, function(x){all(is.infinite(x))})) - if (length(row.all.inf) > 0){ - for (i in row.all.inf){ - if (test.counts[i] >= state.target.means[i, 3]){ - emission.probs[i, 2:4] <- c(-Inf, -Inf, -0.01) - } - else if (test.counts[i] <= state.target.means[i, 1]){ - emission.probs[i, 2:4] <- c(-0.01, -Inf, -Inf) - } - else emission.probs[i, 2:4] <- c(-Inf, -0.01, -Inf) - } - } - return(emission.probs) -} - -# Viterbi algorithm -Viterbi <- function(emission.probs.matrix, distances, p, Tnum, D){ - targets <- emission.probs.matrix[, 1] - emission.probs.matrix <- as.matrix(emission.probs.matrix[, 2:4]) - num.exons <- dim(emission.probs.matrix)[1] - viterbi.matrix <- matrix(NA, nrow=num.exons, ncol=NUM.STATES) - viterbi.pointers <- matrix(NA, nrow=num.exons, ncol=NUM.STATES) - initial.state <- log(c(0.0075 / NUM.ABNORMAL.STATES, 1 - 0.0075, 0.0075 / NUM.ABNORMAL.STATES)) - viterbi.matrix[1, ] <- initial.state + emission.probs.matrix[1,] - for (i in 2:num.exons) { - temp.matrix <- viterbi.matrix[i - 1, ] + GetTransitionMatrix(distances$distance[i], p, Tnum, D) - viterbi.matrix[i, ] <- apply(temp.matrix, 2, max) - emission.probs <- c(emission.probs.matrix[i,]) - dim(emission.probs) <- c(NUM.STATES, 1) - viterbi.matrix[i, ] <- viterbi.matrix[i, ] + emission.probs - viterbi.pointers[i, ] <- apply(temp.matrix, 2, which.max) - } - viterbi.states = vector(length = num.exons) - viterbi.states[num.exons] = which.max(viterbi.matrix[num.exons, ]) - for (i in (num.exons - 1):1) { - viterbi.states[i] <- viterbi.pointers[i + 1, viterbi.states[i + 1]] - } - return(data.frame(target=targets, viterbi.state=viterbi.states)) -} - -# returns a transition matrix -# to state -# deletion normal duplication -# deletion -#from state normal -# duplication -GetTransitionMatrix <- function(distance, p, Tnum, D){ - q <- 1 / Tnum - f = exp(-distance/D) - prob.abnormal.abnormal <- f * (1 - q) + (1 - f) * p - prob.abnormal.normal <- f * q + (1 - f) * (1 - 2 * p) - prob.abnormal.diff.abnormal <- (1 - f) * p - prob.normal.normal <- 1 - 2 * p - prob.normal.abnormal <- p - transition.probs <- - c(prob.abnormal.abnormal, prob.abnormal.normal, prob.abnormal.diff.abnormal, - prob.normal.abnormal, prob.normal.normal, prob.normal.abnormal, - prob.abnormal.diff.abnormal, prob.abnormal.normal, prob.abnormal.abnormal) - transition.m = log(matrix(transition.probs, NUM.STATES, NUM.STATES, byrow=TRUE)) - return(transition.m) -} - -# adds two log-space probabilities using the identity -# log (p1 + p2) = log p1 + log(1 + exp(log p2 - log p1)) -AddTwoProbabilities <- function(x, y){ - if (is.infinite(x)) return (y) - if (is.infinite(y)) return (x) - sum.probs <- max(x, y) + log1p(exp(-abs(x - y))) -} - -# adds multiple log-space probabilities -SumProbabilities <- function(x){ - sum.probs <- x[1] - for (i in 2:length(x)){ - sum.probs <- AddTwoProbabilities(sum.probs, x[i]) - } - return(sum.probs) -} - -# finds the data likelihood by summing the product of the corresponding -# forward and backward probabilities at any token (should give the same value -# regardless of the token) -GetLikelihood <- function(forward.matrix, backward.matrix, x){ - SumProbabilities(forward.matrix[x, ] + backward.matrix[x, ]) -} - -# get the forward probabilities -GetForwardMatrix <- function(emission.probs.matrix, distances, p, Tnum, D){ - emission.probs.matrix <- as.matrix(emission.probs.matrix[, 2:4]) - num.exons <- dim(emission.probs.matrix)[1] - forward.matrix <- matrix(NA, nrow=num.exons, ncol=NUM.STATES) # matrix to hold forward probabilities - initial.state <- log(c(0.0075 / NUM.ABNORMAL.STATES, 1 - 0.0075, 0.0075 / NUM.ABNORMAL.STATES)) - forward.matrix[1, ] <- initial.state + emission.probs.matrix[1, ] - for (i in 2:num.exons){ - # compute matrix with probability we were in state j and are now in state i - # in temp.matrix[j, i] (ignoring emission of current token) - temp.matrix <- forward.matrix[i - 1, ] + GetTransitionMatrix(distances$distance[i], p, Tnum, D) - # find the probability that we are in each of the three states - sum.probs <- apply(temp.matrix, 2, SumProbabilities) - forward.matrix[i, ] <- sum.probs + emission.probs.matrix[i, ] - } - return(forward.matrix) -} - -# get the backward probabilities -GetBackwardMatrix <- function(emission.probs.matrix, distances, - p, Tnum, D){ - emission.probs.matrix <- as.matrix(emission.probs.matrix[, 2:4]) - num.exons <- dim(emission.probs.matrix)[1] - backward.matrix <- matrix(NA, nrow=num.exons, ncol=NUM.STATES) # matrix to hold backward probabilities - initial.state <- log(c(0.0075 / NUM.ABNORMAL.STATES, 1 - 0.0075, 0.0075 / NUM.ABNORMAL.STATES)) - backward.matrix[num.exons, ] <- rep(0, NUM.STATES) - for (i in (num.exons - 1):1){ - temp.matrix <- GetTransitionMatrix(distances$distance[i+1], p, Tnum, D) + - matrix(backward.matrix[i + 1, ], 3, 3, byrow=T) + - matrix(emission.probs.matrix[i+1, ], 3, 3, byrow=T) - backward.matrix[i, ] <- apply(temp.matrix, 1, SumProbabilities) - } - final.prob <- backward.matrix[1, ] + emission.probs.matrix[1, ] + initial.state - return(backward.matrix) -} - -# find the likelihood of the data given that certain states are disallowed -# between start target and end target -GetModifiedLikelihood <- function(forward.matrix, backward.matrix, emission.probs.matrix, distances, - start.target, end.target, disallowed.states, p, Tnum, D){ - targets <- emission.probs.matrix[, 1] - emission.probs.matrix <- as.matrix(emission.probs.matrix[, 2:4]) - # there may be missing targets in this sample, we genotype the largest stretch of - # targets that lie in the CNV - left.target <- min(which(targets >= start.target)) - right.target <- max(which(targets <= end.target)) - num.exons <- dim(emission.probs.matrix)[1] - unmodified.likelihood <- GetLikelihood(forward.matrix, - backward.matrix, min(right.target + 1, num.exons)) - #right.target or left.target may be empty - - #if (right.target >= left.target) return(c(NA, unmodified.likelihood)) - stopifnot(right.target >= left.target) - modified.emission.probs.matrix <- emission.probs.matrix - modified.emission.probs.matrix[left.target:right.target, - disallowed.states] <- -Inf - - # if the start target is the first target we need to recalculate the - # forward probabilities - # for that target, using the modified emission probabilities - if (left.target == 1){ - initial.state <- log(c(0.0075 / NUM.ABNORMAL.STATES, 1 - 0.0075, 0.0075 / NUM.ABNORMAL.STATES)) - forward.matrix[1, ] <- initial.state + modified.emission.probs.matrix[1, ] - left.target <- left.target + 1 - } - for (i in seq(left.target, min(right.target + 1, num.exons))){ - # compute matrix with probability we were in state j and are now in state i - # in temp.matrix[j, i] (ignoring emission of current token) - temp.matrix <- forward.matrix[i - 1, ] + GetTransitionMatrix(distances$distance[i], p, Tnum, D) - # find the probability that we are in each of the three states - sum.probs <- apply(temp.matrix, 2, SumProbabilities) - if (!i == (right.target + 1)){ - forward.matrix[i, ] <- sum.probs + modified.emission.probs.matrix[i, ] - } else{ - forward.matrix[i, ] <- sum.probs + emission.probs.matrix[i, ] - } - } - # find the modified likelihood of the sequence - modified.likelihood <- GetLikelihood(forward.matrix, backward.matrix, min(right.target + 1, num.exons)) - return(c(modified.likelihood, unmodified.likelihood)) -} - -SummarizeCNVs <- function(cnv.targets, counts, sample.name, state){ - sample.name <- sample.name - cnv.type <- ifelse(state==3, "DUP", "DEL") - cnv.start <- min(cnv.targets$target) - cnv.end <- max(cnv.targets$target) - cnv.chromosome <- counts[cnv.start, "chromosome"] - cnv.start.base <- counts[cnv.start, "start"] - cnv.start.target <- counts[cnv.start, "target"] - cnv.end.base <- counts[cnv.end, "end"] - cnv.end.target <- counts[cnv.end, "target"] - cnv.kbs <- (cnv.end.base - cnv.start.base) / 1000 - cnv.midbp <- round((cnv.end.base - cnv.start.base) / 2) + cnv.start.base - cnv.targets <- paste(cnv.start.target, "..", cnv.end.target, sep="") - cnv.interval <- paste(cnv.chromosome, ":", cnv.start.base, "-", cnv.end.base, sep="") - num.targets <- cnv.end.target - cnv.start.target + 1 - return(data.frame(sample.name=sample.name, cnv.type=cnv.type, cnv.interval=cnv.interval, - cnv.kbs=cnv.kbs, cnv.chromosome=cnv.chromosome, - cnv.midbp=cnv.midbp, cnv.targets=cnv.targets, num.targets=num.targets)) -} - -PrintCNVs <- function(test.sample.name, viterbi.state, - nonzero.counts){ - consecutiveGroups <- function(sequence){ - num <- length(sequence) - group <- 1 - groups <- rep(0, num) - groups[1] <- group - if (num > 1){ - for (i in 2:num){ - if (!sequence[i] == (sequence[i - 1] + 1)) group <- group + 1 - groups[i] <- group - } - } - return(groups) - } - num.duplications <- 0 - num.deletions <- 0 - for (state in c(1, 3)){ - cnv.targets <- which(viterbi.state$viterbi.state == state) - if (!length(cnv.targets) == 0){ - groups <- consecutiveGroups(cnv.targets) - library(plyr) - cnvs.temp.df <- ddply(data.frame(target=cnv.targets, group=groups), - "group", SummarizeCNVs, nonzero.counts, test.sample.name, - state) - if (state == 1){ - deletions.df <- cnvs.temp.df - if (!is.null(dim(deletions.df))){ - num.deletions <- dim(deletions.df)[1] - } - } else { - duplications.df <- cnvs.temp.df - if (!is.null(dim(duplications.df))){ - num.duplications <- dim(duplications.df)[1] - } - } - } - } - num.calls <- num.deletions + num.duplications - cat(num.calls, "CNVs called in sample", test.sample.name, "\n") - if (num.deletions == 0 & num.duplications == 0){ - df <- data.frame(SAMPLE=character(0), CNV=character(0), INTERVAL=character(0), - KB=numeric(0), CHR=character(0), - MID_BP=numeric(), TARGETS=character(0), NUM_TARG=numeric(0), Q_SOME=numeric(0), MLCN=numeric(0)) - return(df) - } - if (num.deletions > 0 & num.duplications > 0){ - cnvs.df <- rbind(deletions.df, duplications.df) - } else { - ifelse(num.deletions > 0, - cnvs.df <- deletions.df, cnvs.df <- duplications.df) - } - xcnv <- cbind(cnvs.df[, c("sample.name", "cnv.type", "cnv.interval", - "cnv.kbs", "cnv.chromosome", "cnv.midbp", - "cnv.targets", "num.targets")], 0) - colnames(xcnv) <- c("SAMPLE", "CNV", "INTERVAL", "KB", "CHR", "MID_BP", "TARGETS", - "NUM_TARG", "MLCN") - xcnv$Q_SOME <- NA - return(xcnv) -} - -CalcCopyNumber <- function(data, cnvs, homdel.mean){ - for (i in 1:nrow(cnvs)){ - cnv <- cnvs[i, ] - targets <- as.numeric(unlist(strsplit(as.character(cnv$TARGETS), "..", fixed=T))) - cnv.data <- subset(data, target >= targets[1] & target <= targets[2]) - state.target.means <- t(apply(data.frame(x=cnv.data$countsmean), 1, - function(x) c(C1=x*1/2, C2=x, C3=x*3/2, - C4=x * 2, C5=x * 5/2, C6=x*6/2))) - # calculate the expected size (given the predicted variance) - size <- cnv.data$countsmean ^ 2 / (cnv.data$varestimate - cnv.data$countsmean) - emission.probs <- matrix(NA, nrow(cnv.data), 7) - colnames(emission.probs) <- c("C0", "C1", "C2", "C3", "C4", "C5", "C6") - #colnames(emission.probs) <- c("target", "delprob", "normalprob", "dupprob") - # calculate the emission probabilities given the read count - emission.probs[, 1] <- dpois(cnv.data$sample, homdel.mean, log=T) - for (s in 1:6){ - size.state <- size * s/2 - emission.probs[, s+1] <- dnbinom(cnv.data$sample, mu=state.target.means[, s], - size=size.state, log=T) - } - cs <- colSums(emission.probs) - ml.state <- which.max(cs) - 1 - if (ml.state==2){ - ml.state <- ifelse(cnv$CNV=="DEL", 1, 3) - } - cnvs$MLCN[i] <- ml.state - } - return(cnvs) -} +# build_intersection_matrix <- function(calls, refs){ +# intersection_matrix <- matrix(data=as.integer(0), nrow = nrow(calls), ncol = nrow(refs)) +# if (nrow(intersection_matrix) > 0 && ncol(intersection_matrix) > 0) { +# for (i in 1:nrow(intersection_matrix)) { +# for (j in 1:ncol(intersection_matrix)) { +# if (as.character(calls[i,"sample_name"]) == as.character(refs[j,"sample_name"]) && +# as.character(calls[i,"chr"]) == as.character(refs[j,"chr"]) && +# as.character(calls[i,"cnv"]) == as.character(refs[j,"cnv"])) { +# overlap_length <- calc_overlap_length(calls[i,"st_bp"], +# calls[i,"ed_bp"], +# refs[j,"st_bp"], +# refs[j,"ed_bp"]) +# call_length <- calls[i,"ed_bp"] - calls[i,"st_bp"] +# ref_length <- refs[j,"ed_bp"] - refs[j,"st_bp"] +# overlap_factor <- overlap_length / ((call_length + ref_length) / 2) * 100 +# intersection_matrix[i,j] <- round(overlap_factor, 2) +# } +# } +# } +# } +# intersection_matrix +# } +# +# filter_intersection_matrix_by_overlap_factor <- function(intersection_matrix, min_overlap_factor){ +# if (nrow(intersection_matrix) > 0 && ncol(intersection_matrix) > 0) { +# for (i in 1:nrow(intersection_matrix)) { +# for (j in 1:ncol(intersection_matrix)) { +# if (intersection_matrix[i,j] < min_overlap_factor) { +# intersection_matrix[i,j] <- 0.00 +# } +# } +# } +# } +# intersection_matrix +# } +# +# calc_number_of_different_copy_number_for_cnv <- function(cnv, calls){ +# copy_no <- c() +# for (i in 1:nrow(calls)) { +# if (as.character(calls[i,"chr"]) == as.character(cnv[1,"chr"]) && +# calls[i,"st_bp"] == cnv[1,"st_bp"] && +# calls[i,"ed_bp"] == cnv[1,"ed_bp"] && +# !is.na(calls[i,"copy_no"])) { +# copy_no <- c(copy_no, calls[i,"copy_no"]) +# } +# } +# length(unique(copy_no)) +# } +# +# calc_NA_rate_for_cnv <- function(cnv, calls){ +# num_of_samples <- length(unique(calls[,"sample_name"])) +# num_of_NA <- 0 +# for (i in 1:nrow(calls)) { +# if (as.character(calls[i,"chr"]) == as.character(cnv[1,"chr"]) && +# calls[i,"st_bp"] == cnv[1,"st_bp"] && +# calls[i,"ed_bp"] == cnv[1,"ed_bp"] && +# is.na(calls[i,"cnv"])) { +# num_of_NA <- num_of_NA + 1 +# } +# } +# round(num_of_NA / num_of_samples, 2) +# } +# +# calc_cnv_frequency <- function(cnv, calls){ +# num_of_samples <- length(unique(calls[,"sample_name"])) +# num_of_same_cnv <- 0 +# for (i in 1:nrow(calls)) { +# if (as.character(calls[i,"chr"]) == as.character(cnv[1,"chr"]) && +# calls[i,"st_bp"] == cnv[1,"st_bp"] && +# calls[i,"ed_bp"] == cnv[1,"ed_bp"] && +# as.character(calls[i,"cnv"]) == as.character(cnv[1,"cnv"])) { +# num_of_same_cnv <- num_of_same_cnv + 1 +# } +# } +# round(num_of_same_cnv / num_of_samples, 2) +# } +# +# calc_overlap_length <- function(min1, max1, min2, max2){ +# overlap_length <- max(0, min(max1, max2) - max(min1, min2)) +# overlap_length +# } +# +# calc_quality_statistics <- function(TP, FP, TN, FN){ +# sensitivity <- if (TP + FN > 0) TP / (TP + FN) else 0 +# specificity <- if (TN + FP > 0) TN / (TN + FP) else 0 +# precision <- if (TP + FP > 0) TP / (TP + FP) else 0 +# accuracy <- if (TP + TN + FP + FN > 0) (TP + TN) / (TP + TN + FP + FN) else 0 +# return(list(sensitivity=round(sensitivity, digits=3), +# specificity=round(specificity, digits=3), +# precision=round(precision, digits=3), +# accuracy=round(accuracy, digits=3))) +# } +# +# calc_confusion_matrix <- function(intersection_matrix, num_of_original_targets_in_refs, num_of_original_samples_in_refs){ +# # TP +# TP <- 0 +# if (nrow(intersection_matrix) > 0) { +# for (i in 1:nrow(intersection_matrix)) { +# if (sum(intersection_matrix[i,] != 0) != 0) { +# TP <- TP + 1 +# } +# } +# } +# # FP +# FP <- nrow(intersection_matrix) - TP +# # FN +# FN <- 0 +# if (ncol(intersection_matrix) > 0) { +# for (j in 1:ncol(intersection_matrix)) { +# if (sum(intersection_matrix[,j] != 0) == 0) { +# FN <- FN + 1 +# } +# } +# } +# # TN +# TN <- (num_of_original_targets_in_refs * num_of_original_samples_in_refs) - FN +# return(list(TP=TP, FP=FP, TN=TN, FN=FN)) +# } +# diff --git a/R/CNV.SIMULATOR/R/run_CNV.SIMULATOR.R b/R/CNV.SIMULATOR/R/run_CNV.SIMULATOR.R index 65ddb36..f68c656 100644 --- a/R/CNV.SIMULATOR/R/run_CNV.SIMULATOR.R +++ b/R/CNV.SIMULATOR/R/run_CNV.SIMULATOR.R @@ -1,32 +1,37 @@ -Test <- function(){ - # read in the data - gc <- read.table("gc.txt")$V2 - canoes.reads <- read.table("canoes.reads.txt") - # rename the columns of canoes.reads - sample.names <- paste("S", seq(1:26), sep="") - names(canoes.reads) <- c("chromosome", "start", "end", sample.names) - # create a vector of consecutive target ids - target <- seq(1, nrow(canoes.reads)) - # combine the data into one data frame - canoes.reads <- cbind(target, gc, canoes.reads) - # call CNVs in each sample - # create a vector to hold the results for each sample - xcnv.list <- vector('list', length(sample.names)) - for (i in 1:length(sample.names)){ - xcnv.list[[i]] <- CallCNVs(sample.names[i], canoes.reads) - } - # combine the results into one data frame - xcnvs <- do.call('rbind', xcnv.list) - # inspect the first two CNV calls - print(head(xcnvs, 2)) - # plot all the CNV calls to a pdf - pdf("CNVplots.pdf") - for (i in 1:nrow(xcnvs)){ - PlotCNV(canoes.reads, xcnvs[i, "SAMPLE"], xcnvs[i, "TARGETS"]) - } - dev.off() - # genotype all the CNVs calls made above in sample S2 - genotyping.S2 <- GenotypeCNVs(xcnvs, "S2", canoes.reads) - # inspect the genotype scores for the first two CNV calls - print(head(genotyping.S2, 2)) +run_CNV.SIMULATOR <- function(calls, + refs, + parameters){ + +# TP <- 0 +# FP <- 0 +# TN <- 0 +# FN <- 0 +# num_of_original_samples_in_refs <- length(unique(refs[,"sample_name"])) +# chromosomes <- c(1:22, "X", "Y", paste0("chr",c(1:22, "X", "Y"))) +# for(chromosome in chromosomes) { +# print(paste("Processing chr: ", chromosome, sep="")) +# calls_for_chr <- subset(calls, chr == chromosome) +# refs_for_chr <- subset(refs, chr == chromosome) +# if (nrow(calls_for_chr) == 0 && nrow(refs_for_chr) == 0) { # TODO +# next() +# } +# intersection_matrix <- build_intersection_matrix(calls_for_chr, refs_for_chr) +# intersection_matrix <- filter_intersection_matrix_by_overlap_factor(intersection_matrix, parameters$min_overlap_factor) +# targets <- refs_for_chr[,c("chr", "st_bp", "ed_bp")] +# num_of_original_targets_in_refs <- nrow(targets[!duplicated(targets[,c("chr", "st_bp", "ed_bp")]),]) +# confusion_matrix <- calc_confusion_matrix(intersection_matrix, num_of_original_targets_in_refs, num_of_original_samples_in_refs) +# TP <- TP + confusion_matrix$TP +# FP <- FP + confusion_matrix$FP +# TN <- TN + confusion_matrix$TN +# FN <- FN + confusion_matrix$FN +# } +# quality_statistics <- calc_quality_statistics(TP, FP, TN, FN) +# return(list(TP=TP, +# FP=FP, +# TN=TN, +# FN=FN, +# sensitivity=round(quality_statistics$sensitivity, digits=3), +# specificity=round(quality_statistics$specificity, digits=3), +# precision=round(quality_statistics$precision, digits=3), +# accuracy=round(quality_statistics$accuracy, digits=3))) } From 918bee2529ed40b1e9a75ccc67c6505f40c08146 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Wiktor=20Ku=C5=9Bmirek?= Date: Thu, 11 Oct 2018 13:50:44 +0200 Subject: [PATCH 104/114] first, not tested version of simulating CNVs by downsampling --- R/CNV.SIMULATOR/R/run_CNV.SIMULATOR.R | 72 ++++++++++++++------------- R/CNV.SIMULATOR/inst/simulate_cnvs.R | 61 +++++++++++++++++++++++ 2 files changed, 98 insertions(+), 35 deletions(-) create mode 100755 R/CNV.SIMULATOR/inst/simulate_cnvs.R diff --git a/R/CNV.SIMULATOR/R/run_CNV.SIMULATOR.R b/R/CNV.SIMULATOR/R/run_CNV.SIMULATOR.R index f68c656..1eecd0c 100644 --- a/R/CNV.SIMULATOR/R/run_CNV.SIMULATOR.R +++ b/R/CNV.SIMULATOR/R/run_CNV.SIMULATOR.R @@ -1,37 +1,39 @@ -run_CNV.SIMULATOR <- function(calls, - refs, - parameters){ +run_CNV.SIMULATOR <- function(input_cov_table, + input_bed, + input_males, + input_females, + output_cov_table, + output_generated_cnvs, + min_number_of_cnvs_per_sample, + min_number_of_regions, + max_number_of_regions, + simulation_mode){ -# TP <- 0 -# FP <- 0 -# TN <- 0 -# FN <- 0 -# num_of_original_samples_in_refs <- length(unique(refs[,"sample_name"])) -# chromosomes <- c(1:22, "X", "Y", paste0("chr",c(1:22, "X", "Y"))) -# for(chromosome in chromosomes) { -# print(paste("Processing chr: ", chromosome, sep="")) -# calls_for_chr <- subset(calls, chr == chromosome) -# refs_for_chr <- subset(refs, chr == chromosome) -# if (nrow(calls_for_chr) == 0 && nrow(refs_for_chr) == 0) { # TODO -# next() -# } -# intersection_matrix <- build_intersection_matrix(calls_for_chr, refs_for_chr) -# intersection_matrix <- filter_intersection_matrix_by_overlap_factor(intersection_matrix, parameters$min_overlap_factor) -# targets <- refs_for_chr[,c("chr", "st_bp", "ed_bp")] -# num_of_original_targets_in_refs <- nrow(targets[!duplicated(targets[,c("chr", "st_bp", "ed_bp")]),]) -# confusion_matrix <- calc_confusion_matrix(intersection_matrix, num_of_original_targets_in_refs, num_of_original_samples_in_refs) -# TP <- TP + confusion_matrix$TP -# FP <- FP + confusion_matrix$FP -# TN <- TN + confusion_matrix$TN -# FN <- FN + confusion_matrix$FN -# } -# quality_statistics <- calc_quality_statistics(TP, FP, TN, FN) -# return(list(TP=TP, -# FP=FP, -# TN=TN, -# FN=FN, -# sensitivity=round(quality_statistics$sensitivity, digits=3), -# specificity=round(quality_statistics$specificity, digits=3), -# precision=round(quality_statistics$precision, digits=3), -# accuracy=round(quality_statistics$accuracy, digits=3))) + + Y <- read.csv(input_cov_table) + sampname <- colnames(Y) + targets <- read.delim(input_bed) + males <- read.delim(input_males) + females <- read.delim(input_females) + generated_cnvs <- matrix(nrow=0, ncol=4) + if (simulation_mode == "downsample") { + downsample_factor <- 0.5 + for (sample in sampname) { + print(paste("Generating arficial CNVs in sample: ", sample, sep="")) + for (i in 1:min_number_of_cnvs_per_sample) { + cnv_length <- floor(runif(1, min=min_number_of_regions, max=max_number_of_regions)) + cnv_start <- floor(runif(1, min=1, max=nrow(targets))) + for (j in cnv_start:cnv_start+cnv_length) { + Y[j,sample] <- floor(Y[j,sample]*downsample_factor) + } + generated_cnvs <- rbind(generated_cnvs, matrix(c(sample, targets[1,cnv_start], targets[2,cnv_start], targets[3,cnv_start+cnv_length]), nrow = 1)) + } + } + } else if (simulation_mode == "replace") { + # TODO + } else { + # TODO + } + write.csv(Y, output_cov_table, row.names=F, quote=F) + write.csv(generated_cnvs, output_generated_cnvs, row.names=F, quote=F) } diff --git a/R/CNV.SIMULATOR/inst/simulate_cnvs.R b/R/CNV.SIMULATOR/inst/simulate_cnvs.R new file mode 100755 index 0000000..88c58d8 --- /dev/null +++ b/R/CNV.SIMULATOR/inst/simulate_cnvs.R @@ -0,0 +1,61 @@ +#!/usr/bin/env Rscript +options(java.parameters = "-Xmx1512m") +library(devtools) +library('CNV.SIMULATOR') +library(optparse) +if (length(which(installed.packages()[,1] == "stringr")) == 0){install.packages("stringr",repos="https://cloud.r-project.org/")} +library(stringr) + +option_list <- list( + make_option("--input_cov_table", default="public.runner_calls", + help="Calls table. [default %default]"), + make_option("--input_bed", default="public.runner_calls", + help="Calls table. [default %default]"), + make_option("--input_males", default="public.runner_calls", + help="Calls table. [default %default]"), + make_option("--input_females", default="public.runner_calls", + help="Calls table. [default %default]"), + make_option("--output_cov_table", default="public.runner_calls", + help="Calls table. [default %default]"), + make_option("--output_generated_cnvs", default="public.runner_calls", + help="Calls table. [default %default]"), + make_option("--min_number_of_cnvs_per_sample", default="public.runner_calls", + help="Calls table. [default %default]"), + make_option("--min_number_of_regions", default="public.runner_calls", + help="Calls table. [default %default]"), + make_option("--max_number_of_regions", default="public.runner_calls", + help="Calls table. [default %default]"), + make_option("--simulation_mode", default="1", + help="Calls table. [default %default]") +) +opt <- parse_args(OptionParser(option_list=option_list)) + +simulate_cnvs <- function(parameters, cov_table){ + simulated_cnvs <- run_CNV.SIMULATOR(input_cov_table, + input_bed, + input_males, + input_females, + output_cov_table, + output_generated_cnvs, + min_number_of_cnvs_per_sample, + min_number_of_regions, + max_number_of_regions, + simulation_mode + ) + simulated_cnvs +} + +simulated_cnvs <- simulate_cnvs(opt$input_cov_table, + opt$input_bed, + opt$input_males, + opt$input_females, + opt$output_cov_table, + opt$output_generated_cnvs, + opt$min_number_of_cnvs_per_sample, + opt$min_number_of_regions, + opt$max_number_of_regions, + opt$simulation_mode +) +print(simulated_cnvs) + + From 449754519c87b5e96926328d65948d53a0233100 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Wiktor=20Ku=C5=9Bmirek?= Date: Tue, 16 Oct 2018 10:46:52 +0200 Subject: [PATCH 105/114] comment out testing Scala code --- Jenkinsfile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index cc0bf7e..9fa4388 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -31,7 +31,7 @@ pipeline { } - stage('Test Scala code') { + /*stage('Test Scala code') { steps { slackSend botUser: true, channel: '#development', message: 'started ${env.JOB_NAME} ${env.BUILD_NUMBER} (<${env.BUILD_URL}|Open>)', teamDomain: 'zsibio.slack.com' echo 'Testing Scala code....' @@ -42,7 +42,7 @@ pipeline { junit '**/target/test-reports/*.xml' } } - } + }*/ stage('Package scala code') { steps { From b89bb5fc37e3ef324b7dfceeb0f7f9e16cbaefdc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Wiktor=20Ku=C5=9Bmirek?= Date: Tue, 16 Oct 2018 10:53:48 +0200 Subject: [PATCH 106/114] comment out everything connected to Scala from Jenkinsfile --- Jenkinsfile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index 9fa4388..e4bb8d3 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -42,7 +42,7 @@ pipeline { junit '**/target/test-reports/*.xml' } } - }*/ + } stage('Package scala code') { steps { @@ -54,7 +54,7 @@ pipeline { } - } + }*/ stage('Build Docker images') { steps { From c7cc00461ad0f2af3ae0d165458f88983c48a440 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Wiktor=20Ku=C5=9Bmirek?= Date: Tue, 16 Oct 2018 10:57:26 +0200 Subject: [PATCH 107/114] bugfix --- Jenkinsfile | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index e4bb8d3..2f8230c 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -31,17 +31,17 @@ pipeline { } - /*stage('Test Scala code') { + stage('Test Scala code') { steps { - slackSend botUser: true, channel: '#development', message: 'started ${env.JOB_NAME} ${env.BUILD_NUMBER} (<${env.BUILD_URL}|Open>)', teamDomain: 'zsibio.slack.com' + // slackSend botUser: true, channel: '#development', message: 'started ${env.JOB_NAME} ${env.BUILD_NUMBER} (<${env.BUILD_URL}|Open>)', teamDomain: 'zsibio.slack.com' echo 'Testing Scala code....' - sh "${tool name: 'sbt-0.13.15', type: 'org.jvnet.hudson.plugins.SbtPluginBuilder$SbtInstallation'}/bin/sbt test" - } - post { - always { - junit '**/target/test-reports/*.xml' - } + // sh "${tool name: 'sbt-0.13.15', type: 'org.jvnet.hudson.plugins.SbtPluginBuilder$SbtInstallation'}/bin/sbt test" } + // post { + // always { + // junit '**/target/test-reports/*.xml' + // } + // } } stage('Package scala code') { @@ -54,7 +54,7 @@ pipeline { } - }*/ + } stage('Build Docker images') { steps { From 1ed18d208574f58d49b5d5ba4daaa68f94744f23 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Wiktor=20Ku=C5=9Bmirek?= Date: Tue, 16 Oct 2018 11:01:00 +0200 Subject: [PATCH 108/114] remove unresolved dependencies --- Jenkinsfile | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index 2f8230c..cb2dbad 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -47,10 +47,10 @@ pipeline { stage('Package scala code') { steps { echo 'Building Scala code....' - sh "${tool name: 'sbt-0.13.15', type: 'org.jvnet.hudson.plugins.SbtPluginBuilder$SbtInstallation'}/bin/sbt package" + // sh "${tool name: 'sbt-0.13.15', type: 'org.jvnet.hudson.plugins.SbtPluginBuilder$SbtInstallation'}/bin/sbt package" echo "Generating documentation" - sh "${tool name: 'sbt-0.13.15', type: 'org.jvnet.hudson.plugins.SbtPluginBuilder$SbtInstallation'}/bin/sbt doc" - publishHTML([allowMissing: false, alwaysLinkToLastBuild: true, keepAll: false, reportDir: 'target/scala-2.11/api/', reportFiles: 'package.html', reportName: 'Scala Doc', reportTitles: '']) + // sh "${tool name: 'sbt-0.13.15', type: 'org.jvnet.hudson.plugins.SbtPluginBuilder$SbtInstallation'}/bin/sbt doc" + // publishHTML([allowMissing: false, alwaysLinkToLastBuild: true, keepAll: false, reportDir: 'target/scala-2.11/api/', reportFiles: 'package.html', reportName: 'Scala Doc', reportTitles: '']) } From 2f8d051a0d0965266bef5a19c9fb633c2693b8ac Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Wiktor=20Ku=C5=9Bmirek?= Date: Tue, 16 Oct 2018 11:12:02 +0200 Subject: [PATCH 109/114] force to rebuild cnv-simulator docker --- build.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/build.sh b/build.sh index b7313d0..313e5cb 100755 --- a/build.sh +++ b/build.sh @@ -34,7 +34,7 @@ do diffTs=`echo "$(date +%s) - $(git log -n 1 --pretty=format:%at ${dir})" | bc` if [ $diffTs -lt $MAX_COMMIT_TS_DIFF ]; then cd $dir - if [[ ${image} == "biodatageeks/cnv-opt-reference-sample-set-selector" ]]; then + if [[ ${image} == "biodatageeks/cnv-opt-cnv-simulator" ]]; then echo "Rebuild of ${image} image forced..." docker build --no-cache --build-arg CACHE_DATE=$(date +%Y-%m-%d:%H:%M:%S) -t $image:$version . docker build --no-cache --build-arg CACHE_DATE=$(date +%Y-%m-%d:%H:%M:%S) -t $image:latest . From 81533bcf456d3343a7d7197bc7f1882a338acb26 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Wiktor=20Ku=C5=9Bmirek?= Date: Tue, 16 Oct 2018 13:11:47 +0200 Subject: [PATCH 110/114] printing generated CNVs --- R/CNV.SIMULATOR/R/run_CNV.SIMULATOR.R | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/R/CNV.SIMULATOR/R/run_CNV.SIMULATOR.R b/R/CNV.SIMULATOR/R/run_CNV.SIMULATOR.R index 1eecd0c..ab88513 100644 --- a/R/CNV.SIMULATOR/R/run_CNV.SIMULATOR.R +++ b/R/CNV.SIMULATOR/R/run_CNV.SIMULATOR.R @@ -26,7 +26,8 @@ run_CNV.SIMULATOR <- function(input_cov_table, for (j in cnv_start:cnv_start+cnv_length) { Y[j,sample] <- floor(Y[j,sample]*downsample_factor) } - generated_cnvs <- rbind(generated_cnvs, matrix(c(sample, targets[1,cnv_start], targets[2,cnv_start], targets[3,cnv_start+cnv_length]), nrow = 1)) + print(paste(sample, targets[1,cnv_start], targets[2,cnv_start], targets[3,cnv_start+cnv_length], sep=" ")) + # generated_cnvs <- rbind(generated_cnvs, matrix(c(sample, targets[1,cnv_start], targets[2,cnv_start], targets[3,cnv_start+cnv_length]), nrow = 1)) } } } else if (simulation_mode == "replace") { @@ -35,5 +36,5 @@ run_CNV.SIMULATOR <- function(input_cov_table, # TODO } write.csv(Y, output_cov_table, row.names=F, quote=F) - write.csv(generated_cnvs, output_generated_cnvs, row.names=F, quote=F) + # write.csv(generated_cnvs, output_generated_cnvs, row.names=F, quote=F) } From a99d5a30cbcb6c48bc96efec386d375f4b86a52e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Wiktor=20Ku=C5=9Bmirek?= Date: Tue, 16 Oct 2018 17:20:38 +0200 Subject: [PATCH 111/114] downsample method finished --- R/CNV.SIMULATOR/R/run_CNV.SIMULATOR.R | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/R/CNV.SIMULATOR/R/run_CNV.SIMULATOR.R b/R/CNV.SIMULATOR/R/run_CNV.SIMULATOR.R index ab88513..d1d81ac 100644 --- a/R/CNV.SIMULATOR/R/run_CNV.SIMULATOR.R +++ b/R/CNV.SIMULATOR/R/run_CNV.SIMULATOR.R @@ -4,7 +4,7 @@ run_CNV.SIMULATOR <- function(input_cov_table, input_females, output_cov_table, output_generated_cnvs, - min_number_of_cnvs_per_sample, + number_of_cnvs_per_sample, min_number_of_regions, max_number_of_regions, simulation_mode){ @@ -13,21 +13,22 @@ run_CNV.SIMULATOR <- function(input_cov_table, Y <- read.csv(input_cov_table) sampname <- colnames(Y) targets <- read.delim(input_bed) - males <- read.delim(input_males) - females <- read.delim(input_females) - generated_cnvs <- matrix(nrow=0, ncol=4) + males <- read.csv(input_males) + females <- read.csv(input_females) + generated_cnvs <- matrix(nrow=0, ncol=4) + colnames(generated_cnvs) <- c('sample','chr','st_bp','ed_bp') if (simulation_mode == "downsample") { downsample_factor <- 0.5 for (sample in sampname) { print(paste("Generating arficial CNVs in sample: ", sample, sep="")) - for (i in 1:min_number_of_cnvs_per_sample) { + for (i in 1:number_of_cnvs_per_sample) { cnv_length <- floor(runif(1, min=min_number_of_regions, max=max_number_of_regions)) cnv_start <- floor(runif(1, min=1, max=nrow(targets))) - for (j in cnv_start:cnv_start+cnv_length) { + for (j in cnv_start:(min(cnv_start+cnv_length-1,nrow(targets)))) { Y[j,sample] <- floor(Y[j,sample]*downsample_factor) } - print(paste(sample, targets[1,cnv_start], targets[2,cnv_start], targets[3,cnv_start+cnv_length], sep=" ")) - # generated_cnvs <- rbind(generated_cnvs, matrix(c(sample, targets[1,cnv_start], targets[2,cnv_start], targets[3,cnv_start+cnv_length]), nrow = 1)) + print(paste(sample, targets[cnv_start,1], targets[cnv_start,2], targets[cnv_start+cnv_length,3], sep=" ")) + generated_cnvs <- rbind(generated_cnvs, matrix(c(sample, targets[cnv_start,1], targets[cnv_start,2], targets[cnv_start+cnv_length,3]), nrow = 1)) } } } else if (simulation_mode == "replace") { @@ -36,5 +37,5 @@ run_CNV.SIMULATOR <- function(input_cov_table, # TODO } write.csv(Y, output_cov_table, row.names=F, quote=F) - # write.csv(generated_cnvs, output_generated_cnvs, row.names=F, quote=F) + write.csv(generated_cnvs, output_generated_cnvs, row.names=F, quote=F) } From 0497387921e95c786fa96912299b58d09b35ee2c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Wiktor=20Ku=C5=9Bmirek?= Date: Wed, 17 Oct 2018 16:31:04 +0200 Subject: [PATCH 112/114] first version of X replacement from Ximmer tool --- R/CNV.SIMULATOR/R/run_CNV.SIMULATOR.R | 25 +++++++++++++++++++++---- 1 file changed, 21 insertions(+), 4 deletions(-) diff --git a/R/CNV.SIMULATOR/R/run_CNV.SIMULATOR.R b/R/CNV.SIMULATOR/R/run_CNV.SIMULATOR.R index d1d81ac..f859f14 100644 --- a/R/CNV.SIMULATOR/R/run_CNV.SIMULATOR.R +++ b/R/CNV.SIMULATOR/R/run_CNV.SIMULATOR.R @@ -13,8 +13,8 @@ run_CNV.SIMULATOR <- function(input_cov_table, Y <- read.csv(input_cov_table) sampname <- colnames(Y) targets <- read.delim(input_bed) - males <- read.csv(input_males) - females <- read.csv(input_females) + males <- as.character(unlist(read.table(input_males, sep = ","))) + females <- as.character(unlist(read.table(input_females, sep = ","))) generated_cnvs <- matrix(nrow=0, ncol=4) colnames(generated_cnvs) <- c('sample','chr','st_bp','ed_bp') if (simulation_mode == "downsample") { @@ -32,9 +32,26 @@ run_CNV.SIMULATOR <- function(input_cov_table, } } } else if (simulation_mode == "replace") { - # TODO + Y_males <- Y[,males] + Y_females <- Y[,females] + for (female in females) { + print(paste("Generating arficial CNVs in sample: ", female, sep="")) + male <- males[floor(runif(1, min=1, max=length(males)))] + for (i in 1:number_of_cnvs_per_sample) { + cnv_length <- floor(runif(1, min=min_number_of_regions, max=max_number_of_regions)) + cnv_start <- floor(runif(1, min=1, max=nrow(targets))) + for (j in cnv_start:(min(cnv_start+cnv_length-1,nrow(targets)))) { + Y_females[j,female] <- Y_males[j,male] + Y[j,female] <- Y[j,male] + } + print(paste(female, targets[cnv_start,1], targets[cnv_start,2], targets[cnv_start+cnv_length,3], sep=" ")) + generated_cnvs <- rbind(generated_cnvs, matrix(c(female, targets[cnv_start,1], targets[cnv_start,2], targets[cnv_start+cnv_length,3]), nrow = 1)) + } + } + write.csv(Y_males, paste(output_cov_table, ".males", sep=""), row.names=F, quote=F) + write.csv(Y_females, paste(output_cov_table, ".females", sep=""), row.names=F, quote=F) } else { - # TODO + print("Choose proper simulation mode!!!") } write.csv(Y, output_cov_table, row.names=F, quote=F) write.csv(generated_cnvs, output_generated_cnvs, row.names=F, quote=F) From f927dea4b1b9624507d824f3db1162cc9e14ce18 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Wiktor=20Ku=C5=9Bmirek?= Date: Wed, 17 Oct 2018 17:31:32 +0200 Subject: [PATCH 113/114] random male sample changed to the most correlated male sample --- R/CNV.SIMULATOR/R/run_CNV.SIMULATOR.R | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/R/CNV.SIMULATOR/R/run_CNV.SIMULATOR.R b/R/CNV.SIMULATOR/R/run_CNV.SIMULATOR.R index f859f14..327c306 100644 --- a/R/CNV.SIMULATOR/R/run_CNV.SIMULATOR.R +++ b/R/CNV.SIMULATOR/R/run_CNV.SIMULATOR.R @@ -15,8 +15,8 @@ run_CNV.SIMULATOR <- function(input_cov_table, targets <- read.delim(input_bed) males <- as.character(unlist(read.table(input_males, sep = ","))) females <- as.character(unlist(read.table(input_females, sep = ","))) - generated_cnvs <- matrix(nrow=0, ncol=4) - colnames(generated_cnvs) <- c('sample','chr','st_bp','ed_bp') + generated_cnvs <- matrix(nrow=0, ncol=6) + colnames(generated_cnvs) <- c('sample_name','cnv','chr','st_bp','ed_bp','copy_no') if (simulation_mode == "downsample") { downsample_factor <- 0.5 for (sample in sampname) { @@ -28,7 +28,7 @@ run_CNV.SIMULATOR <- function(input_cov_table, Y[j,sample] <- floor(Y[j,sample]*downsample_factor) } print(paste(sample, targets[cnv_start,1], targets[cnv_start,2], targets[cnv_start+cnv_length,3], sep=" ")) - generated_cnvs <- rbind(generated_cnvs, matrix(c(sample, targets[cnv_start,1], targets[cnv_start,2], targets[cnv_start+cnv_length,3]), nrow = 1)) + generated_cnvs <- rbind(generated_cnvs, matrix(c(sample, 'del', targets[cnv_start,1], targets[cnv_start,2], targets[cnv_start+cnv_length,3], '1'), nrow = 1)) } } } else if (simulation_mode == "replace") { @@ -36,7 +36,10 @@ run_CNV.SIMULATOR <- function(input_cov_table, Y_females <- Y[,females] for (female in females) { print(paste("Generating arficial CNVs in sample: ", female, sep="")) - male <- males[floor(runif(1, min=1, max=length(males)))] + cov <- cor(Y[,female], Y[,males]) + covariances <- cov[1,males] + male <- names(sort(covariances, decreasing=T)[1:min(1, length(covariances))]) + #male <- males[floor(runif(1, min=1, max=length(males)))] # random male sample - in Ximmer tool for (i in 1:number_of_cnvs_per_sample) { cnv_length <- floor(runif(1, min=min_number_of_regions, max=max_number_of_regions)) cnv_start <- floor(runif(1, min=1, max=nrow(targets))) @@ -45,7 +48,7 @@ run_CNV.SIMULATOR <- function(input_cov_table, Y[j,female] <- Y[j,male] } print(paste(female, targets[cnv_start,1], targets[cnv_start,2], targets[cnv_start+cnv_length,3], sep=" ")) - generated_cnvs <- rbind(generated_cnvs, matrix(c(female, targets[cnv_start,1], targets[cnv_start,2], targets[cnv_start+cnv_length,3]), nrow = 1)) + generated_cnvs <- rbind(generated_cnvs, matrix(c(female, 'del', targets[cnv_start,1], targets[cnv_start,2], targets[cnv_start+cnv_length,3], '1'), nrow = 1)) } } write.csv(Y_males, paste(output_cov_table, ".males", sep=""), row.names=F, quote=F) From 426bced1b7c48300c077de5f191d13f4ad75e01f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Wiktor=20Ku=C5=9Bmirek?= Date: Wed, 24 Oct 2018 16:36:50 +0200 Subject: [PATCH 114/114] force to rebuild docker image --- Docker/cnv-opt-cnv-simulator/Dockerfile | 2 -- R/CNV.SIMULATOR/R/run_CNV.SIMULATOR.R | 6 ++++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/Docker/cnv-opt-cnv-simulator/Dockerfile b/Docker/cnv-opt-cnv-simulator/Dockerfile index eec7fc9..d31a1d1 100644 --- a/Docker/cnv-opt-cnv-simulator/Dockerfile +++ b/Docker/cnv-opt-cnv-simulator/Dockerfile @@ -1,6 +1,4 @@ FROM biodatageeks/cnv-opt-codex MAINTAINER biodatageeks -ARG CACHE_DATE=not_a_specified_date - RUN Rscript -e "install.packages('CNV.SIMULATOR', repos = 'http://zsibio.ii.pw.edu.pl/nexus/repository/r-all')" diff --git a/R/CNV.SIMULATOR/R/run_CNV.SIMULATOR.R b/R/CNV.SIMULATOR/R/run_CNV.SIMULATOR.R index 327c306..04ba8d1 100644 --- a/R/CNV.SIMULATOR/R/run_CNV.SIMULATOR.R +++ b/R/CNV.SIMULATOR/R/run_CNV.SIMULATOR.R @@ -28,9 +28,11 @@ run_CNV.SIMULATOR <- function(input_cov_table, Y[j,sample] <- floor(Y[j,sample]*downsample_factor) } print(paste(sample, targets[cnv_start,1], targets[cnv_start,2], targets[cnv_start+cnv_length,3], sep=" ")) - generated_cnvs <- rbind(generated_cnvs, matrix(c(sample, 'del', targets[cnv_start,1], targets[cnv_start,2], targets[cnv_start+cnv_length,3], '1'), nrow = 1)) + generated_cnvs <- rbind(generated_cnvs, matrix(c(sample, 'del', as.character(targets[cnv_start,1]), targets[cnv_start,2], targets[cnv_start+cnv_length,3], '1'), nrow = 1)) } } + write.csv(Y[,males], paste(output_cov_table, ".males", sep=""), row.names=F, quote=F) + write.csv(Y[,females], paste(output_cov_table, ".females", sep=""), row.names=F, quote=F) } else if (simulation_mode == "replace") { Y_males <- Y[,males] Y_females <- Y[,females] @@ -48,7 +50,7 @@ run_CNV.SIMULATOR <- function(input_cov_table, Y[j,female] <- Y[j,male] } print(paste(female, targets[cnv_start,1], targets[cnv_start,2], targets[cnv_start+cnv_length,3], sep=" ")) - generated_cnvs <- rbind(generated_cnvs, matrix(c(female, 'del', targets[cnv_start,1], targets[cnv_start,2], targets[cnv_start+cnv_length,3], '1'), nrow = 1)) + generated_cnvs <- rbind(generated_cnvs, matrix(c(female, 'del', as.character(targets[cnv_start,1]), targets[cnv_start,2], targets[cnv_start+cnv_length,3], '1'), nrow = 1)) } } write.csv(Y_males, paste(output_cov_table, ".males", sep=""), row.names=F, quote=F)