diff --git a/Docker/cnv-opt-canoes/Dockerfile b/Docker/cnv-opt-canoes/Dockerfile new file mode 100644 index 0000000..b404cab --- /dev/null +++ b/Docker/cnv-opt-canoes/Dockerfile @@ -0,0 +1,18 @@ +FROM ubuntu:xenial +MAINTAINER biodatageeks + +RUN apt-get update +RUN apt-get install -y software-properties-common +RUN apt-key adv --keyserver keyserver.ubuntu.com --recv-keys E298A3A825C0D65DFD57CBB651716619E084DAB9 +RUN add-apt-repository 'deb [arch=amd64,i386] https://cran.rstudio.com/bin/linux/ubuntu xenial/' +RUN apt-get install -y apt-transport-https + +RUN apt-get update && \ + apt-get upgrade -y && \ + apt-get install -y r-base libssl-dev libssh2-1-dev libxml2-dev libcurl4-openssl-dev libpq-dev + +RUN Rscript -e "install.packages('nnls', repos = 'http://cran.us.r-project.org')" +RUN Rscript -e "install.packages('Hmisc', repos = 'http://cran.us.r-project.org')" +RUN Rscript -e "install.packages('mgcv', repos = 'http://cran.us.r-project.org')" +RUN Rscript -e "install.packages('plyr', repos = 'http://cran.us.r-project.org')" +RUN Rscript -e "install.packages('CANOES', repos = 'http://zsibio.ii.pw.edu.pl/nexus/repository/r-all')" diff --git a/Docker/cnv-opt-canoescov/Dockerfile b/Docker/cnv-opt-canoescov/Dockerfile new file mode 100644 index 0000000..fa7683b --- /dev/null +++ b/Docker/cnv-opt-canoescov/Dockerfile @@ -0,0 +1,11 @@ +FROM biodatageeks/cnv-opt-canoes +MAINTAINER biodatageeks + +RUN Rscript -e "source('https://bioconductor.org/biocLite.R');biocLite('IRanges')" +RUN Rscript -e "source('https://bioconductor.org/biocLite.R');biocLite('BSgenome.Hsapiens.UCSC.hg19')" +RUN Rscript -e "source('https://bioconductor.org/biocLite.R');biocLite('Biostrings')" +RUN Rscript -e "source('https://bioconductor.org/biocLite.R');biocLite('Rsamtools')" +RUN Rscript -e "source('https://bioconductor.org/biocLite.R');biocLite('GenomeInfoDb')" +RUN Rscript -e "source('https://bioconductor.org/biocLite.R');biocLite('S4Vectors')" + +RUN Rscript -e "install.packages('CANOESCOV', repos = 'http://zsibio.ii.pw.edu.pl/nexus/repository/r-all')" diff --git a/Docker/cnv-opt-cnv-simulator/Dockerfile b/Docker/cnv-opt-cnv-simulator/Dockerfile new file mode 100644 index 0000000..d31a1d1 --- /dev/null +++ b/Docker/cnv-opt-cnv-simulator/Dockerfile @@ -0,0 +1,4 @@ +FROM biodatageeks/cnv-opt-codex +MAINTAINER biodatageeks + +RUN Rscript -e "install.packages('CNV.SIMULATOR', repos = 'http://zsibio.ii.pw.edu.pl/nexus/repository/r-all')" diff --git a/Docker/cnv-opt-codex/Dockerfile b/Docker/cnv-opt-codex/Dockerfile new file mode 100644 index 0000000..6f56eb5 --- /dev/null +++ b/Docker/cnv-opt-codex/Dockerfile @@ -0,0 +1,14 @@ +FROM ubuntu:xenial +MAINTAINER biodatageeks + +RUN apt-get update +RUN apt-get install -y software-properties-common +RUN apt-key adv --keyserver keyserver.ubuntu.com --recv-keys E298A3A825C0D65DFD57CBB651716619E084DAB9 +RUN add-apt-repository 'deb [arch=amd64,i386] https://cran.rstudio.com/bin/linux/ubuntu xenial/' +RUN apt-get install -y apt-transport-https + +RUN apt-get update && \ + apt-get upgrade -y && \ + apt-get install -y r-base libssl-dev libssh2-1-dev libxml2-dev libcurl4-openssl-dev libpq-dev + +RUN Rscript -e "source('https://bioconductor.org/biocLite.R');biocLite('CODEX')" diff --git a/Docker/cnv-opt-codexcov/Dockerfile b/Docker/cnv-opt-codexcov/Dockerfile new file mode 100644 index 0000000..258bbf8 --- /dev/null +++ b/Docker/cnv-opt-codexcov/Dockerfile @@ -0,0 +1,4 @@ +FROM biodatageeks/cnv-opt-codex +MAINTAINER biodatageeks + +RUN Rscript -e "install.packages('CODEXCOV', repos = 'http://zsibio.ii.pw.edu.pl/nexus/repository/r-all')" diff --git a/Docker/cnv-opt-exomecopy/Dockerfile b/Docker/cnv-opt-exomecopy/Dockerfile new file mode 100644 index 0000000..0333434 --- /dev/null +++ b/Docker/cnv-opt-exomecopy/Dockerfile @@ -0,0 +1,20 @@ +FROM ubuntu:xenial +MAINTAINER biodatageeks + +RUN apt-get update +RUN apt-get install -y software-properties-common +RUN apt-key adv --keyserver keyserver.ubuntu.com --recv-keys E298A3A825C0D65DFD57CBB651716619E084DAB9 +RUN add-apt-repository 'deb [arch=amd64,i386] https://cran.rstudio.com/bin/linux/ubuntu xenial/' +RUN apt-get install -y apt-transport-https + +RUN apt-get update && \ + apt-get upgrade -y && \ + apt-get install -y r-base libssl-dev libssh2-1-dev libxml2-dev libcurl4-openssl-dev libpq-dev wget + +RUN Rscript -e "source('https://bioconductor.org/biocLite.R');biocLite('IRanges')" +RUN Rscript -e "source('https://bioconductor.org/biocLite.R');biocLite('GenomicRanges')" +RUN Rscript -e "source('https://bioconductor.org/biocLite.R');biocLite('Rsamtools')" +RUN Rscript -e "source('https://bioconductor.org/biocLite.R');biocLite('GenomeInfoDb')" + +RUN wget http://bioconductor.org/packages/3.5/bioc/src/contrib/exomeCopy_1.22.0.tar.gz +RUN Rscript -e "install.packages('exomeCopy_1.22.0.tar.gz', repos = NULL, type='source')" diff --git a/Docker/cnv-opt-exomecopycov/Dockerfile b/Docker/cnv-opt-exomecopycov/Dockerfile new file mode 100644 index 0000000..c12fa2f --- /dev/null +++ b/Docker/cnv-opt-exomecopycov/Dockerfile @@ -0,0 +1,8 @@ +FROM biodatageeks/cnv-opt-exomecopy +MAINTAINER biodatageeks + +RUN Rscript -e "source('https://bioconductor.org/biocLite.R');biocLite('GenomeInfoDb')" +RUN Rscript -e "source('https://bioconductor.org/biocLite.R');biocLite('BSgenome.Hsapiens.UCSC.hg19')" + +RUN Rscript -e "install.packages('EXOMECOPYCOV', repos = 'http://zsibio.ii.pw.edu.pl/nexus/repository/r-all')" + diff --git a/Docker/cnv-opt-exomedepth/Dockerfile b/Docker/cnv-opt-exomedepth/Dockerfile new file mode 100644 index 0000000..fd16d85 --- /dev/null +++ b/Docker/cnv-opt-exomedepth/Dockerfile @@ -0,0 +1,21 @@ +FROM ubuntu:xenial +MAINTAINER biodatageeks + +RUN apt-get update +RUN apt-get install -y software-properties-common +RUN apt-key adv --keyserver keyserver.ubuntu.com --recv-keys E298A3A825C0D65DFD57CBB651716619E084DAB9 +RUN add-apt-repository 'deb [arch=amd64,i386] https://cran.rstudio.com/bin/linux/ubuntu xenial/' +RUN apt-get install -y apt-transport-https + +RUN apt-get update && \ + apt-get upgrade -y && \ + apt-get install -y r-base libssl-dev libssh2-1-dev libxml2-dev libcurl4-openssl-dev libpq-dev + +RUN Rscript -e "source('https://bioconductor.org/biocLite.R');biocLite('Biostrings')" +RUN Rscript -e "source('https://bioconductor.org/biocLite.R');biocLite('IRanges')" +RUN Rscript -e "source('https://bioconductor.org/biocLite.R');biocLite('Rsamtools')" +RUN Rscript -e "source('https://bioconductor.org/biocLite.R');biocLite('GenomicRanges')" +RUN Rscript -e "source('https://bioconductor.org/biocLite.R');biocLite('RCurl')" +RUN Rscript -e "source('https://bioconductor.org/biocLite.R');biocLite('GenomicAlignments')" +RUN Rscript -e "install.packages('ExomeDepth', repos = 'http://cran.us.r-project.org')" + diff --git a/Docker/cnv-opt-exomedepthcov/Dockerfile b/Docker/cnv-opt-exomedepthcov/Dockerfile new file mode 100644 index 0000000..28448f8 --- /dev/null +++ b/Docker/cnv-opt-exomedepthcov/Dockerfile @@ -0,0 +1,5 @@ +FROM biodatageeks/cnv-opt-exomedepth +MAINTAINER biodatageeks + +RUN Rscript -e "install.packages('EXOMEDEPTHCOV', repos = 'http://zsibio.ii.pw.edu.pl/nexus/repository/r-all')" + diff --git a/Docker/cnv-opt-reference-sample-set-selector/Dockerfile b/Docker/cnv-opt-reference-sample-set-selector/Dockerfile new file mode 100644 index 0000000..d854276 --- /dev/null +++ b/Docker/cnv-opt-reference-sample-set-selector/Dockerfile @@ -0,0 +1,8 @@ +FROM biodatageeks/cnv-opt-codex +MAINTAINER biodatageeks + +ARG CACHE_DATE=not_a_specified_date + +RUN Rscript -e "install.packages('ExomeDepth', repos = 'http://cran.us.r-project.org')" + +RUN Rscript -e "install.packages('REFERENCE.SAMPLE.SET.SELECTOR', repos = 'http://zsibio.ii.pw.edu.pl/nexus/repository/r-all')" diff --git a/Docker/cnv-opt-target-qc/Dockerfile b/Docker/cnv-opt-target-qc/Dockerfile new file mode 100644 index 0000000..8391b91 --- /dev/null +++ b/Docker/cnv-opt-target-qc/Dockerfile @@ -0,0 +1,4 @@ +FROM biodatageeks/cnv-opt-codex +MAINTAINER biodatageeks + +RUN Rscript -e "install.packages('TARGET.QC', repos = 'http://zsibio.ii.pw.edu.pl/nexus/repository/r-all')" diff --git a/Jenkinsfile b/Jenkinsfile index 4d46cc4..cb2dbad 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -2,7 +2,7 @@ pipeline { agent any stages { - stage('Test R code') { + /*stage('Test R code') { steps { echo 'Testing R code....' sh 'docker run -i --rm --network="host" -e CNV_OPT_PSQL_USER="cnv-opt" -e CNV_OPT_PSQL_PASSWORD="zsibio321" -e CNV_OPT_PSQL_DRV_URL="http://zsibio.ii.pw.edu.pl/nexus/repository/zsi-bio-raw/common/jdbc/postgresql-42.1.1.jar" -e CNV_OPT_PSQL_CONN_URL="jdbc:postgresql://cdh00.ii.pw.edu.pl:15432/cnv-opt" -w="/tmp" -v $(pwd | sed "s|/var/jenkins_home|/data/home/jenkins|g")/R:/tmp zsibio.ii.pw.edu.pl:50009/zsi-bio-toolset Rscript tests/run_tests.R' @@ -12,7 +12,7 @@ pipeline { junit '**R/tests/*.xml' } } - } + }*/ stage('Build R package') { steps { @@ -21,37 +21,48 @@ pipeline { sh "cd R && R CMD build REFERENCE.SAMPLE.SET.SELECTOR/ && curl -v --user ${NEXUS_USER}:${NEXUS_PASS} --upload-file REFERENCE.SAMPLE.SET.SELECTOR_0.0.1.tar.gz http://zsibio.ii.pw.edu.pl/nexus/repository/r-zsibio/src/contrib/REFERENCE.SAMPLE.SET.SELECTOR_0.0.1.tar.gz" sh "cd R && R CMD build CODEXCOV/ && curl -v --user ${NEXUS_USER}:${NEXUS_PASS} --upload-file CODEXCOV_0.0.1.tar.gz http://zsibio.ii.pw.edu.pl/nexus/repository/r-zsibio/src/contrib/CODEXCOV_0.0.1.tar.gz" sh "cd R && R CMD build EXOMEDEPTHCOV/ && curl -v --user ${NEXUS_USER}:${NEXUS_PASS} --upload-file EXOMEDEPTHCOV_0.0.1.tar.gz http://zsibio.ii.pw.edu.pl/nexus/repository/r-zsibio/src/contrib/EXOMEDEPTHCOV_0.0.1.tar.gz" + sh "cd R && R CMD build EXOMECOPYCOV/ && curl -v --user ${NEXUS_USER}:${NEXUS_PASS} --upload-file EXOMECOPYCOV_0.0.1.tar.gz http://zsibio.ii.pw.edu.pl/nexus/repository/r-zsibio/src/contrib/EXOMECOPYCOV_0.0.1.tar.gz" sh "cd R && R CMD build CANOESCOV/ && curl -v --user ${NEXUS_USER}:${NEXUS_PASS} --upload-file CANOESCOV_0.0.1.tar.gz http://zsibio.ii.pw.edu.pl/nexus/repository/r-zsibio/src/contrib/CANOESCOV_0.0.1.tar.gz" + sh "cd R && R CMD build CANOES/ && curl -v --user ${NEXUS_USER}:${NEXUS_PASS} --upload-file CANOES_0.0.1.tar.gz http://zsibio.ii.pw.edu.pl/nexus/repository/r-zsibio/src/contrib/CANOES_0.0.1.tar.gz" sh "cd R && R CMD build CNVCALLER.RUNNER/ && curl -v --user ${NEXUS_USER}:${NEXUS_PASS} --upload-file CNVCALLER.RUNNER_0.0.1.tar.gz http://zsibio.ii.pw.edu.pl/nexus/repository/r-zsibio/src/contrib/CNVCALLER.RUNNER_0.0.1.tar.gz" sh "cd R && R CMD build CNVCALLER.EVALUATOR/ && curl -v --user ${NEXUS_USER}:${NEXUS_PASS} --upload-file CNVCALLER.EVALUATOR_0.0.1.tar.gz http://zsibio.ii.pw.edu.pl/nexus/repository/r-zsibio/src/contrib/CNVCALLER.EVALUATOR_0.0.1.tar.gz" + sh "cd R && R CMD build CNV.SIMULATOR/ && curl -v --user ${NEXUS_USER}:${NEXUS_PASS} --upload-file CNV.SIMULATOR_0.0.1.tar.gz http://zsibio.ii.pw.edu.pl/nexus/repository/r-zsibio/src/contrib/CNV.SIMULATOR_0.0.1.tar.gz" } } stage('Test Scala code') { steps { - slackSend botUser: true, channel: '#development', message: 'started ${env.JOB_NAME} ${env.BUILD_NUMBER} (<${env.BUILD_URL}|Open>)', teamDomain: 'zsibio.slack.com' + // slackSend botUser: true, channel: '#development', message: 'started ${env.JOB_NAME} ${env.BUILD_NUMBER} (<${env.BUILD_URL}|Open>)', teamDomain: 'zsibio.slack.com' echo 'Testing Scala code....' - sh "${tool name: 'sbt-0.13.15', type: 'org.jvnet.hudson.plugins.SbtPluginBuilder$SbtInstallation'}/bin/sbt test" - } - post { - always { - junit '**/target/test-reports/*.xml' - } + // sh "${tool name: 'sbt-0.13.15', type: 'org.jvnet.hudson.plugins.SbtPluginBuilder$SbtInstallation'}/bin/sbt test" } + // post { + // always { + // junit '**/target/test-reports/*.xml' + // } + // } } stage('Package scala code') { steps { echo 'Building Scala code....' - sh "${tool name: 'sbt-0.13.15', type: 'org.jvnet.hudson.plugins.SbtPluginBuilder$SbtInstallation'}/bin/sbt package" + // sh "${tool name: 'sbt-0.13.15', type: 'org.jvnet.hudson.plugins.SbtPluginBuilder$SbtInstallation'}/bin/sbt package" echo "Generating documentation" - sh "${tool name: 'sbt-0.13.15', type: 'org.jvnet.hudson.plugins.SbtPluginBuilder$SbtInstallation'}/bin/sbt doc" - publishHTML([allowMissing: false, alwaysLinkToLastBuild: true, keepAll: false, reportDir: 'target/scala-2.11/api/', reportFiles: 'package.html', reportName: 'Scala Doc', reportTitles: '']) + // sh "${tool name: 'sbt-0.13.15', type: 'org.jvnet.hudson.plugins.SbtPluginBuilder$SbtInstallation'}/bin/sbt doc" + // publishHTML([allowMissing: false, alwaysLinkToLastBuild: true, keepAll: false, reportDir: 'target/scala-2.11/api/', reportFiles: 'package.html', reportName: 'Scala Doc', reportTitles: '']) } } + + stage('Build Docker images') { + steps { + echo 'Building Docker images....' + sh './build.sh' + } + } + stage('Publish to Nexus snapshots and copying assembly fat jar to the edge server') { when { branch 'master' diff --git a/R/CANOES/DESCRIPTION b/R/CANOES/DESCRIPTION new file mode 100644 index 0000000..0824435 --- /dev/null +++ b/R/CANOES/DESCRIPTION @@ -0,0 +1,18 @@ +Package: CANOES +Title: CANOES Package +Version: 0.0.1 +Authors@R: c( + person("Tomasz", "Gambin", email = "tgambin@gmail.com", role = c("aut", "cre")), + person("Marek", "Wiewiórka", email = "marek.wiewiorka@gmail.com", role = c("aut")), + person("Wiktor", "Kuśmirek", email = "kusmirekwiktor@gmail.com", role = c("aut"))) +Description: An implementation of the CANOES package in R. +Depends: + R (>= 3.2.3), + plyr (>= 1.8.4), + nnls (>= 1.4.0), + Hmisc (>= 4.0.0), + mgcv (>= 1.8.0) +License: GPL-3 +Encoding: UTF-8 +LazyData: true +RoxygenNote: 6.0.1.9000 diff --git a/R/CANOES/NAMESPACE b/R/CANOES/NAMESPACE new file mode 100644 index 0000000..884a631 --- /dev/null +++ b/R/CANOES/NAMESPACE @@ -0,0 +1,2 @@ +# Generated by roxygen2: fake comment so roxygen2 overwrites silently. +exportPattern("^[^\\.]") diff --git a/R/CANOES/R/functions_CANOES.R b/R/CANOES/R/functions_CANOES.R new file mode 100644 index 0000000..b3077b4 --- /dev/null +++ b/R/CANOES/R/functions_CANOES.R @@ -0,0 +1,685 @@ +# Constants +NUM.ABNORMAL.STATES=2 +NUM.STATES=3 +DELETION=1 +NORMAL=2 +DUPLICATION=3 + +# PlotCNV +# Plots count data for targets of interest +# highlights sample of interest in red, +# highlights area of interest with a black line +# highlights probe locations with black dots +# Arguments: +# counts: +# count matrix, with column "target" with target numbers +# and sample data in columns 6:end +# sample.name: +# sample of interest (will be highlighted in red in figure) +# (should correspond to a column in counts) +# targets: +# targets of interest in the form start.target..end.target +# offset: +# number of targets to add on either end (default=1) +# Returns: +# returns nothing +PlotCNV <- function(counts, sample.name, targets, offset=1){ + sample.name <- as.character(sample.name) + if (!sample.name %in% names(counts)){stop("No column for sample ", sample.name, " in counts matrix")} + if (length(setdiff("target", names(counts)[1:5]) > 0)){ + stop("counts matrix must have column named target") + } + t <- as.character(targets) + start.target <- as.numeric(unlist(strsplit(t, "..", fixed=T))[1]) + end.target <- as.numeric(unlist(strsplit(t, "..", fixed=T))[2]) + if (!start.target %in% counts$target){ + stop("no data for start.target in counts matrix") + } + if (!end.target %in% counts$target){ + stop("no data for end.target in counts matrix") + } + if ((start.target - offset) %in% counts$target){ + start.target <- start.target - offset + } + if ((end.target + offset) %in% counts$target){ + end.target <- end.target + offset + } + ref.sample.names <- setdiff(as.character(names(counts)[-seq(1,5)]), + sample.name) + data <- subset(counts, target >= start.target & target <= end.target) + sample.data <- data[, sample.name] + means <- apply(data[, ref.sample.names], 1, mean) + sd <- sqrt(apply(data[, ref.sample.names], 1, var)) + refs.z.scores <- matrix(NA, nrow(data), length(ref.sample.names)) + sample.z.score <- numeric(length = nrow(data)) + for (i in seq(1, dim(data)[1])){ + refs.z.scores[i, ] <- as.numeric((data[i, ref.sample.names] - means[i]) / + max(0.000001, sd[i])) + sample.z.score[i] <- (sample.data[i] - means[i]) / max(0.000001, sd[i]) + } + ylim <- max(abs(refs.z.scores), abs(sample.z.score)) + plot(seq(-6, 6), seq(-6, 6), + xlim=c(data[1, "start"], data[dim(data)[1], "start"]), + ylim=c(-ylim - 0.1, ylim + 0.1), type="n", xlab="", ylab="Z-score") + for (i in seq(1, length(ref.sample.names))){ + lines(data[, "start"], refs.z.scores[, i], col="#2f4f4f85") + } + lines(data[, "start"], sample.z.score, col="red", lwd=3) + points(data[, "start"], rep(-ylim - 0.05, length(data[, "start"])), pch=20) + lines( c(data[1 + offset, "start"], data[nrow(data) - offset, "end"]) , + c(ylim+0.2, ylim+0.2), lwd=2) + title(main=paste("Sample ", sample.name, ", ", + counts$chromosome[start.target], ":", + data$start[1], "-", data$end[nrow(data)], sep="")) +} + +# CallCNVs +# Calls CNVs in sample of interest +# Arguments: +# sample.name: +# sample to call CNVs in (should correspond to a column in counts) +# counts: +# count matrix, first five columns should be +# target: consecutive numbers for targets (integer) +# chromosome: chromosome number (integer-valued) +# (support for sex chromosomes to come) +# start: start position of probe (integer) +# end: end position of probe (integer) +# gc: gc content (real between 0 and 1) +# subsequent columns should include counts for each probe for samples +# p: +# average rate of occurrence of CNVs (real) default is 1e-08 +# D: +# expected distance between targets in a CNV (integer) default is 70,000 +# Tnum: +# expected number of targets in a CNV (integer) default is 6 +# numrefs +# maximum number of reference samples to use (integer) default is 30 +# the weighted variance calculations will take a long time if too +# many reference samples are used +# Returns: +# data frame with the following columns: +# SAMPLE: name of sample +# CNV: DEL of DUP +# INTERVAL: CNV coordinates in the form chr:start-stop +# KB: length of CNV in kilobases +# CHR: chromosome +# MID_BP: middle base pair of CNV +# TARGETS: target numbers of CNV in the form start..stop +# NUM_TARG: how many targets are in the CNV +# Q_SOME: a Phred-scaled quality score for the CNV +CallCNVs <- function(sample.name, counts, p=1e-08, Tnum=6, D=70000, numrefs=30, get.dfs=F, homdel.mean=0.2){ + if (!sample.name %in% names(counts)){stop("No column for sample ", sample.name, " in counts matrix")} + if (length(setdiff(names(counts)[1:5], c("target", "chromosome", "start", "end", "gc"))) > 0){ + stop("First five columns of counts matrix must be target, chromosome, start, end, gc") + } + if (length(setdiff(unique(counts$chromosome), seq(1:22))) > 0) { + # remove sex chromosomes + cat("Trying to remove sex chromosomes and 'chr' prefixes\n") + counts <- subset(counts, !chromosome %in% c("chrX", "chrY", "X", "Y")) + if (sum(grepl("chr", counts$chromosome))==length(counts$chromosome)){ + counts$chromosome <- gsub("chr", "", counts$chromosome) + } + counts$chromosome <- as.numeric(counts$chromosome) + if (length(setdiff(unique(counts$chromosome), seq(1:22))) > 0) + stop("chromosome must take value in range 1-22 (support for sex chromosomes to come)") + } + library(plyr) + counts <- arrange(counts, chromosome, start) + if (p <= 0){ + stop("parameter p must be positive") + } + if (Tnum <= 0){ + stop("parameter Tnum must be positive") + } + if (D <= 0){ + stop("parameter D must be positive") + } + if (numrefs <= 0){ + stop("parameter numrefs must be positive") + } + sample.names <- colnames(counts)[-seq(1,5)] + # find mean coverage of probes + mean.counts <- mean(apply(counts[, sample.names], 2, mean)) + # normalize counts; round so we can use negative binomial + counts[, sample.names] <- apply(counts[, sample.names], 2, + function(x, mean.counts) + round(x * mean.counts / mean(x)), mean.counts) + # calculate covariance of read count across samples + cov <- cor(counts[, sample.names], counts[, sample.names]) + reference.samples <- setdiff(sample.names, sample.name) + covariances <- cov[sample.name, reference.samples] + reference.samples <- names(sort(covariances, + decreasing=T)[1:min(numrefs, length(covariances))]) + sample.mean.counts <- mean(counts[, sample.name]) + sample.sumcounts <- apply(counts[, reference.samples], 2, sum) + # normalize reference samples to sample of interest + counts[, reference.samples] <- apply(counts[, reference.samples], 2, + function(x, sample.mean.counts) + round(x * sample.mean.counts / + mean(x)), sample.mean.counts) + # select reference samples and weightings using non-negative least squares + b <- counts[, sample.name] + A <- as.matrix(counts[, reference.samples]) + library(nnls) + all <- nnls(A, b)$x + est <- matrix(0, nrow=50, ncol=length(reference.samples)) + set.seed(1) + for (i in 1:50){ + d <- sample(nrow(A), min(500, nrow(A))) + est[i, ] <- nnls(A[d, ], b[d])$x + } + weights <- colMeans(est) + sample.weights <- weights / sum(weights) + library(Hmisc) + # calculate weighted mean of read count + # this is used to calculate emission probabilities + counts$mean <- apply(counts[, reference.samples], + 1, wtd.mean, sample.weights) + targets <- counts$target + # exclude probes with all zero counts + nonzero.rows <- counts$mean > 0 + nonzero.rows.df <- data.frame(target=counts$target, + nonzero.rows=nonzero.rows) + + counts <- counts[nonzero.rows, ] + # get the distances between consecutive probes + distances <- GetDistances(counts) + # estimate the read count variance at each probe + var.estimate <- EstimateVariance(counts, reference.samples, + sample.weights) + emission.probs <- EmissionProbs(counts[, sample.name], + counts$mean, var.estimate$var.estimate, + counts[, "target"]) + if (get.dfs){ + return(list(emission.probs=emission.probs, distances=distances)) + } + # call CNVs with the Viterbi algorithm + viterbi.state <- Viterbi(emission.probs, distances, p, Tnum, D) + # format the CNVs + cnvs <- PrintCNVs(sample.name, viterbi.state, + counts) + # if there aren't too many CNVs, calculate the Q_SOME + if (nrow(cnvs) > 0 & nrow(cnvs) <= 50){ + qualities <- GenotypeCNVs(cnvs, sample.name, counts, p, Tnum, D, numrefs, + emission.probs=emission.probs, + distances=distances) + for (i in 1:nrow(cnvs)){ + cnvs$Q_SOME[i] <- ifelse(cnvs$CNV[i]=="DEL", qualities[i, "SQDel"], + qualities[i, "SQDup"]) + } + } + data <- as.data.frame(cbind(counts$target, counts$mean, var.estimate$var.estimate, counts[, sample.name])) + names(data) <- c("target", "countsmean", "varestimate", "sample") + if (nrow(cnvs) > 0){ + cnvs <- CalcCopyNumber(data, cnvs, homdel.mean) + } + return(cnvs) +} + +# GenotypeCNVs +# Genotype CNVs in sample of interest +# Arguments: +# xcnv +# data frame with the following columns, and one row for each +# CNV to genotype +# INTERVAL: CNV coordinates in the form chr:start-stop +# TARGETS: target numbers of CNV in the form start..stop +# these should correspond to the target numbers in counts +# sample.name: +# sample to genotype CNVs in (should correspond to a column in counts) +# counts: +# count matrix, first five columns should be +# target: consecutive numbers for targets (integer) +# chromosome: chromosome number (integer-valued) +# (support for sex chromosomes to come) +# start: start position of probe (integer) +# end: end position of probe (integer) +# gc: gc content (real between 0 and 1) +# subsequent columns should include counts for each probe for samples +# p: +# average rate of occurrence of CNVs (real) default is 1e-08 +# D: +# expected distance between targets in a CNV (integer) default is 70,000 +# Tnum: +# expected number of targets in a CNV (integer) default is 6 +# numrefs +# maximum number of reference samples to use (integer) default is 30 +# the weighted variance calculations will take a long time if too +# many reference samples are used +# emission.probs and distances are for internal use only +# Returns: +# data frame with the following columns and one row for each genotyped CNV: +# INTERVAL: CNV coordinates in the form chr:start-stop +# NQDEL: a Phred-scaled quality score that sample.name has no deletion +# in the interval +# SQDEL: a Phred-scaled quality score that sample.name has a deletion +# in the interval +# NQDUP and SQDUP: same, but for a duplication +GenotypeCNVs <- function(xcnvs, sample.name, counts, p=1e-08, Tnum=6, + D=70000, numrefs=30, + emission.probs=NULL, + distances=NULL){ + if (!sample.name %in% names(counts)){stop("No column for sample ", sample.name, " in counts matrix")} + if (length(setdiff(names(counts)[1:5], c("target", "chromosome", "start", "end", "gc"))) > 0){ + stop("First five columns of counts matrix must be target, chromosome, start, end, gc") + } + if (length(setdiff(unique(counts$chromosome), seq(1:22))) > 0) { + # remove sex chromosomes + cat("Trying to remove sex chromosomes and 'chr' prefixes\n") + counts <- subset(counts, !chromosome %in% c("chrX", "chrY", "X", "Y")) + if (sum(grepl("chr", counts$chromosome))==length(counts$chromosome)){ + counts$chromosome <- gsub("chr", "", counts$chromosome) + } + counts$chromosome <- as.numeric(counts$chromosome) + if (length(setdiff(unique(counts$chromosome), seq(1:22))) > 0) + stop("chromosome must take value in range 1-22 (support for sex chromosomes to come)") + } + library(plyr) + counts <- arrange(counts, chromosome, start) + if (p <= 0){ + stop("parameter p must be positive") + } + if (Tnum <= 0){ + stop("parameter Tnum must be positive") + } + if (D <= 0){ + stop("parameter D must be positive") + } + if (numrefs <= 0){ + stop("parameter numrefs must be positive") + } + num.cnvs <- nrow(xcnvs) + cnv.intervals <- as.character(xcnvs$INTERVAL) + # if no emission probs matrix is passed in, generate a new one + if (is.null(emission.probs)){ + l <- CallCNVs(sample.name, counts, p, Tnum=6, D=70000, numrefs=30, get.dfs=T) + emission.probs <- l[['emission.probs']] + distances <- l[['distances']] + } + forward.m <- GetForwardMatrix(emission.probs, distances, p, Tnum, D) + backward.m <- GetBackwardMatrix(emission.probs, distances, p, Tnum, D) + qualities <- matrix(0, nrow=num.cnvs, ncol=5, + dimnames=list(cnv.intervals, + c("INTERVAL", "NQDel", "SQDel", "NQDup", "SQDup"))) + for (i in 1:num.cnvs){ + interval <- as.character(xcnvs[i, "INTERVAL"]) + targets <- as.numeric(strsplit(as.character(xcnvs[i, "TARGETS"]), ".", fixed=T)[[1]][c(1,3)]) + left.target <- targets[1] + right.target <- targets[2] + likelihoods <- GetModifiedLikelihood(forward.m, backward.m, + emission.probs, distances, + left.target, right.target, + c(DUPLICATION, DELETION), p, Tnum, D) + modified.likelihood <- likelihoods[1]; + unmodified.likelihood <- likelihoods[2] + Prob.All.Normal <- exp(modified.likelihood - unmodified.likelihood) + likelihoods <- GetModifiedLikelihood(forward.m, backward.m, + emission.probs, distances, + left.target, right.target, DELETION, p, Tnum, D) + modified.likelihood <- likelihoods[1]; + unmodified.likelihood <- likelihoods[2] + Prob.No.Deletion <- exp(modified.likelihood - unmodified.likelihood) + likelihoods <- GetModifiedLikelihood(forward.m, backward.m, + emission.probs, distances, + left.target, right.target, DUPLICATION, p, Tnum, D) + modified.likelihood <- likelihoods[1]; + unmodified.likelihood <- likelihoods[2] + Prob.No.Duplication <- exp(modified.likelihood - unmodified.likelihood) + # Check if probabilities greater than 1 are numerical error or bug + Phred <- function(prob){ + return(round(min(99, -10 * log10(1 - prob)))) + } + qualities[i, "NQDel"] <- Phred(Prob.No.Deletion) + qualities[i, "SQDel"] <- Phred(Prob.No.Duplication - Prob.All.Normal) + qualities[i, "NQDup"] <- Phred(Prob.No.Duplication) + qualities[i, "SQDup"] <- Phred(Prob.No.Deletion - Prob.All.Normal) + qualities[i, "INTERVAL"] <- interval + } + qualities <- as.data.frame(qualities, stringsAsFactors=F) + qualities$NQDel <- as.integer(qualities$NQDel) + qualities$NQDup <- as.integer(qualities$NQDup) + qualities$SQDel <- as.integer(qualities$SQDel) + qualities$SQDup <- as.integer(qualities$SQDup) + return(qualities) +} + +# returns data frame with distance to each target from the previous target +# (0 in the case of the first target on chromosome 1, a very big number +# for the first target on each other chromosome--this resets the HMM +# for each chromosome) +GetDistances <- function(counts){ + chromosome <- counts[, "chromosome"] + startbase <- counts[, "start"] + num.nonzero.exons <- length(startbase) + distances <- c(0, startbase[2:num.nonzero.exons] - + startbase[1:(num.nonzero.exons - 1)] + + 1000000000000 * (chromosome[2:num.nonzero.exons] - + chromosome[1:(num.nonzero.exons - 1)])) + return(data.frame(target=counts[, "target"], distance=distances)) +} + +EstimateVariance <- function(counts, ref.sample.names, sample.weights){ + library(Hmisc) + counts$var <- apply(counts[, ref.sample.names], 1, wtd.var, sample.weights, normwt=T) + set.seed(1) + counts.subset <- counts[sample(nrow(counts), min(36000, nrow(counts))), ] + library(mgcv) + # can't do gamma regression with negative + counts.subset$var[counts.subset$var==0] <- 0.1 + fit <- gam(var ~ s(mean) + s(gc), family=Gamma(link=log), data=counts.subset) + # we don't want variance less than Poisson + # we take maximum of genome-wide estimate, method of moments estimate + # and Poisson variance + v.estimate <- pmax(predict(fit, counts, type="response"), counts$var, + counts$mean * 1.01) + return(data.frame(target=counts$target, var.estimate=v.estimate)) +} + +EmissionProbs <- function(test.counts, target.means, + var.estimate, targets){ + num.targets <- length(test.counts) + # calculate the means for the deletion, normal and duplication states + state.target.means <- t(apply(data.frame(x=target.means), 1, function(x) c(x*1/2, x, x*3/2))) + # calculate the expected size (given the predicted variance) + size <- target.means ^ 2 / (var.estimate - target.means) + emission.probs <- matrix(NA, num.targets, 4) + colnames(emission.probs) <- c("target", "delprob", "normalprob", "dupprob") + # calculate the emission probabilities given the read count + size.del <- size + size.dup <- size + size.del <- size / 2 + size.dup <- size * 3 / 2 + emission.probs[, "delprob"] <- dnbinom( + test.counts, + mu=state.target.means[, 1], + size=size.del, log=T) + emission.probs[, "normalprob"] <- dnbinom( + test.counts, + mu=state.target.means[, 2], + size=size, log=T) + emission.probs[, "dupprob"] <- dnbinom( + test.counts, + mu=state.target.means[, 3], + size=size.dup, log=T) + emission.probs[, "target"] <- targets + # some values may be infinite as a result of extreme read count + row.all.inf <- which(apply(emission.probs, 1, function(x){all(is.infinite(x))})) + if (length(row.all.inf) > 0){ + for (i in row.all.inf){ + if (test.counts[i] >= state.target.means[i, 3]){ + emission.probs[i, 2:4] <- c(-Inf, -Inf, -0.01) + } + else if (test.counts[i] <= state.target.means[i, 1]){ + emission.probs[i, 2:4] <- c(-0.01, -Inf, -Inf) + } + else emission.probs[i, 2:4] <- c(-Inf, -0.01, -Inf) + } + } + return(emission.probs) +} + +# Viterbi algorithm +Viterbi <- function(emission.probs.matrix, distances, p, Tnum, D){ + targets <- emission.probs.matrix[, 1] + emission.probs.matrix <- as.matrix(emission.probs.matrix[, 2:4]) + num.exons <- dim(emission.probs.matrix)[1] + viterbi.matrix <- matrix(NA, nrow=num.exons, ncol=NUM.STATES) + viterbi.pointers <- matrix(NA, nrow=num.exons, ncol=NUM.STATES) + initial.state <- log(c(0.0075 / NUM.ABNORMAL.STATES, 1 - 0.0075, 0.0075 / NUM.ABNORMAL.STATES)) + viterbi.matrix[1, ] <- initial.state + emission.probs.matrix[1,] + for (i in 2:num.exons) { + temp.matrix <- viterbi.matrix[i - 1, ] + GetTransitionMatrix(distances$distance[i], p, Tnum, D) + viterbi.matrix[i, ] <- apply(temp.matrix, 2, max) + emission.probs <- c(emission.probs.matrix[i,]) + dim(emission.probs) <- c(NUM.STATES, 1) + viterbi.matrix[i, ] <- viterbi.matrix[i, ] + emission.probs + viterbi.pointers[i, ] <- apply(temp.matrix, 2, which.max) + } + viterbi.states = vector(length = num.exons) + viterbi.states[num.exons] = which.max(viterbi.matrix[num.exons, ]) + for (i in (num.exons - 1):1) { + viterbi.states[i] <- viterbi.pointers[i + 1, viterbi.states[i + 1]] + } + return(data.frame(target=targets, viterbi.state=viterbi.states)) +} + +# returns a transition matrix +# to state +# deletion normal duplication +# deletion +#from state normal +# duplication +GetTransitionMatrix <- function(distance, p, Tnum, D){ + q <- 1 / Tnum + f = exp(-distance/D) + prob.abnormal.abnormal <- f * (1 - q) + (1 - f) * p + prob.abnormal.normal <- f * q + (1 - f) * (1 - 2 * p) + prob.abnormal.diff.abnormal <- (1 - f) * p + prob.normal.normal <- 1 - 2 * p + prob.normal.abnormal <- p + transition.probs <- + c(prob.abnormal.abnormal, prob.abnormal.normal, prob.abnormal.diff.abnormal, + prob.normal.abnormal, prob.normal.normal, prob.normal.abnormal, + prob.abnormal.diff.abnormal, prob.abnormal.normal, prob.abnormal.abnormal) + transition.m = log(matrix(transition.probs, NUM.STATES, NUM.STATES, byrow=TRUE)) + return(transition.m) +} + +# adds two log-space probabilities using the identity +# log (p1 + p2) = log p1 + log(1 + exp(log p2 - log p1)) +AddTwoProbabilities <- function(x, y){ + if (is.infinite(x)) return (y) + if (is.infinite(y)) return (x) + sum.probs <- max(x, y) + log1p(exp(-abs(x - y))) +} + +# adds multiple log-space probabilities +SumProbabilities <- function(x){ + sum.probs <- x[1] + for (i in 2:length(x)){ + sum.probs <- AddTwoProbabilities(sum.probs, x[i]) + } + return(sum.probs) +} + +# finds the data likelihood by summing the product of the corresponding +# forward and backward probabilities at any token (should give the same value +# regardless of the token) +GetLikelihood <- function(forward.matrix, backward.matrix, x){ + SumProbabilities(forward.matrix[x, ] + backward.matrix[x, ]) +} + +# get the forward probabilities +GetForwardMatrix <- function(emission.probs.matrix, distances, p, Tnum, D){ + emission.probs.matrix <- as.matrix(emission.probs.matrix[, 2:4]) + num.exons <- dim(emission.probs.matrix)[1] + forward.matrix <- matrix(NA, nrow=num.exons, ncol=NUM.STATES) # matrix to hold forward probabilities + initial.state <- log(c(0.0075 / NUM.ABNORMAL.STATES, 1 - 0.0075, 0.0075 / NUM.ABNORMAL.STATES)) + forward.matrix[1, ] <- initial.state + emission.probs.matrix[1, ] + for (i in 2:num.exons){ + # compute matrix with probability we were in state j and are now in state i + # in temp.matrix[j, i] (ignoring emission of current token) + temp.matrix <- forward.matrix[i - 1, ] + GetTransitionMatrix(distances$distance[i], p, Tnum, D) + # find the probability that we are in each of the three states + sum.probs <- apply(temp.matrix, 2, SumProbabilities) + forward.matrix[i, ] <- sum.probs + emission.probs.matrix[i, ] + } + return(forward.matrix) +} + +# get the backward probabilities +GetBackwardMatrix <- function(emission.probs.matrix, distances, + p, Tnum, D){ + emission.probs.matrix <- as.matrix(emission.probs.matrix[, 2:4]) + num.exons <- dim(emission.probs.matrix)[1] + backward.matrix <- matrix(NA, nrow=num.exons, ncol=NUM.STATES) # matrix to hold backward probabilities + initial.state <- log(c(0.0075 / NUM.ABNORMAL.STATES, 1 - 0.0075, 0.0075 / NUM.ABNORMAL.STATES)) + backward.matrix[num.exons, ] <- rep(0, NUM.STATES) + for (i in (num.exons - 1):1){ + temp.matrix <- GetTransitionMatrix(distances$distance[i+1], p, Tnum, D) + + matrix(backward.matrix[i + 1, ], 3, 3, byrow=T) + + matrix(emission.probs.matrix[i+1, ], 3, 3, byrow=T) + backward.matrix[i, ] <- apply(temp.matrix, 1, SumProbabilities) + } + final.prob <- backward.matrix[1, ] + emission.probs.matrix[1, ] + initial.state + return(backward.matrix) +} + +# find the likelihood of the data given that certain states are disallowed +# between start target and end target +GetModifiedLikelihood <- function(forward.matrix, backward.matrix, emission.probs.matrix, distances, + start.target, end.target, disallowed.states, p, Tnum, D){ + targets <- emission.probs.matrix[, 1] + emission.probs.matrix <- as.matrix(emission.probs.matrix[, 2:4]) + # there may be missing targets in this sample, we genotype the largest stretch of + # targets that lie in the CNV + left.target <- min(which(targets >= start.target)) + right.target <- max(which(targets <= end.target)) + num.exons <- dim(emission.probs.matrix)[1] + unmodified.likelihood <- GetLikelihood(forward.matrix, + backward.matrix, min(right.target + 1, num.exons)) + #right.target or left.target may be empty + + #if (right.target >= left.target) return(c(NA, unmodified.likelihood)) + stopifnot(right.target >= left.target) + modified.emission.probs.matrix <- emission.probs.matrix + modified.emission.probs.matrix[left.target:right.target, + disallowed.states] <- -Inf + + # if the start target is the first target we need to recalculate the + # forward probabilities + # for that target, using the modified emission probabilities + if (left.target == 1){ + initial.state <- log(c(0.0075 / NUM.ABNORMAL.STATES, 1 - 0.0075, 0.0075 / NUM.ABNORMAL.STATES)) + forward.matrix[1, ] <- initial.state + modified.emission.probs.matrix[1, ] + left.target <- left.target + 1 + } + for (i in seq(left.target, min(right.target + 1, num.exons))){ + # compute matrix with probability we were in state j and are now in state i + # in temp.matrix[j, i] (ignoring emission of current token) + temp.matrix <- forward.matrix[i - 1, ] + GetTransitionMatrix(distances$distance[i], p, Tnum, D) + # find the probability that we are in each of the three states + sum.probs <- apply(temp.matrix, 2, SumProbabilities) + if (!i == (right.target + 1)){ + forward.matrix[i, ] <- sum.probs + modified.emission.probs.matrix[i, ] + } else{ + forward.matrix[i, ] <- sum.probs + emission.probs.matrix[i, ] + } + } + # find the modified likelihood of the sequence + modified.likelihood <- GetLikelihood(forward.matrix, backward.matrix, min(right.target + 1, num.exons)) + return(c(modified.likelihood, unmodified.likelihood)) +} + +SummarizeCNVs <- function(cnv.targets, counts, sample.name, state){ + sample.name <- sample.name + cnv.type <- ifelse(state==3, "DUP", "DEL") + cnv.start <- min(cnv.targets$target) + cnv.end <- max(cnv.targets$target) + cnv.chromosome <- counts[cnv.start, "chromosome"] + cnv.start.base <- counts[cnv.start, "start"] + cnv.start.target <- counts[cnv.start, "target"] + cnv.end.base <- counts[cnv.end, "end"] + cnv.end.target <- counts[cnv.end, "target"] + cnv.kbs <- (cnv.end.base - cnv.start.base) / 1000 + cnv.midbp <- round((cnv.end.base - cnv.start.base) / 2) + cnv.start.base + cnv.targets <- paste(cnv.start.target, "..", cnv.end.target, sep="") + cnv.interval <- paste(cnv.chromosome, ":", cnv.start.base, "-", cnv.end.base, sep="") + num.targets <- cnv.end.target - cnv.start.target + 1 + return(data.frame(sample.name=sample.name, cnv.type=cnv.type, cnv.interval=cnv.interval, + cnv.kbs=cnv.kbs, cnv.chromosome=cnv.chromosome, + cnv.midbp=cnv.midbp, cnv.targets=cnv.targets, num.targets=num.targets)) +} + +PrintCNVs <- function(test.sample.name, viterbi.state, + nonzero.counts){ + consecutiveGroups <- function(sequence){ + num <- length(sequence) + group <- 1 + groups <- rep(0, num) + groups[1] <- group + if (num > 1){ + for (i in 2:num){ + if (!sequence[i] == (sequence[i - 1] + 1)) group <- group + 1 + groups[i] <- group + } + } + return(groups) + } + num.duplications <- 0 + num.deletions <- 0 + for (state in c(1, 3)){ + cnv.targets <- which(viterbi.state$viterbi.state == state) + if (!length(cnv.targets) == 0){ + groups <- consecutiveGroups(cnv.targets) + library(plyr) + cnvs.temp.df <- ddply(data.frame(target=cnv.targets, group=groups), + "group", SummarizeCNVs, nonzero.counts, test.sample.name, + state) + if (state == 1){ + deletions.df <- cnvs.temp.df + if (!is.null(dim(deletions.df))){ + num.deletions <- dim(deletions.df)[1] + } + } else { + duplications.df <- cnvs.temp.df + if (!is.null(dim(duplications.df))){ + num.duplications <- dim(duplications.df)[1] + } + } + } + } + num.calls <- num.deletions + num.duplications + cat(num.calls, "CNVs called in sample", test.sample.name, "\n") + if (num.deletions == 0 & num.duplications == 0){ + df <- data.frame(SAMPLE=character(0), CNV=character(0), INTERVAL=character(0), + KB=numeric(0), CHR=character(0), + MID_BP=numeric(), TARGETS=character(0), NUM_TARG=numeric(0), Q_SOME=numeric(0), MLCN=numeric(0)) + return(df) + } + if (num.deletions > 0 & num.duplications > 0){ + cnvs.df <- rbind(deletions.df, duplications.df) + } else { + ifelse(num.deletions > 0, + cnvs.df <- deletions.df, cnvs.df <- duplications.df) + } + xcnv <- cbind(cnvs.df[, c("sample.name", "cnv.type", "cnv.interval", + "cnv.kbs", "cnv.chromosome", "cnv.midbp", + "cnv.targets", "num.targets")], 0) + colnames(xcnv) <- c("SAMPLE", "CNV", "INTERVAL", "KB", "CHR", "MID_BP", "TARGETS", + "NUM_TARG", "MLCN") + xcnv$Q_SOME <- NA + return(xcnv) +} + +CalcCopyNumber <- function(data, cnvs, homdel.mean){ + for (i in 1:nrow(cnvs)){ + cnv <- cnvs[i, ] + targets <- as.numeric(unlist(strsplit(as.character(cnv$TARGETS), "..", fixed=T))) + cnv.data <- subset(data, target >= targets[1] & target <= targets[2]) + state.target.means <- t(apply(data.frame(x=cnv.data$countsmean), 1, + function(x) c(C1=x*1/2, C2=x, C3=x*3/2, + C4=x * 2, C5=x * 5/2, C6=x*6/2))) + # calculate the expected size (given the predicted variance) + size <- cnv.data$countsmean ^ 2 / (cnv.data$varestimate - cnv.data$countsmean) + emission.probs <- matrix(NA, nrow(cnv.data), 7) + colnames(emission.probs) <- c("C0", "C1", "C2", "C3", "C4", "C5", "C6") + #colnames(emission.probs) <- c("target", "delprob", "normalprob", "dupprob") + # calculate the emission probabilities given the read count + emission.probs[, 1] <- dpois(cnv.data$sample, homdel.mean, log=T) + for (s in 1:6){ + size.state <- size * s/2 + emission.probs[, s+1] <- dnbinom(cnv.data$sample, mu=state.target.means[, s], + size=size.state, log=T) + } + cs <- colSums(emission.probs) + ml.state <- which.max(cs) - 1 + if (ml.state==2){ + ml.state <- ifelse(cnv$CNV=="DEL", 1, 3) + } + cnvs$MLCN[i] <- ml.state + } + return(cnvs) +} + diff --git a/R/CANOES/R/run_CANOES.R b/R/CANOES/R/run_CANOES.R new file mode 100644 index 0000000..65ddb36 --- /dev/null +++ b/R/CANOES/R/run_CANOES.R @@ -0,0 +1,32 @@ +Test <- function(){ + # read in the data + gc <- read.table("gc.txt")$V2 + canoes.reads <- read.table("canoes.reads.txt") + # rename the columns of canoes.reads + sample.names <- paste("S", seq(1:26), sep="") + names(canoes.reads) <- c("chromosome", "start", "end", sample.names) + # create a vector of consecutive target ids + target <- seq(1, nrow(canoes.reads)) + # combine the data into one data frame + canoes.reads <- cbind(target, gc, canoes.reads) + # call CNVs in each sample + # create a vector to hold the results for each sample + xcnv.list <- vector('list', length(sample.names)) + for (i in 1:length(sample.names)){ + xcnv.list[[i]] <- CallCNVs(sample.names[i], canoes.reads) + } + # combine the results into one data frame + xcnvs <- do.call('rbind', xcnv.list) + # inspect the first two CNV calls + print(head(xcnvs, 2)) + # plot all the CNV calls to a pdf + pdf("CNVplots.pdf") + for (i in 1:nrow(xcnvs)){ + PlotCNV(canoes.reads, xcnvs[i, "SAMPLE"], xcnvs[i, "TARGETS"]) + } + dev.off() + # genotype all the CNVs calls made above in sample S2 + genotyping.S2 <- GenotypeCNVs(xcnvs, "S2", canoes.reads) + # inspect the genotype scores for the first two CNV calls + print(head(genotyping.S2, 2)) +} diff --git a/R/CANOESCOV/DESCRIPTION b/R/CANOESCOV/DESCRIPTION index 314f419..27c73ca 100644 --- a/R/CANOESCOV/DESCRIPTION +++ b/R/CANOESCOV/DESCRIPTION @@ -12,14 +12,13 @@ Description: An extended implementation of the CANOES package in R. It extends Depends: R (>= 3.2.3), devtools (>= 1.13.2), - DBI (== 0.8), + DBI (>= 0.8), optparse (== 1.4.4), IRanges (>= 2.0.0), plyr (>= 1.8.4), nnls (>= 1.4.0), Hmisc (>= 4.0.0), - mgcv (>= 1.8.0), - REFERENCE.SAMPLE.SET.SELECTOR (>= 0.0.1) + mgcv (>= 1.8.0) License: GPL-3 Encoding: UTF-8 LazyData: true diff --git a/R/CANOESCOV/R/functions_CANOESCOV.R b/R/CANOESCOV/R/functions_CANOESCOV.R index 873e894..f3195a7 100644 --- a/R/CANOESCOV/R/functions_CANOESCOV.R +++ b/R/CANOESCOV/R/functions_CANOESCOV.R @@ -1,23 +1,24 @@ -coverageObj1 <- function(cov_table, sampname, targets_for_chr, chr){ - Y <- matrix(data=as.integer(0), nrow = nrow(targets_for_chr), ncol = 0) - for(sample in sampname) { - cov_targets_for_sample <- cov_table[cov_table[,"sample_name"] == sample,] - cov_targets_for_sample <- cov_targets_for_sample[with(cov_targets_for_sample, order(target_id)), ] - Y <- cbind(Y, cov_targets_for_sample[,"read_count"]) +# from CODEX package +getgc <- function(chr, ref) { + library(GenomeInfoDb) + library(BSgenome.Hsapiens.UCSC.hg19) + if (chr == "X" | chr == "x" | chr == "chrX" | chr == "chrx") { + chrtemp <- 23 + } else if (chr == "Y" | chr == "y" | chr == "chrY" | chr == "chry") { + chrtemp <- 24 + } else { + chrtemp <- as.numeric(mapSeqlevels(as.character(chr), "NCBI")[1]) } - colnames(Y) <- sampname - rownames(Y) <- targets_for_chr[,"target_id"] - return(list(Y=Y)) + if (length(chrtemp) == 0) + message("Chromosome cannot be found in NCBI Homo sapiens database!") + chrm <- unmasked(Hsapiens[[chrtemp]]) + seqs <- Views(chrm, ref) + af <- alphabetFrequency(seqs, baseOnly = TRUE, as.prob = TRUE) + gc <- round((af[, "G"] + af[, "C"]) * 100,2) + gc } -# Constants -NUM.ABNORMAL.STATES=2 -NUM.STATES=3 -DELETION=1 -NORMAL=2 -DUPLICATION=3 - # CallCNVs # Calls CNVs in sample of interest # Arguments: @@ -53,7 +54,14 @@ DUPLICATION=3 # TARGETS: target numbers of CNV in the form start..stop # NUM_TARG: how many targets are in the CNV # Q_SOME: a Phred-scaled quality score for the CNV -CallCNVs <- function(sample.name, counts, p=1e-08, Tnum=6, D=70000, reference_set_select_method='canoes', num_of_samples_in_reference_set=30, get.dfs=F, homdel.mean=0.2, target_length){ +CallCNVs <- function(sample.name, reference.samples, counts, p=1e-08, Tnum=6, D=70000, get.dfs=F, homdel.mean=0.2){ + library(IRanges) + library(BSgenome.Hsapiens.UCSC.hg19) + library(Biostrings) + library(Rsamtools) + library(GenomeInfoDb) + library(S4Vectors) + library(CANOES) if (!sample.name %in% names(counts)){stop("No column for sample ", sample.name, " in counts matrix")} if (length(setdiff(names(counts)[1:5], c("target", "chromosome", "start", "end", "gc"))) > 0){ stop("First five columns of counts matrix must be target, chromosome, start, end, gc") @@ -97,12 +105,12 @@ CallCNVs <- function(sample.name, counts, p=1e-08, Tnum=6, D=70000, reference_se #reference.samples <- names(sort(covariances, # decreasing=T)[1:min(numrefs, length(covariances))]) Y <- data.matrix(counts[,6:ncol(counts)]) - library('REFERENCE.SAMPLE.SET.SELECTOR') - reference.samples <- run_REFERENCE.SAMPLE.SET.SELECTOR(sample.name, - Y, - reference_set_select_method, - num_of_samples_in_reference_set, - target_length) + #library('REFERENCE.SAMPLE.SET.SELECTOR') + #reference.samples <- run_REFERENCE.SAMPLE.SET.SELECTOR(sample.name, + # Y, + # reference_set_select_method, + # num_of_samples_in_reference_set, + # target_length) sample.mean.counts <- mean(counts[, sample.name]) sample.sumcounts <- apply(counts[, reference.samples], 2, sum) # normalize reference samples to sample of interest @@ -168,469 +176,3 @@ CallCNVs <- function(sample.name, counts, p=1e-08, Tnum=6, D=70000, reference_se } return(cnvs) } - -# GenotypeCNVs -# Genotype CNVs in sample of interest -# Arguments: -# xcnv -# data frame with the following columns, and one row for each -# CNV to genotype -# INTERVAL: CNV coordinates in the form chr:start-stop -# TARGETS: target numbers of CNV in the form start..stop -# these should correspond to the target numbers in counts -# sample.name: -# sample to genotype CNVs in (should correspond to a column in counts) -# counts: -# count matrix, first five columns should be -# target: consecutive numbers for targets (integer) -# chromosome: chromosome number (integer-valued) -# (support for sex chromosomes to come) -# start: start position of probe (integer) -# end: end position of probe (integer) -# gc: gc content (real between 0 and 1) -# subsequent columns should include counts for each probe for samples -# p: -# average rate of occurrence of CNVs (real) default is 1e-08 -# D: -# expected distance between targets in a CNV (integer) default is 70,000 -# Tnum: -# expected number of targets in a CNV (integer) default is 6 -# numrefs -# maximum number of reference samples to use (integer) default is 30 -# the weighted variance calculations will take a long time if too -# many reference samples are used -# emission.probs and distances are for internal use only -# Returns: -# data frame with the following columns and one row for each genotyped CNV: -# INTERVAL: CNV coordinates in the form chr:start-stop -# NQDEL: a Phred-scaled quality score that sample.name has no deletion -# in the interval -# SQDEL: a Phred-scaled quality score that sample.name has a deletion -# in the interval -# NQDUP and SQDUP: same, but for a duplication -GenotypeCNVs <- function(xcnvs, sample.name, counts, p=1e-08, Tnum=6, - D=70000, numrefs=30, - emission.probs=NULL, - distances=NULL){ - if (!sample.name %in% names(counts)){stop("No column for sample ", sample.name, " in counts matrix")} - if (length(setdiff(names(counts)[1:5], c("target", "chromosome", "start", "end", "gc"))) > 0){ - stop("First five columns of counts matrix must be target, chromosome, start, end, gc") - } - if (length(setdiff(unique(counts$chromosome), seq(1:22))) > 0) { - # remove sex chromosomes - cat("Trying to remove sex chromosomes and 'chr' prefixes\n") - counts <- subset(counts, !chromosome %in% c("chrX", "chrY", "X", "Y")) - if (sum(grepl("chr", counts$chromosome))==length(counts$chromosome)){ - counts$chromosome <- gsub("chr", "", counts$chromosome) - } - counts$chromosome <- as.numeric(counts$chromosome) - if (length(setdiff(unique(counts$chromosome), seq(1:22))) > 0) - stop("chromosome must take value in range 1-22 (support for sex chromosomes to come)") - } - library(plyr) - counts <- arrange(counts, chromosome, start) - if (p <= 0){ - stop("parameter p must be positive") - } - if (Tnum <= 0){ - stop("parameter Tnum must be positive") - } - if (D <= 0){ - stop("parameter D must be positive") - } - if (numrefs <= 0){ - stop("parameter numrefs must be positive") - } - num.cnvs <- nrow(xcnvs) - cnv.intervals <- as.character(xcnvs$INTERVAL) - # if no emission probs matrix is passed in, generate a new one - if (is.null(emission.probs)){ - l <- CANOESCOV::CallCNVs(sample.name, counts, p, Tnum=6, D=70000, numrefs=30, get.dfs=T) - emission.probs <- l[['emission.probs']] - distances <- l[['distances']] - } - forward.m <- GetForwardMatrix(emission.probs, distances, p, Tnum, D) - backward.m <- GetBackwardMatrix(emission.probs, distances, p, Tnum, D) - qualities <- matrix(0, nrow=num.cnvs, ncol=5, - dimnames=list(cnv.intervals, - c("INTERVAL", "NQDel", "SQDel", "NQDup", "SQDup"))) - for (i in 1:num.cnvs){ - interval <- as.character(xcnvs[i, "INTERVAL"]) - targets <- as.numeric(strsplit(as.character(xcnvs[i, "TARGETS"]), ".", fixed=T)[[1]][c(1,3)]) - left.target <- targets[1] - right.target <- targets[2] - likelihoods <- GetModifiedLikelihood(forward.m, backward.m, - emission.probs, distances, - left.target, right.target, - c(DUPLICATION, DELETION), p, Tnum, D) - modified.likelihood <- likelihoods[1]; - unmodified.likelihood <- likelihoods[2] - Prob.All.Normal <- exp(modified.likelihood - unmodified.likelihood) - likelihoods <- GetModifiedLikelihood(forward.m, backward.m, - emission.probs, distances, - left.target, right.target, DELETION, p, Tnum, D) - modified.likelihood <- likelihoods[1]; - unmodified.likelihood <- likelihoods[2] - Prob.No.Deletion <- exp(modified.likelihood - unmodified.likelihood) - likelihoods <- GetModifiedLikelihood(forward.m, backward.m, - emission.probs, distances, - left.target, right.target, DUPLICATION, p, Tnum, D) - modified.likelihood <- likelihoods[1]; - unmodified.likelihood <- likelihoods[2] - Prob.No.Duplication <- exp(modified.likelihood - unmodified.likelihood) - # Check if probabilities greater than 1 are numerical error or bug - Phred <- function(prob){ - return(round(min(99, -10 * log10(1 - prob)))) - } - qualities[i, "NQDel"] <- Phred(Prob.No.Deletion) - qualities[i, "SQDel"] <- Phred(Prob.No.Duplication - Prob.All.Normal) - qualities[i, "NQDup"] <- Phred(Prob.No.Duplication) - qualities[i, "SQDup"] <- Phred(Prob.No.Deletion - Prob.All.Normal) - qualities[i, "INTERVAL"] <- interval - } - qualities <- as.data.frame(qualities, stringsAsFactors=F) - qualities$NQDel <- as.integer(qualities$NQDel) - qualities$NQDup <- as.integer(qualities$NQDup) - qualities$SQDel <- as.integer(qualities$SQDel) - qualities$SQDup <- as.integer(qualities$SQDup) - return(qualities) -} - -# returns data frame with distance to each target from the previous target -# (0 in the case of the first target on chromosome 1, a very big number -# for the first target on each other chromosome--this resets the HMM -# for each chromosome) -GetDistances <- function(counts){ - chromosome <- counts[, "chromosome"] - startbase <- counts[, "start"] - num.nonzero.exons <- length(startbase) - distances <- c(0, startbase[2:num.nonzero.exons] - - startbase[1:(num.nonzero.exons - 1)] + - 1000000000000 * (chromosome[2:num.nonzero.exons] - - chromosome[1:(num.nonzero.exons - 1)])) - return(data.frame(target=counts[, "target"], distance=distances)) -} - -EstimateVariance <- function(counts, ref.sample.names, sample.weights){ - library(Hmisc) - counts$var <- apply(counts[, ref.sample.names], 1, wtd.var, sample.weights, normwt=T) - set.seed(1) - counts.subset <- counts[sample(nrow(counts), min(36000, nrow(counts))), ] - library(mgcv) - # can't do gamma regression with negative - counts.subset$var[counts.subset$var==0] <- 0.1 - fit <- gam(var ~ s(mean) + s(gc), family=Gamma(link=log), data=counts.subset) - # we don't want variance less than Poisson - # we take maximum of genome-wide estimate, method of moments estimate - # and Poisson variance - v.estimate <- pmax(predict(fit, counts, type="response"), counts$var, - counts$mean * 1.01) - return(data.frame(target=counts$target, var.estimate=v.estimate)) -} - -EmissionProbs <- function(test.counts, target.means, - var.estimate, targets){ - num.targets <- length(test.counts) - # calculate the means for the deletion, normal and duplication states - state.target.means <- t(apply(data.frame(x=target.means), 1, function(x) c(x*1/2, x, x*3/2))) - # calculate the expected size (given the predicted variance) - size <- target.means ^ 2 / (var.estimate - target.means) - emission.probs <- matrix(NA, num.targets, 4) - colnames(emission.probs) <- c("target", "delprob", "normalprob", "dupprob") - # calculate the emission probabilities given the read count - size.del <- size - size.dup <- size - size.del <- size / 2 - size.dup <- size * 3 / 2 - emission.probs[, "delprob"] <- dnbinom( - test.counts, - mu=state.target.means[, 1], - size=size.del, log=T) - emission.probs[, "normalprob"] <- dnbinom( - test.counts, - mu=state.target.means[, 2], - size=size, log=T) - emission.probs[, "dupprob"] <- dnbinom( - test.counts, - mu=state.target.means[, 3], - size=size.dup, log=T) - emission.probs[, "target"] <- targets - # some values may be infinite as a result of extreme read count - row.all.inf <- which(apply(emission.probs, 1, function(x){all(is.infinite(x))})) - if (length(row.all.inf) > 0){ - for (i in row.all.inf){ - if (test.counts[i] >= state.target.means[i, 3]){ - emission.probs[i, 2:4] <- c(-Inf, -Inf, -0.01) - } - else if (test.counts[i] <= state.target.means[i, 1]){ - emission.probs[i, 2:4] <- c(-0.01, -Inf, -Inf) - } - else emission.probs[i, 2:4] <- c(-Inf, -0.01, -Inf) - } - } - return(emission.probs) -} - -# Viterbi algorithm -Viterbi <- function(emission.probs.matrix, distances, p, Tnum, D){ - targets <- emission.probs.matrix[, 1] - emission.probs.matrix <- as.matrix(emission.probs.matrix[, 2:4]) - num.exons <- dim(emission.probs.matrix)[1] - viterbi.matrix <- matrix(NA, nrow=num.exons, ncol=NUM.STATES) - viterbi.pointers <- matrix(NA, nrow=num.exons, ncol=NUM.STATES) - initial.state <- log(c(0.0075 / NUM.ABNORMAL.STATES, 1 - 0.0075, 0.0075 / NUM.ABNORMAL.STATES)) - viterbi.matrix[1, ] <- initial.state + emission.probs.matrix[1,] - for (i in 2:num.exons) { - temp.matrix <- viterbi.matrix[i - 1, ] + GetTransitionMatrix(distances$distance[i], p, Tnum, D) - viterbi.matrix[i, ] <- apply(temp.matrix, 2, max) - emission.probs <- c(emission.probs.matrix[i,]) - dim(emission.probs) <- c(NUM.STATES, 1) - viterbi.matrix[i, ] <- viterbi.matrix[i, ] + emission.probs - viterbi.pointers[i, ] <- apply(temp.matrix, 2, which.max) - } - viterbi.states = vector(length = num.exons) - viterbi.states[num.exons] = which.max(viterbi.matrix[num.exons, ]) - for (i in (num.exons - 1):1) { - viterbi.states[i] <- viterbi.pointers[i + 1, viterbi.states[i + 1]] - } - return(data.frame(target=targets, viterbi.state=viterbi.states)) -} - -# returns a transition matrix -# to state -# deletion normal duplication -# deletion -#from state normal -# duplication -GetTransitionMatrix <- function(distance, p, Tnum, D){ - q <- 1 / Tnum - f = exp(-distance/D) - prob.abnormal.abnormal <- f * (1 - q) + (1 - f) * p - prob.abnormal.normal <- f * q + (1 - f) * (1 - 2 * p) - prob.abnormal.diff.abnormal <- (1 - f) * p - prob.normal.normal <- 1 - 2 * p - prob.normal.abnormal <- p - transition.probs <- - c(prob.abnormal.abnormal, prob.abnormal.normal, prob.abnormal.diff.abnormal, - prob.normal.abnormal, prob.normal.normal, prob.normal.abnormal, - prob.abnormal.diff.abnormal, prob.abnormal.normal, prob.abnormal.abnormal) - transition.m = log(matrix(transition.probs, NUM.STATES, NUM.STATES, byrow=TRUE)) - return(transition.m) -} - -# adds two log-space probabilities using the identity -# log (p1 + p2) = log p1 + log(1 + exp(log p2 - log p1)) -AddTwoProbabilities <- function(x, y){ - if (is.infinite(x)) return (y) - if (is.infinite(y)) return (x) - sum.probs <- max(x, y) + log1p(exp(-abs(x - y))) -} - -# adds multiple log-space probabilities -SumProbabilities <- function(x){ - sum.probs <- x[1] - for (i in 2:length(x)){ - sum.probs <- AddTwoProbabilities(sum.probs, x[i]) - } - return(sum.probs) -} - -# finds the data likelihood by summing the product of the corresponding -# forward and backward probabilities at any token (should give the same value -# regardless of the token) -GetLikelihood <- function(forward.matrix, backward.matrix, x){ - SumProbabilities(forward.matrix[x, ] + backward.matrix[x, ]) -} - -# get the forward probabilities -GetForwardMatrix <- function(emission.probs.matrix, distances, p, Tnum, D){ - emission.probs.matrix <- as.matrix(emission.probs.matrix[, 2:4]) - num.exons <- dim(emission.probs.matrix)[1] - forward.matrix <- matrix(NA, nrow=num.exons, ncol=NUM.STATES) # matrix to hold forward probabilities - initial.state <- log(c(0.0075 / NUM.ABNORMAL.STATES, 1 - 0.0075, 0.0075 / NUM.ABNORMAL.STATES)) - forward.matrix[1, ] <- initial.state + emission.probs.matrix[1, ] - for (i in 2:num.exons){ - # compute matrix with probability we were in state j and are now in state i - # in temp.matrix[j, i] (ignoring emission of current token) - temp.matrix <- forward.matrix[i - 1, ] + GetTransitionMatrix(distances$distance[i], p, Tnum, D) - # find the probability that we are in each of the three states - sum.probs <- apply(temp.matrix, 2, SumProbabilities) - forward.matrix[i, ] <- sum.probs + emission.probs.matrix[i, ] - } - return(forward.matrix) -} - -# get the backward probabilities -GetBackwardMatrix <- function(emission.probs.matrix, distances, - p, Tnum, D){ - emission.probs.matrix <- as.matrix(emission.probs.matrix[, 2:4]) - num.exons <- dim(emission.probs.matrix)[1] - backward.matrix <- matrix(NA, nrow=num.exons, ncol=NUM.STATES) # matrix to hold backward probabilities - initial.state <- log(c(0.0075 / NUM.ABNORMAL.STATES, 1 - 0.0075, 0.0075 / NUM.ABNORMAL.STATES)) - backward.matrix[num.exons, ] <- rep(0, NUM.STATES) - for (i in (num.exons - 1):1){ - temp.matrix <- GetTransitionMatrix(distances$distance[i+1], p, Tnum, D) + - matrix(backward.matrix[i + 1, ], 3, 3, byrow=T) + - matrix(emission.probs.matrix[i+1, ], 3, 3, byrow=T) - backward.matrix[i, ] <- apply(temp.matrix, 1, SumProbabilities) - } - final.prob <- backward.matrix[1, ] + emission.probs.matrix[1, ] + initial.state - return(backward.matrix) -} - -# find the likelihood of the data given that certain states are disallowed -# between start target and end target -GetModifiedLikelihood <- function(forward.matrix, backward.matrix, emission.probs.matrix, distances, - start.target, end.target, disallowed.states, p, Tnum, D){ - targets <- emission.probs.matrix[, 1] - emission.probs.matrix <- as.matrix(emission.probs.matrix[, 2:4]) - # there may be missing targets in this sample, we genotype the largest stretch of - # targets that lie in the CNV - left.target <- min(which(targets >= start.target)) - right.target <- max(which(targets <= end.target)) - num.exons <- dim(emission.probs.matrix)[1] - unmodified.likelihood <- GetLikelihood(forward.matrix, - backward.matrix, min(right.target + 1, num.exons)) - #right.target or left.target may be empty - - #if (right.target >= left.target) return(c(NA, unmodified.likelihood)) - stopifnot(right.target >= left.target) - modified.emission.probs.matrix <- emission.probs.matrix - modified.emission.probs.matrix[left.target:right.target, - disallowed.states] <- -Inf - - # if the start target is the first target we need to recalculate the - # forward probabilities - # for that target, using the modified emission probabilities - if (left.target == 1){ - initial.state <- log(c(0.0075 / NUM.ABNORMAL.STATES, 1 - 0.0075, 0.0075 / NUM.ABNORMAL.STATES)) - forward.matrix[1, ] <- initial.state + modified.emission.probs.matrix[1, ] - left.target <- left.target + 1 - } - for (i in seq(left.target, min(right.target + 1, num.exons))){ - # compute matrix with probability we were in state j and are now in state i - # in temp.matrix[j, i] (ignoring emission of current token) - temp.matrix <- forward.matrix[i - 1, ] + GetTransitionMatrix(distances$distance[i], p, Tnum, D) - # find the probability that we are in each of the three states - sum.probs <- apply(temp.matrix, 2, SumProbabilities) - if (!i == (right.target + 1)){ - forward.matrix[i, ] <- sum.probs + modified.emission.probs.matrix[i, ] - } else{ - forward.matrix[i, ] <- sum.probs + emission.probs.matrix[i, ] - } - } - # find the modified likelihood of the sequence - modified.likelihood <- GetLikelihood(forward.matrix, backward.matrix, min(right.target + 1, num.exons)) - return(c(modified.likelihood, unmodified.likelihood)) -} - -SummarizeCNVs <- function(cnv.targets, counts, sample.name, state){ - sample.name <- sample.name - cnv.type <- ifelse(state==3, "DUP", "DEL") - cnv.start <- min(cnv.targets$target) - cnv.end <- max(cnv.targets$target) - cnv.chromosome <- counts[cnv.start, "chromosome"] - cnv.start.base <- counts[cnv.start, "start"] - cnv.start.target <- counts[cnv.start, "target"] - cnv.end.base <- counts[cnv.end, "end"] - cnv.end.target <- counts[cnv.end, "target"] - cnv.kbs <- (cnv.end.base - cnv.start.base) / 1000 - cnv.midbp <- round((cnv.end.base - cnv.start.base) / 2) + cnv.start.base - cnv.targets <- paste(cnv.start.target, "..", cnv.end.target, sep="") - cnv.interval <- paste(cnv.chromosome, ":", cnv.start.base, "-", cnv.end.base, sep="") - num.targets <- cnv.end.target - cnv.start.target + 1 - return(data.frame(sample.name=sample.name, cnv.type=cnv.type, cnv.interval=cnv.interval, - cnv.kbs=cnv.kbs, cnv.chromosome=cnv.chromosome, - cnv.midbp=cnv.midbp, cnv.targets=cnv.targets, num.targets=num.targets)) -} - -PrintCNVs <- function(test.sample.name, viterbi.state, - nonzero.counts){ - consecutiveGroups <- function(sequence){ - num <- length(sequence) - group <- 1 - groups <- rep(0, num) - groups[1] <- group - if (num > 1){ - for (i in 2:num){ - if (!sequence[i] == (sequence[i - 1] + 1)) group <- group + 1 - groups[i] <- group - } - } - return(groups) - } - num.duplications <- 0 - num.deletions <- 0 - for (state in c(1, 3)){ - cnv.targets <- which(viterbi.state$viterbi.state == state) - if (!length(cnv.targets) == 0){ - groups <- consecutiveGroups(cnv.targets) - library(plyr) - cnvs.temp.df <- ddply(data.frame(target=cnv.targets, group=groups), - "group", SummarizeCNVs, nonzero.counts, test.sample.name, - state) - if (state == 1){ - deletions.df <- cnvs.temp.df - if (!is.null(dim(deletions.df))){ - num.deletions <- dim(deletions.df)[1] - } - } else { - duplications.df <- cnvs.temp.df - if (!is.null(dim(duplications.df))){ - num.duplications <- dim(duplications.df)[1] - } - } - } - } - num.calls <- num.deletions + num.duplications - cat(num.calls, "CNVs called in sample", test.sample.name, "\n") - if (num.deletions == 0 & num.duplications == 0){ - df <- data.frame(SAMPLE=character(0), CNV=character(0), INTERVAL=character(0), - KB=numeric(0), CHR=character(0), - MID_BP=numeric(), TARGETS=character(0), NUM_TARG=numeric(0), Q_SOME=numeric(0), MLCN=numeric(0)) - return(df) - } - if (num.deletions > 0 & num.duplications > 0){ - cnvs.df <- rbind(deletions.df, duplications.df) - } else { - ifelse(num.deletions > 0, - cnvs.df <- deletions.df, cnvs.df <- duplications.df) - } - xcnv <- cbind(cnvs.df[, c("sample.name", "cnv.type", "cnv.interval", - "cnv.kbs", "cnv.chromosome", "cnv.midbp", - "cnv.targets", "num.targets")], 0) - colnames(xcnv) <- c("SAMPLE", "CNV", "INTERVAL", "KB", "CHR", "MID_BP", "TARGETS", - "NUM_TARG", "MLCN") - xcnv$Q_SOME <- NA - return(xcnv) -} - -CalcCopyNumber <- function(data, cnvs, homdel.mean){ - for (i in 1:nrow(cnvs)){ - cnv <- cnvs[i, ] - targets <- as.numeric(unlist(strsplit(as.character(cnv$TARGETS), "..", fixed=T))) - cnv.data <- subset(data, target >= targets[1] & target <= targets[2]) - state.target.means <- t(apply(data.frame(x=cnv.data$countsmean), 1, - function(x) c(C1=x*1/2, C2=x, C3=x*3/2, - C4=x * 2, C5=x * 5/2, C6=x*6/2))) - # calculate the expected size (given the predicted variance) - size <- cnv.data$countsmean ^ 2 / (cnv.data$varestimate - cnv.data$countsmean) - emission.probs <- matrix(NA, nrow(cnv.data), 7) - colnames(emission.probs) <- c("C0", "C1", "C2", "C3", "C4", "C5", "C6") - #colnames(emission.probs) <- c("target", "delprob", "normalprob", "dupprob") - # calculate the emission probabilities given the read count - emission.probs[, 1] <- dpois(cnv.data$sample, homdel.mean, log=T) - for (s in 1:6){ - size.state <- size * s/2 - emission.probs[, s+1] <- dnbinom(cnv.data$sample, mu=state.target.means[, s], - size=size.state, log=T) - } - cs <- colSums(emission.probs) - ml.state <- which.max(cs) - 1 - if (ml.state==2){ - ml.state <- ifelse(cnv$CNV=="DEL", 1, 3) - } - cnvs$MLCN[i] <- ml.state - } - return(cnvs) -} diff --git a/R/CANOESCOV/R/run_CANOESCOV.R b/R/CANOESCOV/R/run_CANOESCOV.R index 980f6b1..62e2cfd 100644 --- a/R/CANOESCOV/R/run_CANOESCOV.R +++ b/R/CANOESCOV/R/run_CANOESCOV.R @@ -1,53 +1,56 @@ library(methods) -library(CODEX) -run_CANOESCOV <- function(reference_set_select_method, - num_of_samples_in_reference_set, - cov_table){ +run_CANOESCOV <- function(input_cov_table, + input_bed, + reference_sample_set_file, + output_calls_file){ - sampname <- unique(cov_table[,"sample_name"]) - targets <- cov_table[,c("target_id", "chr", "pos_min", "pos_max")] - targets <- targets[!duplicated(targets[,"target_id"]),] - targets <- targets[with(targets, order(target_id)), ] + con <- file(reference_sample_set_file, open='r') + reference_sample_set <- readLines(con) + Y <- read.csv(input_cov_table) + sampname <- colnames(Y) + targets <- read.delim(input_bed) + rownames(Y) <- 1:nrow(Y) + rownames(targets) <- 1:nrow(targets) calls <- data.frame(matrix(nrow=0, ncol=13)) - chrs <- c(1:22, "X", "Y", paste0("chr",c(1:22, "X", "Y"))) - for(chr in chrs) { - targets_for_chr <- targets[targets[,"chr"] == chr,] - ref <- IRanges(start = targets_for_chr[,"pos_min"], end = targets_for_chr[,"pos_max"]) - if (length(ref) == 0) { # 0 elements for specified chromosome in bed - next() - } - Y <- coverageObj1(cov_table, sampname, targets_for_chr, chr)$Y - Y <- cbind(rep(chr, nrow(Y)), start(ref), end(ref), Y) - target_length <- c() - for (i in 1:nrow(Y)) { - target_length <- c(target_length, width(ref[i])) - } + chr <- targets[1,'chr'] + ref <- IRanges(start = targets[,"st_bp"], end = targets[,"ed_bp"]) + if (length(ref) == 0) { # 0 elements for specified chromosome in bed + next() + } + Y <- cbind(rep(chr, nrow(Y)), start(ref), end(ref), Y) + target_length <- c() + for (i in 1:nrow(Y)) { + target_length <- c(target_length, width(ref[i])) + } - # TODO better transformation - write.table(Y, file=paste('cov_', chr, '.tsv', sep=""), quote=FALSE, sep="\t", col.names = F, row.names = F) - canoes.reads <- read.table(paste('cov_', chr, '.tsv', sep="")) + # TODO better transformation + write.table(Y, file=paste('cov_', chr, '.tsv', sep=""), quote=FALSE, sep="\t", col.names = F, row.names = F) + canoes.reads <- read.table(paste('cov_', chr, '.tsv', sep="")) - gc <- getgc(chr, ref) - target <- seq(1, nrow(Y)) - canoes.reads <- cbind(target, gc, canoes.reads) - sampname <- as.vector(sampname) - names(canoes.reads) <- c("target", "gc", "chromosome", "start", "end", sampname) - colnames(canoes.reads) <- c("target", "gc", "chromosome", "start", "end", sampname) - write.table(as.data.frame(canoes.reads),file="canoes.reads.csv", quote=F, sep=",",row.names=T,col.names=T) - xcnv.list <- vector('list', length(sampname)) - for (i in 1:length(sampname)){ - xcnv.list[[i]] <- CANOESCOV::CallCNVs(sample.name=sampname[i], - counts=canoes.reads, - reference_set_select_method=reference_set_select_method, - num_of_samples_in_reference_set=num_of_samples_in_reference_set, - target_length=target_length) + gc <- getgc(chr, ref) + target <- seq(1, nrow(Y)) + canoes.reads <- cbind(target, gc, canoes.reads) + sampname <- as.vector(sampname) + names(canoes.reads) <- c("target", "gc", "chromosome", "start", "end", sampname) + colnames(canoes.reads) <- c("target", "gc", "chromosome", "start", "end", sampname) + write.table(as.data.frame(canoes.reads),file="canoes.reads.csv", quote=F, sep=",",row.names=T,col.names=T) + xcnv.list <- vector('list', length(sampname)) + for (i in 1:length(reference_sample_set)) { + if (reference_sample_set[[i]] == '') { + next() } - xcnvs <- do.call('rbind', xcnv.list) - if (nrow(calls)==0){calls <- matrix(nrow=0, ncol=ncol(xcnvs))} - calls <- rbind(calls, xcnvs) + samples <- unlist(strsplit(reference_sample_set[[i]], ',')) + actual_sample <- samples[1] + reference_samples <- samples[-1] + xcnv.list[[i]] <- CANOESCOV::CallCNVs(sample.name=actual_sample, + reference.samples=reference_samples, + counts=canoes.reads) } + xcnvs <- do.call('rbind', xcnv.list) + if (nrow(calls)==0){calls <- matrix(nrow=0, ncol=ncol(xcnvs))} + calls <- rbind(calls, xcnvs) # unify results format if (nrow(calls) != 0) { @@ -80,7 +83,7 @@ run_CANOESCOV <- function(reference_set_select_method, calls[colnames(calls) == 'ed_bp'] <- as.character(unlist(calls[colnames(calls) == 'ed_bp'])) calls[colnames(calls) == 'st_exon'] <- as.character(unlist(calls[colnames(calls) == 'st_exon'])) calls[colnames(calls) == 'ed_exon'] <- as.character(unlist(calls[colnames(calls) == 'ed_exon'])) - calls + write.csv(calls, output_calls_file, row.names=F) } # SAMPLE CNV INTERVAL KB CHR MID_BP TARGETS NUM_TARG MLCN Q_SOME diff --git a/R/CNV.SIMULATOR/DESCRIPTION b/R/CNV.SIMULATOR/DESCRIPTION new file mode 100644 index 0000000..0acb0de --- /dev/null +++ b/R/CNV.SIMULATOR/DESCRIPTION @@ -0,0 +1,18 @@ +Package: CNV.SIMULATOR +Title: CNV.SIMULATOR A Package To Generate Artificial CNVs +Version: 0.0.1 +Authors@R: c( + person("Tomasz", "Gambin", email = "tgambin@gmail.com", role = c("aut", "cre")), + person("Marek", "Wiewiórka", email = "marek.wiewiorka@gmail.com", role = c("aut")), + person("Wiktor", "Kuśmirek", email = "kusmirekwiktor@gmail.com", role = c("aut"))) +Description: An package to generate artificial CNVs. +Depends: + R (>= 3.2.3), + plyr (>= 1.8.4), + nnls (>= 1.4.0), + Hmisc (>= 4.0.0), + mgcv (>= 1.8.0) +License: GPL-3 +Encoding: UTF-8 +LazyData: true +RoxygenNote: 6.0.1.9000 diff --git a/R/CNV.SIMULATOR/NAMESPACE b/R/CNV.SIMULATOR/NAMESPACE new file mode 100644 index 0000000..884a631 --- /dev/null +++ b/R/CNV.SIMULATOR/NAMESPACE @@ -0,0 +1,2 @@ +# Generated by roxygen2: fake comment so roxygen2 overwrites silently. +exportPattern("^[^\\.]") diff --git a/R/CNV.SIMULATOR/R/functions_CNV.SIMULATOR.R b/R/CNV.SIMULATOR/R/functions_CNV.SIMULATOR.R new file mode 100644 index 0000000..779ba29 --- /dev/null +++ b/R/CNV.SIMULATOR/R/functions_CNV.SIMULATOR.R @@ -0,0 +1,120 @@ + +# build_intersection_matrix <- function(calls, refs){ +# intersection_matrix <- matrix(data=as.integer(0), nrow = nrow(calls), ncol = nrow(refs)) +# if (nrow(intersection_matrix) > 0 && ncol(intersection_matrix) > 0) { +# for (i in 1:nrow(intersection_matrix)) { +# for (j in 1:ncol(intersection_matrix)) { +# if (as.character(calls[i,"sample_name"]) == as.character(refs[j,"sample_name"]) && +# as.character(calls[i,"chr"]) == as.character(refs[j,"chr"]) && +# as.character(calls[i,"cnv"]) == as.character(refs[j,"cnv"])) { +# overlap_length <- calc_overlap_length(calls[i,"st_bp"], +# calls[i,"ed_bp"], +# refs[j,"st_bp"], +# refs[j,"ed_bp"]) +# call_length <- calls[i,"ed_bp"] - calls[i,"st_bp"] +# ref_length <- refs[j,"ed_bp"] - refs[j,"st_bp"] +# overlap_factor <- overlap_length / ((call_length + ref_length) / 2) * 100 +# intersection_matrix[i,j] <- round(overlap_factor, 2) +# } +# } +# } +# } +# intersection_matrix +# } +# +# filter_intersection_matrix_by_overlap_factor <- function(intersection_matrix, min_overlap_factor){ +# if (nrow(intersection_matrix) > 0 && ncol(intersection_matrix) > 0) { +# for (i in 1:nrow(intersection_matrix)) { +# for (j in 1:ncol(intersection_matrix)) { +# if (intersection_matrix[i,j] < min_overlap_factor) { +# intersection_matrix[i,j] <- 0.00 +# } +# } +# } +# } +# intersection_matrix +# } +# +# calc_number_of_different_copy_number_for_cnv <- function(cnv, calls){ +# copy_no <- c() +# for (i in 1:nrow(calls)) { +# if (as.character(calls[i,"chr"]) == as.character(cnv[1,"chr"]) && +# calls[i,"st_bp"] == cnv[1,"st_bp"] && +# calls[i,"ed_bp"] == cnv[1,"ed_bp"] && +# !is.na(calls[i,"copy_no"])) { +# copy_no <- c(copy_no, calls[i,"copy_no"]) +# } +# } +# length(unique(copy_no)) +# } +# +# calc_NA_rate_for_cnv <- function(cnv, calls){ +# num_of_samples <- length(unique(calls[,"sample_name"])) +# num_of_NA <- 0 +# for (i in 1:nrow(calls)) { +# if (as.character(calls[i,"chr"]) == as.character(cnv[1,"chr"]) && +# calls[i,"st_bp"] == cnv[1,"st_bp"] && +# calls[i,"ed_bp"] == cnv[1,"ed_bp"] && +# is.na(calls[i,"cnv"])) { +# num_of_NA <- num_of_NA + 1 +# } +# } +# round(num_of_NA / num_of_samples, 2) +# } +# +# calc_cnv_frequency <- function(cnv, calls){ +# num_of_samples <- length(unique(calls[,"sample_name"])) +# num_of_same_cnv <- 0 +# for (i in 1:nrow(calls)) { +# if (as.character(calls[i,"chr"]) == as.character(cnv[1,"chr"]) && +# calls[i,"st_bp"] == cnv[1,"st_bp"] && +# calls[i,"ed_bp"] == cnv[1,"ed_bp"] && +# as.character(calls[i,"cnv"]) == as.character(cnv[1,"cnv"])) { +# num_of_same_cnv <- num_of_same_cnv + 1 +# } +# } +# round(num_of_same_cnv / num_of_samples, 2) +# } +# +# calc_overlap_length <- function(min1, max1, min2, max2){ +# overlap_length <- max(0, min(max1, max2) - max(min1, min2)) +# overlap_length +# } +# +# calc_quality_statistics <- function(TP, FP, TN, FN){ +# sensitivity <- if (TP + FN > 0) TP / (TP + FN) else 0 +# specificity <- if (TN + FP > 0) TN / (TN + FP) else 0 +# precision <- if (TP + FP > 0) TP / (TP + FP) else 0 +# accuracy <- if (TP + TN + FP + FN > 0) (TP + TN) / (TP + TN + FP + FN) else 0 +# return(list(sensitivity=round(sensitivity, digits=3), +# specificity=round(specificity, digits=3), +# precision=round(precision, digits=3), +# accuracy=round(accuracy, digits=3))) +# } +# +# calc_confusion_matrix <- function(intersection_matrix, num_of_original_targets_in_refs, num_of_original_samples_in_refs){ +# # TP +# TP <- 0 +# if (nrow(intersection_matrix) > 0) { +# for (i in 1:nrow(intersection_matrix)) { +# if (sum(intersection_matrix[i,] != 0) != 0) { +# TP <- TP + 1 +# } +# } +# } +# # FP +# FP <- nrow(intersection_matrix) - TP +# # FN +# FN <- 0 +# if (ncol(intersection_matrix) > 0) { +# for (j in 1:ncol(intersection_matrix)) { +# if (sum(intersection_matrix[,j] != 0) == 0) { +# FN <- FN + 1 +# } +# } +# } +# # TN +# TN <- (num_of_original_targets_in_refs * num_of_original_samples_in_refs) - FN +# return(list(TP=TP, FP=FP, TN=TN, FN=FN)) +# } +# diff --git a/R/CNV.SIMULATOR/R/run_CNV.SIMULATOR.R b/R/CNV.SIMULATOR/R/run_CNV.SIMULATOR.R new file mode 100644 index 0000000..04ba8d1 --- /dev/null +++ b/R/CNV.SIMULATOR/R/run_CNV.SIMULATOR.R @@ -0,0 +1,63 @@ +run_CNV.SIMULATOR <- function(input_cov_table, + input_bed, + input_males, + input_females, + output_cov_table, + output_generated_cnvs, + number_of_cnvs_per_sample, + min_number_of_regions, + max_number_of_regions, + simulation_mode){ + + + Y <- read.csv(input_cov_table) + sampname <- colnames(Y) + targets <- read.delim(input_bed) + males <- as.character(unlist(read.table(input_males, sep = ","))) + females <- as.character(unlist(read.table(input_females, sep = ","))) + generated_cnvs <- matrix(nrow=0, ncol=6) + colnames(generated_cnvs) <- c('sample_name','cnv','chr','st_bp','ed_bp','copy_no') + if (simulation_mode == "downsample") { + downsample_factor <- 0.5 + for (sample in sampname) { + print(paste("Generating arficial CNVs in sample: ", sample, sep="")) + for (i in 1:number_of_cnvs_per_sample) { + cnv_length <- floor(runif(1, min=min_number_of_regions, max=max_number_of_regions)) + cnv_start <- floor(runif(1, min=1, max=nrow(targets))) + for (j in cnv_start:(min(cnv_start+cnv_length-1,nrow(targets)))) { + Y[j,sample] <- floor(Y[j,sample]*downsample_factor) + } + print(paste(sample, targets[cnv_start,1], targets[cnv_start,2], targets[cnv_start+cnv_length,3], sep=" ")) + generated_cnvs <- rbind(generated_cnvs, matrix(c(sample, 'del', as.character(targets[cnv_start,1]), targets[cnv_start,2], targets[cnv_start+cnv_length,3], '1'), nrow = 1)) + } + } + write.csv(Y[,males], paste(output_cov_table, ".males", sep=""), row.names=F, quote=F) + write.csv(Y[,females], paste(output_cov_table, ".females", sep=""), row.names=F, quote=F) + } else if (simulation_mode == "replace") { + Y_males <- Y[,males] + Y_females <- Y[,females] + for (female in females) { + print(paste("Generating arficial CNVs in sample: ", female, sep="")) + cov <- cor(Y[,female], Y[,males]) + covariances <- cov[1,males] + male <- names(sort(covariances, decreasing=T)[1:min(1, length(covariances))]) + #male <- males[floor(runif(1, min=1, max=length(males)))] # random male sample - in Ximmer tool + for (i in 1:number_of_cnvs_per_sample) { + cnv_length <- floor(runif(1, min=min_number_of_regions, max=max_number_of_regions)) + cnv_start <- floor(runif(1, min=1, max=nrow(targets))) + for (j in cnv_start:(min(cnv_start+cnv_length-1,nrow(targets)))) { + Y_females[j,female] <- Y_males[j,male] + Y[j,female] <- Y[j,male] + } + print(paste(female, targets[cnv_start,1], targets[cnv_start,2], targets[cnv_start+cnv_length,3], sep=" ")) + generated_cnvs <- rbind(generated_cnvs, matrix(c(female, 'del', as.character(targets[cnv_start,1]), targets[cnv_start,2], targets[cnv_start+cnv_length,3], '1'), nrow = 1)) + } + } + write.csv(Y_males, paste(output_cov_table, ".males", sep=""), row.names=F, quote=F) + write.csv(Y_females, paste(output_cov_table, ".females", sep=""), row.names=F, quote=F) + } else { + print("Choose proper simulation mode!!!") + } + write.csv(Y, output_cov_table, row.names=F, quote=F) + write.csv(generated_cnvs, output_generated_cnvs, row.names=F, quote=F) +} diff --git a/R/CNV.SIMULATOR/inst/simulate_cnvs.R b/R/CNV.SIMULATOR/inst/simulate_cnvs.R new file mode 100755 index 0000000..88c58d8 --- /dev/null +++ b/R/CNV.SIMULATOR/inst/simulate_cnvs.R @@ -0,0 +1,61 @@ +#!/usr/bin/env Rscript +options(java.parameters = "-Xmx1512m") +library(devtools) +library('CNV.SIMULATOR') +library(optparse) +if (length(which(installed.packages()[,1] == "stringr")) == 0){install.packages("stringr",repos="https://cloud.r-project.org/")} +library(stringr) + +option_list <- list( + make_option("--input_cov_table", default="public.runner_calls", + help="Calls table. [default %default]"), + make_option("--input_bed", default="public.runner_calls", + help="Calls table. [default %default]"), + make_option("--input_males", default="public.runner_calls", + help="Calls table. [default %default]"), + make_option("--input_females", default="public.runner_calls", + help="Calls table. [default %default]"), + make_option("--output_cov_table", default="public.runner_calls", + help="Calls table. [default %default]"), + make_option("--output_generated_cnvs", default="public.runner_calls", + help="Calls table. [default %default]"), + make_option("--min_number_of_cnvs_per_sample", default="public.runner_calls", + help="Calls table. [default %default]"), + make_option("--min_number_of_regions", default="public.runner_calls", + help="Calls table. [default %default]"), + make_option("--max_number_of_regions", default="public.runner_calls", + help="Calls table. [default %default]"), + make_option("--simulation_mode", default="1", + help="Calls table. [default %default]") +) +opt <- parse_args(OptionParser(option_list=option_list)) + +simulate_cnvs <- function(parameters, cov_table){ + simulated_cnvs <- run_CNV.SIMULATOR(input_cov_table, + input_bed, + input_males, + input_females, + output_cov_table, + output_generated_cnvs, + min_number_of_cnvs_per_sample, + min_number_of_regions, + max_number_of_regions, + simulation_mode + ) + simulated_cnvs +} + +simulated_cnvs <- simulate_cnvs(opt$input_cov_table, + opt$input_bed, + opt$input_males, + opt$input_females, + opt$output_cov_table, + opt$output_generated_cnvs, + opt$min_number_of_cnvs_per_sample, + opt$min_number_of_regions, + opt$max_number_of_regions, + opt$simulation_mode +) +print(simulated_cnvs) + + diff --git a/R/CNVCALLER.EVALUATOR/DESCRIPTION b/R/CNVCALLER.EVALUATOR/DESCRIPTION index f16487e..5dda4e9 100644 --- a/R/CNVCALLER.EVALUATOR/DESCRIPTION +++ b/R/CNVCALLER.EVALUATOR/DESCRIPTION @@ -9,7 +9,7 @@ Description: A package to evaluate CNV callers results. Depends: R (>= 3.2.3), devtools (>= 1.13.2), - DBI (== 0.8), + DBI (>= 0.8), optparse (== 1.4.4) License: GPL-3 Encoding: UTF-8 diff --git a/R/CNVCALLER.RUNNER/DESCRIPTION b/R/CNVCALLER.RUNNER/DESCRIPTION index aa62f1f..1f68fdc 100644 --- a/R/CNVCALLER.RUNNER/DESCRIPTION +++ b/R/CNVCALLER.RUNNER/DESCRIPTION @@ -13,7 +13,7 @@ Depends: EXOMEDEPTHCOV (>= 0.0.1), CANOESCOV (>= 0.0.1), devtools (>= 1.13.2), - DBI (== 0.8), + DBI (>= 0.8), optparse (== 1.4.4) License: GPL-3 Encoding: UTF-8 diff --git a/R/CODEXCOV/CODEXCOV.Rproj b/R/CODEXCOV/CODEXCOV.Rproj deleted file mode 100755 index d848a9f..0000000 --- a/R/CODEXCOV/CODEXCOV.Rproj +++ /dev/null @@ -1,16 +0,0 @@ -Version: 1.0 - -RestoreWorkspace: No -SaveWorkspace: No -AlwaysSaveHistory: Default - -EnableCodeIndexing: Yes -Encoding: UTF-8 - -AutoAppendNewline: Yes -StripTrailingWhitespace: Yes - -BuildType: Package -PackageUseDevtools: Yes -PackageInstallArgs: --no-multiarch --with-keep.source -PackageRoxygenize: rd,collate,namespace diff --git a/R/CODEXCOV/DESCRIPTION b/R/CODEXCOV/DESCRIPTION index d9e2385..aacb351 100755 --- a/R/CODEXCOV/DESCRIPTION +++ b/R/CODEXCOV/DESCRIPTION @@ -12,10 +12,9 @@ Description: An extended implementation of the CODEX package in R. It extends Depends: R (>= 3.2.3), devtools (>= 1.13.2), - DBI (== 0.8), + DBI (>= 0.8), optparse (== 1.4.4), - CODEX (>= 1.8.0), - REFERENCE.SAMPLE.SET.SELECTOR (>= 0.0.1) + CODEX (>= 1.8.0) License: GPL-3 Encoding: UTF-8 LazyData: true diff --git a/R/CODEXCOV/R/functions_CODEXCOV.R b/R/CODEXCOV/R/functions_CODEXCOV.R index 3f7522c..25ad2de 100644 --- a/R/CODEXCOV/R/functions_CODEXCOV.R +++ b/R/CODEXCOV/R/functions_CODEXCOV.R @@ -1,26 +1,5 @@ library(CODEX) -#' Function Dexcription -#' -#' Function description. -#' @param cov_file -#' @param sampname -#' @keywords -#' @export -#' @examples -#' coverageObj1 -coverageObj1 <- function(cov_table, sampname, targets_for_chr, chr){ - Y <- matrix(data=as.integer(0), nrow = nrow(targets_for_chr), ncol = 0) - for(sample in sampname) { - cov_targets_for_sample <- cov_table[cov_table[,"sample_name"] == sample,] - cov_targets_for_sample <- cov_targets_for_sample[with(cov_targets_for_sample, order(target_id)), ] - Y <- cbind(Y, cov_targets_for_sample[,"read_count"]) - } - colnames(Y) <- sampname - rownames(Y) <- targets_for_chr[,"target_id"] - return(list(Y=Y)) -} - #' Function Dexcription #' #' Function description. diff --git a/R/CODEXCOV/R/run_CODEXCOV.R b/R/CODEXCOV/R/run_CODEXCOV.R index 78929de..e5565ae 100644 --- a/R/CODEXCOV/R/run_CODEXCOV.R +++ b/R/CODEXCOV/R/run_CODEXCOV.R @@ -9,77 +9,57 @@ run_CODEXCOV <- function(K_from, K_to, lmax, - reference_set_select_method, - num_of_samples_in_reference_set, - cov_table){ - - sampname <- unique(cov_table[,"sample_name"]) - sampname <- as.character(sampname) - targets <- cov_table[,c("target_id", "chr", "pos_min", "pos_max")] - targets <- targets[!duplicated(targets[,"target_id"]),] - targets <- targets[with(targets, order(target_id)), ] + input_cov_table, + input_bed, + reference_sample_set_file, + output_calls_file){ + + con <- file(reference_sample_set_file, open='r') + reference_sample_set <- readLines(con) + Y <- read.csv(input_cov_table) + targets <- read.delim(input_bed) + rownames(Y) <- 1:nrow(Y) + rownames(targets) <- 1:nrow(targets) finalcall <- matrix(nrow=0, ncol=13) - chrs <- c(1:22, "X", "Y", paste0("chr",c(1:22, "X", "Y"))) - - for(chr in chrs) { - targets_for_chr <- targets[targets[,"chr"] == chr,] - ref <- IRanges(start = targets_for_chr[,"pos_min"], end = targets_for_chr[,"pos_max"]) - if (length(ref) == 0) { # 0 elements for specified chromosome in bed + ref <- IRanges(start = targets[,"st_bp"], end = targets[,"ed_bp"]) + + ################################################### + ### code chunk number 5: gcmapp1 + ################################################### + gcmapp1_result <- gcmapp1(targets[1,'chr'], ref) + gc <- gcmapp1_result$gc + + for (i in 1:length(reference_sample_set)) { + if (reference_sample_set[[i]] == '') { next() } - ################################################### - ### code chunk number 4: coverageObj1 - ################################################### - Y <- coverageObj1(cov_table, sampname, targets_for_chr, chr)$Y - - ################################################### - ### code chunk number 5: gcmapp1 - ################################################### - gcmapp1_result <- gcmapp1(chr, ref) - gc <- gcmapp1_result$gc + samples <- unlist(strsplit(reference_sample_set[[i]], ',')) + actual_sample <- samples[1] + reference_samples <- samples[-1] + samples <- sort(samples) + Y_subset <- as.matrix(Y[,samples]) ################################################### ### code chunk number 7: normObj1 ################################################### - normObj_result <- normObj1(Y, gc, K = K_from:K_to) + normObj_result <- normObj1(Y_subset, gc, K = K_from:K_to) Yhat <- normObj_result$Yhat AIC <- normObj_result$AIC BIC <- normObj_result$BIC RSS <- normObj_result$RSS K <- normObj_result$K - - ################################################### - ### code chunk number 8: normObj2 (eval = FALSE) - ################################################### - ## normObj_result <- normObj2(Y, gc, K = 1:9, normal_index=seq(1,45,2)) - ## Yhat <- normObj_result$Yhat - ## AIC <- normObj_result$AIC - ## BIC <- normObj_result$BIC - ## RSS <- normObj_result$RSS - ## K <- normObj_result$K - - ################################################### - ### code chunk number 9: choiceofK (eval = FALSE) - ################################################### - #choiceofK(AIC, BIC, RSS, K, filename = paste("choiceofK_", chr, ".pdf", sep = "")) - - ################################################### - ### code chunk number 10: fig1 - ################################################### - #plot(K, RSS, type = "b", xlab = "Number of latent variables") - #plot(K, AIC, type = "b", xlab = "Number of latent variables") - #plot(K, BIC, type = "b", xlab = "Number of latent variables") - + ################################################### ### code chunk number 11: segment1 ################################################### - finalcallIt <- segment1(Y, Yhat, K[which.max(BIC)], K, sampname, - ref, chr, lmax, mode = "integer")$finalcall - if (nrow(finalcall)==0){finalcall <- matrix(nrow=0, ncol=ncol(finalcallIt))} + finalcallIt <- segment1(Y_subset, Yhat, K[which.max(BIC)], K, samples, + ref, targets[1,'chr'], lmax, mode = "integer")$finalcall + finalcallIt <- finalcallIt[finalcallIt[,"sample_name"] == actual_sample,] + if (nrow(finalcall)==0){finalcall <- matrix(nrow=0, ncol=ncol(finalcallIt))} finalcall <- rbind(finalcall, finalcallIt) - + print(finalcall) } finalcall <- unify_calls_format(finalcall)$finalcall - finalcall + write.csv(finalcall, output_calls_file, row.names=F) } diff --git a/R/CODEXCOV/man/coverageObj1.Rd b/R/CODEXCOV/man/coverageObj1.Rd deleted file mode 100644 index f6c6b1c..0000000 --- a/R/CODEXCOV/man/coverageObj1.Rd +++ /dev/null @@ -1,14 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/functions_CODEXCOV.R -\name{coverageObj1} -\alias{coverageObj1} -\title{Function Dexcription} -\usage{ -coverageObj1(cov_file, sampname) -} -\description{ -Function description. -} -\examples{ -coverageObj1 -} diff --git a/R/CODEXCOV/man/gcmapp1.Rd b/R/CODEXCOV/man/gcmapp1.Rd deleted file mode 100644 index 2fa53f1..0000000 --- a/R/CODEXCOV/man/gcmapp1.Rd +++ /dev/null @@ -1,14 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/functions_CODEXCOV.R -\name{gcmapp1} -\alias{gcmapp1} -\title{Function Dexcription} -\usage{ -gcmapp1(chr, ref) -} -\description{ -Function description. -} -\examples{ -coverageObj1 -} diff --git a/R/CODEXCOV/man/normObj1.Rd b/R/CODEXCOV/man/normObj1.Rd deleted file mode 100644 index 66b0a96..0000000 --- a/R/CODEXCOV/man/normObj1.Rd +++ /dev/null @@ -1,14 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/functions_CODEXCOV.R -\name{normObj1} -\alias{normObj1} -\title{Function Dexcription} -\usage{ -normObj1(Y_qc, gc_qc, K) -} -\description{ -Function description. -} -\examples{ -coverageObj1 -} diff --git a/R/CODEXCOV/man/normObj2.Rd b/R/CODEXCOV/man/normObj2.Rd deleted file mode 100644 index 4a10d47..0000000 --- a/R/CODEXCOV/man/normObj2.Rd +++ /dev/null @@ -1,14 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/functions_CODEXCOV.R -\name{normObj2} -\alias{normObj2} -\title{Function Dexcription} -\usage{ -normObj2(Y_qc, gc_qc, K, normal_index) -} -\description{ -Function description. -} -\examples{ -coverageObj1 -} diff --git a/R/CODEXCOV/man/qcObj1.Rd b/R/CODEXCOV/man/qcObj1.Rd deleted file mode 100644 index 806a098..0000000 --- a/R/CODEXCOV/man/qcObj1.Rd +++ /dev/null @@ -1,15 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/functions_CODEXCOV.R -\name{qcObj1} -\alias{qcObj1} -\title{Function Dexcription} -\usage{ -qcObj1(Y, sampname, chr, ref, mapp, gc, cov_thresh, length_thresh, mapp_thresh, - gc_thresh) -} -\description{ -Function description. -} -\examples{ -coverageObj1 -} diff --git a/R/CODEXCOV/man/run_CODEXCOV.Rd b/R/CODEXCOV/man/run_CODEXCOV.Rd deleted file mode 100644 index f80759e..0000000 --- a/R/CODEXCOV/man/run_CODEXCOV.Rd +++ /dev/null @@ -1,14 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/run_CODEXCOV.R -\name{run_CODEXCOV} -\alias{run_CODEXCOV} -\title{Function Dexcription} -\usage{ -run_CODEXCOV(cov_file, sampname) -} -\description{ -Function description. -} -\examples{ -run_codexcov -} diff --git a/R/CODEXCOV/man/segment1.Rd b/R/CODEXCOV/man/segment1.Rd deleted file mode 100644 index c7cf654..0000000 --- a/R/CODEXCOV/man/segment1.Rd +++ /dev/null @@ -1,14 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/functions_CODEXCOV.R -\name{segment1} -\alias{segment1} -\title{Function Dexcription} -\usage{ -segment1(Y_qc, Yhat, optK, K, sampname_qc, ref_qc, chr, lmax, mode) -} -\description{ -Function description. -} -\examples{ -coverageObj1 -} diff --git a/R/EXOMECOPYCOV/DESCRIPTION b/R/EXOMECOPYCOV/DESCRIPTION new file mode 100644 index 0000000..1c27c1d --- /dev/null +++ b/R/EXOMECOPYCOV/DESCRIPTION @@ -0,0 +1,22 @@ +Package: EXOMECOPYCOV +Title: EXOMECOPY Package With Interface To External Coverage File +Version: 0.0.1 +Authors@R: c( + person("Tomasz", "Gambin", email = "tgambin@gmail.com", role = c("aut", "cre")), + person("Marek", "Wiewiórka", email = "marek.wiewiorka@gmail.com", role = c("aut")), + person("Wiktor", "Kuśmirek", email = "kusmirekwiktor@gmail.com", role = c("aut"))) +Description: An extended implementation of the exomeCopy package in R. It extends + original implementation by using external coverage file, which should + speed up calculations for running application with multiple sets of input + parameters. +Depends: + R (>= 3.2.3), + devtools (>= 1.13.2), + DBI (>= 0.8), + optparse (== 1.4.4), + IRanges (>= 2.0.0), + exomeCopy (== 1.22) +License: GPL-3 +Encoding: UTF-8 +LazyData: true +RoxygenNote: 6.0.1.9000 diff --git a/R/EXOMECOPYCOV/NAMESPACE b/R/EXOMECOPYCOV/NAMESPACE new file mode 100644 index 0000000..884a631 --- /dev/null +++ b/R/EXOMECOPYCOV/NAMESPACE @@ -0,0 +1,2 @@ +# Generated by roxygen2: fake comment so roxygen2 overwrites silently. +exportPattern("^[^\\.]") diff --git a/R/EXOMECOPYCOV/R/functions_EXOMECOPYCOV.R b/R/EXOMECOPYCOV/R/functions_EXOMECOPYCOV.R new file mode 100644 index 0000000..bcfdd0d --- /dev/null +++ b/R/EXOMECOPYCOV/R/functions_EXOMECOPYCOV.R @@ -0,0 +1,34 @@ + +# from CODEX package +getgc <- function(chr, ref) { + library(GenomeInfoDb) + library(BSgenome.Hsapiens.UCSC.hg19) + if (chr == "X" | chr == "x" | chr == "chrX" | chr == "chrx") { + chrtemp <- 23 + } else if (chr == "Y" | chr == "y" | chr == "chrY" | chr == "chry") { + chrtemp <- 24 + } else { + chrtemp <- as.numeric(mapSeqlevels(as.character(chr), "NCBI")[1]) + } + if (length(chrtemp) == 0) + message("Chromosome cannot be found in NCBI Homo sapiens database!") + chrm <- unmasked(Hsapiens[[chrtemp]]) + seqs <- Views(chrm, ref) + af <- alphabetFrequency(seqs, baseOnly = TRUE, as.prob = TRUE) + gc <- round((af[, "G"] + af[, "C"]) * 100,2) + gc +} + +unify_calls_format <- function(compiled.segments, chr){ + calls <- matrix(nrow=length(compiled.segments$sample.name), ncol=7) + colnames(calls) <- c('sample_name', 'chr', 'st_bp', 'ed_bp', 'cnv', 'copy_no', 'log_odds') + calls[,'sample_name'] <- compiled.segments$sample.name + calls[,'chr'] <- rep(chr, nrow(calls)) + calls[,'st_bp'] <- unlist(start(ranges(compiled.segments))) + calls[,'ed_bp'] <- unlist(end(ranges(compiled.segments))) + calls[,'copy_no'] <- compiled.segments$copy.count + calls[,'cnv'] <- ifelse(calls[,'copy_no'] > 2, 'dup', 'del') + calls[,'log_odds'] <- compiled.segments$log.odds + calls <- subset(calls, calls[,'copy_no'] != "2") + return(list(calls=calls)) +} diff --git a/R/EXOMECOPYCOV/R/run_EXOMECOPYCOV.R b/R/EXOMECOPYCOV/R/run_EXOMECOPYCOV.R new file mode 100644 index 0000000..af45307 --- /dev/null +++ b/R/EXOMECOPYCOV/R/run_EXOMECOPYCOV.R @@ -0,0 +1,59 @@ +library(methods) +library(exomeCopy) + +run_EXOMECOPYCOV <- function(input_cov_table, + input_bed, + reference_sample_set_file, + output_calls_file){ + + con <- file(reference_sample_set_file, open='r') + reference_sample_set <- readLines(con) + Y <- read.csv(input_cov_table) + targets <- read.delim(input_bed) + rownames(Y) <- 1:nrow(Y) + rownames(targets) <- 1:nrow(targets) + chr <- targets[1,'chr'] + ref <- IRanges(start = targets[,"st_bp"], end = targets[,"ed_bp"]) + if (length(ref) == 0) { # 0 elements for specified chromosome in bed + next() + } + target <- GRanges(seqname = chr, IRanges(start = start(ref) + 1, end = end(ref))) + gc <- getgc(chr, ref) + + rdata_org <- RangedData(IRanges(start=start(ref), end=end(ref)), space=rep(chr,nrow(Y)), universe="hg19", gc=gc, gc.sq=gc^2) + finalcall <- matrix(nrow=0, ncol=13) + + for (i in 1:length(reference_sample_set)) { + if (reference_sample_set[[i]] == '') { + next() + } + samples <- unlist(strsplit(reference_sample_set[[i]], ',')) + actual_sample <- samples[1] + reference_samples <- samples[-1] + samples <- sort(samples) + rdata <- rdata_org + + for(sample.name in samples) { + rdata[[sample.name]] <- Y[,sample.name] + } + + rdata[["bg"]] <- generateBackground(samples, rdata, median) + rdata[["log.bg"]] <- log(rdata$bg + .1) + rdata[["width"]] <- width(ref) + + samples <- c(actual_sample) + fit.list <- lapply(samples, function(sample.name) { + lapply(seqlevels(target), function(seq.name) { + print(paste("Processing sample: ", sample.name, sep="")) + exomeCopy(rdata, sample.name, X.names = c("log.bg", "gc", "gc.sq", "width"), S = 0:4, d = 2) + }) + }) + compiled.segments <- compileCopyCountSegments(fit.list) + finalcallIt <- unify_calls_format(compiled.segments, chr)$calls + if (nrow(finalcall)==0){finalcall <- matrix(nrow=0, ncol=ncol(finalcallIt))} + finalcall <- rbind(finalcall, finalcallIt) + print(finalcallIt) + + } + write.csv(finalcall, output_calls_file, row.names=F) +} diff --git a/R/EXOMEDEPTHCOV/DESCRIPTION b/R/EXOMEDEPTHCOV/DESCRIPTION index d4a3025..4627623 100644 --- a/R/EXOMEDEPTHCOV/DESCRIPTION +++ b/R/EXOMEDEPTHCOV/DESCRIPTION @@ -12,11 +12,10 @@ Description: An extended implementation of the ExomeDepth package in R. It exten Depends: R (>= 3.2.3), devtools (>= 1.13.2), - DBI (== 0.8), + DBI (>= 0.8), optparse (== 1.4.4), IRanges (>= 2.0.0), ExomeDepth (>= 1.1.10), - REFERENCE.SAMPLE.SET.SELECTOR (>= 0.0.1) License: GPL-3 Encoding: UTF-8 LazyData: true diff --git a/R/EXOMEDEPTHCOV/R/functions_EXOMEDEPTHCOV.R b/R/EXOMEDEPTHCOV/R/functions_EXOMEDEPTHCOV.R index 2b38d6b..e69de29 100644 --- a/R/EXOMEDEPTHCOV/R/functions_EXOMEDEPTHCOV.R +++ b/R/EXOMEDEPTHCOV/R/functions_EXOMEDEPTHCOV.R @@ -1,15 +0,0 @@ -library(ExomeDepth) - -coverageObj1 <- function(cov_table, sampname, targets_for_chr, chr){ - Y <- matrix(data=as.integer(0), nrow = nrow(targets_for_chr), ncol = 0) - for(sample in sampname) { - cov_targets_for_sample <- cov_table[cov_table[,"sample_name"] == sample,] - cov_targets_for_sample <- cov_targets_for_sample[with(cov_targets_for_sample, order(target_id)), ] - Y <- cbind(Y, cov_targets_for_sample[,"read_count"]) - } - colnames(Y) <- sampname - rownames(Y) <- targets_for_chr[,"target_id"] - return(list(Y=Y)) -} - - diff --git a/R/EXOMEDEPTHCOV/R/run_EXOMEDEPTHCOV.R b/R/EXOMEDEPTHCOV/R/run_EXOMEDEPTHCOV.R index 295306e..5694e4b 100644 --- a/R/EXOMEDEPTHCOV/R/run_EXOMEDEPTHCOV.R +++ b/R/EXOMEDEPTHCOV/R/run_EXOMEDEPTHCOV.R @@ -1,66 +1,54 @@ library(ExomeDepth) library(methods) -run_EXOMEDEPTHCOV <- function(reference_set_select_method, - num_of_samples_in_reference_set, - cov_table){ +run_EXOMEDEPTHCOV <- function(input_cov_table, + input_bed, + reference_sample_set_file, + output_calls_file){ - sampname <- unique(cov_table[,"sample_name"]) - targets <- cov_table[,c("target_id", "chr", "pos_min", "pos_max")] - targets <- targets[!duplicated(targets[,"target_id"]),] - targets <- targets[with(targets, order(target_id)), ] - + con <- file(reference_sample_set_file, open='r') + reference_sample_set <- readLines(con) + Y <- read.csv(input_cov_table) + sampname <- colnames(Y) + targets <- read.delim(input_bed) + rownames(Y) <- 1:nrow(Y) + rownames(targets) <- 1:nrow(targets) calls <- data.frame(matrix(nrow=0, ncol=13)) - chrs <- c(1:22, "X", "Y", paste0("chr",c(1:22, "X", "Y"))) - library(IRanges) - for(chr in chrs) { - targets_for_chr <- targets[targets[,"chr"] == chr,] - ref <- IRanges(start = targets_for_chr[,"pos_min"], end = targets_for_chr[,"pos_max"]) - if (length(ref) == 0) { # 0 elements for specified chromosome in bed + + for (i in 1:length(reference_sample_set)) { + if (reference_sample_set[[i]] == '') { next() } - Y <- coverageObj1(cov_table, sampname, targets_for_chr, chr)$Y - - for (actual_sample_id in 1:length(sampname)) { - actual_sample <- sampname[actual_sample_id] - ## ----reference.selection------------------------------------------------- - target_length <- c() - for (i in 1:nrow(Y)) { - target_length <- c(target_length, width(ref[i])) - } - reference_samples <- run_REFERENCE.SAMPLE.SET.SELECTOR(actual_sample, - Y, - reference_set_select_method, - num_of_samples_in_reference_set, - target_length) + samples <- unlist(strsplit(reference_sample_set[[i]], ',')) + actual_sample <- samples[1] + reference_samples <- samples[-1] - ## ----construct.ref------------------------------------------------------- - my.matrix <- as.matrix(Y[,reference_samples]) - my.reference.selected <- apply(X = my.matrix, - MAR = 1, - FUN = sum) + ## ----construct.ref------------------------------------------------------- + my.matrix <- as.matrix(Y[,reference_samples]) + my.reference.selected <- apply(X = my.matrix, + MAR = 1, + FUN = sum) - ## ----build.complete------------------------------------------------------ - all.exons <- new('ExomeDepth', - test = Y[,actual_sample_id], - reference = my.reference.selected, - formula = 'cbind(test, reference) ~ 1') + ## ----build.complete------------------------------------------------------ + all.exons <- new('ExomeDepth', + test = Y[,actual_sample], + reference = my.reference.selected, + formula = 'cbind(test, reference) ~ 1') - ## ----call.CNVs----------------------------------------------------------- - all.exons <- ExomeDepth::CallCNVs(x = all.exons, - transition.probability = 10^-4, - chromosome = rep(chr, nrow(Y)), - start = start(ref), - end = end(ref), - name = rep('name', nrow(Y))) - print(all.exons@CNV.calls) - if (nrow(all.exons@CNV.calls) > 0) { - actual_sample_column <- data.frame(matrix(rep(actual_sample, nrow(all.exons@CNV.calls)), nrow=nrow(all.exons@CNV.calls))) - callsIt <- cbind(actual_sample_column, all.exons@CNV.calls) - colnames(callsIt) <- c(c, colnames(all.exons@CNV.calls)) - if (nrow(calls)==0){calls <- data.frame(matrix(nrow=0, ncol=ncol(callsIt)))} - calls <- rbind(calls, callsIt) - } + ## ----call.CNVs----------------------------------------------------------- + all.exons <- ExomeDepth::CallCNVs(x = all.exons, + transition.probability = 10^-4, + chromosome = targets[,"chr"], + start = targets[,"st_bp"], + end = targets[,"ed_bp"], + name = rep('name', nrow(Y))) + print(all.exons@CNV.calls) + if (nrow(all.exons@CNV.calls) > 0) { + actual_sample_column <- data.frame(matrix(rep(actual_sample, nrow(all.exons@CNV.calls)), nrow=nrow(all.exons@CNV.calls))) + callsIt <- cbind(actual_sample_column, all.exons@CNV.calls) + colnames(callsIt) <- c(c, colnames(all.exons@CNV.calls)) + if (nrow(calls)==0){calls <- data.frame(matrix(nrow=0, ncol=ncol(callsIt)))} + calls <- rbind(calls, callsIt) } } # unify names of output columns @@ -86,5 +74,5 @@ run_EXOMEDEPTHCOV <- function(reference_set_select_method, colnames(calls)[colnames(calls) == 'reads.observed'] <- 'raw_cov' colnames(calls)[colnames(calls) == 'reads.ratio'] <- 'copy_no' calls[colnames(calls) == 'copy_no'] <- round(calls[colnames(calls) == 'raw_cov'] / (calls[colnames(calls) == 'norm_cov'] / 2)) - calls + write.csv(calls, output_calls_file, row.names=F) } diff --git a/R/REFERENCE.SAMPLE.SET.SELECTOR/R/functions_REFERENCE.SAMPLE.SET.SELECTOR.R b/R/REFERENCE.SAMPLE.SET.SELECTOR/R/functions_REFERENCE.SAMPLE.SET.SELECTOR.R index 6ac3736..4863a15 100644 --- a/R/REFERENCE.SAMPLE.SET.SELECTOR/R/functions_REFERENCE.SAMPLE.SET.SELECTOR.R +++ b/R/REFERENCE.SAMPLE.SET.SELECTOR/R/functions_REFERENCE.SAMPLE.SET.SELECTOR.R @@ -1,4 +1,3 @@ -library(ExomeDepth) canoes_method <- function(investigated_sample, Y, num_refs){ if (num_refs == 0) { @@ -13,7 +12,19 @@ canoes_method <- function(investigated_sample, Y, num_refs){ return(list(reference_samples=reference_samples)) } +canoes_cov_thresh_method <- function(investigated_sample, Y, cov_thresh){ + samples <- colnames(Y) + cov <- cor(Y[, samples], Y[, samples]) + reference_samples <- setdiff(samples, investigated_sample) + covariances <- cov[investigated_sample, reference_samples] + num_refs <- sum(covariances > cov_thresh) + reference_samples <- names(sort(covariances, + decreasing=T)[1:num_refs]) + return(list(reference_samples=reference_samples)) +} + exomedepth_method <- function(investigated_sample, Y, num_refs, target_length){ + library(ExomeDepth) samples <- colnames(Y) reference_samples <- setdiff(samples, investigated_sample) reference_set <- select.reference.set(test.counts = Y[,investigated_sample], @@ -30,3 +41,45 @@ exomedepth_method <- function(investigated_sample, Y, num_refs, target_length){ } return(list(reference_samples=reference_samples)) } + +random_method <- function(investigated_sample, Y, num_refs){ + samples <- colnames(Y) + reference_samples <- setdiff(samples, investigated_sample) + reference_samples <- reference_samples[sample(1:length(reference_samples), num_refs, replace=F)] + return(list(reference_samples=reference_samples)) +} + +kmeans_select_groups <- function(Y, number_of_clusters){ + samples <- colnames(Y) + cov <- cor(Y[, samples], Y[, samples]) + d <- cov + for(i in 1:nrow(d)) { + d[i,] <- cov[samples[i], samples] + } + d <- 1-d + c <- c() + for(i in 1:ncol(d)-1) { + c <- c(c, d[(i+1):nrow(d),i]) + } + d <- dist(d) + for(i in 1:length(d)) { + d[i] <- c[i] + } + km1 <- kmeans(d, number_of_clusters, nstart=100) + return(list(clusters=km1)) +} + +kmeans_method <- function(investigated_sample, Y, kmeans_clusters){ + samples <- colnames(Y) + cluster_id <- kmeans_clusters$cluster[investigated_sample] + reference_samples <- c() + list_index <- 1 + for(i in kmeans_clusters$cluster) { + if(i == cluster_id) { + reference_samples <- c(reference_samples, samples[list_index]) + } + list_index <- list_index + 1 + } + reference_samples <- setdiff(reference_samples, investigated_sample) + return(list(reference_samples=reference_samples)) +} diff --git a/R/REFERENCE.SAMPLE.SET.SELECTOR/R/run_REFERENCE.SAMPLE.SET.SELECTOR.R b/R/REFERENCE.SAMPLE.SET.SELECTOR/R/run_REFERENCE.SAMPLE.SET.SELECTOR.R index f025b95..202c284 100644 --- a/R/REFERENCE.SAMPLE.SET.SELECTOR/R/run_REFERENCE.SAMPLE.SET.SELECTOR.R +++ b/R/REFERENCE.SAMPLE.SET.SELECTOR/R/run_REFERENCE.SAMPLE.SET.SELECTOR.R @@ -1,16 +1,48 @@ -run_REFERENCE.SAMPLE.SET.SELECTOR <- function(investigated_sample, - Y, - select_method, +run_REFERENCE.SAMPLE.SET.SELECTOR <- function(select_method, num_refs, - target_length){ - if(select_method == "canoes") { - reference_samples <- canoes_method(investigated_sample, Y, num_refs)$reference_samples - } else if(select_method == "codex") { - #reference_samples <- codex_method(investigated_sample, Y, num_refs)$reference_samples - } else if(select_method == "exomedepth") { - reference_samples <- exomedepth_method(investigated_sample, Y, num_refs, target_length)$reference_samples - } else if(select_method == "clamms") { - #reference_samples <- clamms_method(investigated_sample, Y, num_refs)$reference_samples + cov_thresh, + input_cov_table, + input_bed, + output_reference_file){ + + Y <- data.matrix(read.csv(input_cov_table)) + sampname <- colnames(Y) + targets <- read.delim(input_bed) + target_length <- targets[,"st_bp"] - targets[,"ed_bp"] + reference_samples <- list() + if(select_method == "kmeans") { + kmeans_clusters <- kmeans_select_groups(Y, num_refs)$clusters } - reference_samples + + for(i in 1:length(sampname)) { + investigated_sample <- as.character(sampname[i]) + print(paste("Processing ", investigated_sample, " sample ...", sep="")) + if(select_method == "canoes") { + reference_samples_for_investigated_sample <- canoes_method(investigated_sample, Y, num_refs)$reference_samples + reference_samples[[i]] <- c(investigated_sample, reference_samples_for_investigated_sample) + } else if(select_method == "codex") { + #reference_samples_for_investigated_sample <- codex_method(investigated_sample, Y, num_refs)$reference_samples + #reference_samples[[i]] <- c(investigated_sample, reference_samples_for_investigated_sample) + } else if(select_method == "exomedepth") { + reference_samples_for_investigated_sample <- exomedepth_method(investigated_sample, Y, num_refs, target_length)$reference_samples + reference_samples[[i]] <- c(investigated_sample, reference_samples_for_investigated_sample) + } else if(select_method == "clamms") { + #reference_samples_for_investigated_sample <- clamms_method(investigated_sample, Y, num_refs)$reference_samples + #reference_samples[[i]] <- c(investigated_sample, reference_samples_for_investigated_sample) + } else if(select_method == "random") { + reference_samples_for_investigated_sample <- random_method(investigated_sample, Y, num_refs)$reference_samples + reference_samples[[i]] <- c(investigated_sample, reference_samples_for_investigated_sample) + } else if(select_method == "canoes_cov_thresh") { + reference_samples_for_investigated_sample <- canoes_cov_thresh_method(investigated_sample, Y, cov_thresh)$reference_samples + reference_samples[[i]] <- c(investigated_sample, reference_samples_for_investigated_sample) + } else if(select_method == "kmeans") { + reference_samples_for_investigated_sample <- kmeans_method(investigated_sample, Y, kmeans_clusters)$reference_samples + reference_samples[[i]] <- c(investigated_sample, reference_samples_for_investigated_sample) + } + } + resultant_string <- '' + for(i in 1:length(reference_samples)) { + resultant_string <- paste(resultant_string, paste(reference_samples[[i]], collapse=","), '\n', sep="") + } + write(resultant_string, output_reference_file) } diff --git a/R/TARGET.QC/R/functions_TARGET.QC.R b/R/TARGET.QC/R/functions_TARGET.QC.R index 63f2278..850803e 100644 --- a/R/TARGET.QC/R/functions_TARGET.QC.R +++ b/R/TARGET.QC/R/functions_TARGET.QC.R @@ -1,17 +1,5 @@ library(CODEX) -coverageObj1 <- function(cov_table, sampname, targets_for_chr, chr){ - Y <- matrix(data=as.integer(0), nrow = nrow(targets_for_chr), ncol = 0) - for(sample in sampname) { - cov_targets_for_sample <- cov_table[cov_table[,"sample_name"] == sample,] - cov_targets_for_sample <- cov_targets_for_sample[with(cov_targets_for_sample, order(target_id)), ] - Y <- cbind(Y, cov_targets_for_sample[,"read_count"]) - } - colnames(Y) <- sampname - rownames(Y) <- targets_for_chr[,"target_id"] - return(list(Y=Y)) -} - gcmapp1 <- function(chr, ref){ gc <- getgc(chr, ref) mapp <- getmapp(chr, ref) diff --git a/R/TARGET.QC/R/run_TARGET.QC.R b/R/TARGET.QC/R/run_TARGET.QC.R index 23eed55..f5560c6 100644 --- a/R/TARGET.QC/R/run_TARGET.QC.R +++ b/R/TARGET.QC/R/run_TARGET.QC.R @@ -5,7 +5,10 @@ run_TARGET.QC <- function(mapp_thresh, length_thresh_to, gc_thresh_from, gc_thresh_to, - cov_table){ + input_cov_table, + output_cov_table, + input_bed, + output_bed){ #mapp_thresh <- 0.9 #cov_thresh_from <- 20 #cov_thresh_to <- 4000 @@ -13,51 +16,23 @@ run_TARGET.QC <- function(mapp_thresh, #length_thresh_to <- 2000 #gc_thresh_from <- 20 #gc_thresh_to <- 80 - #K_from <- 1 - #K_to <- 9 - #lmax <- 200 - sampname <- unique(cov_table[,"sample_name"]) - targets <- cov_table[,c("target_id", "chr", "pos_min", "pos_max")] - targets <- targets[!duplicated(targets[,"target_id"]),] - targets <- targets[with(targets, order(target_id)), ] - cov_table_qc <- matrix(nrow=0, ncol=6) - colnames(cov_table_qc) <- colnames(cov_table) - - chrs <- c(1:22, "X", "Y", paste0("chr",c(1:22, "X", "Y"))) - for(chr in chrs) { - targets_for_chr <- targets[targets[,"chr"] == chr,] - ref <- IRanges(start = targets_for_chr[,"pos_min"], end = targets_for_chr[,"pos_max"]) - if (length(ref) == 0) { # 0 elements for specified chromosome in bed - next() - } - Y <- coverageObj1(cov_table, sampname, targets_for_chr, chr)$Y - gcmapp1_result <- gcmapp1(chr, ref) - gc <- gcmapp1_result$gc - mapp <- gcmapp1_result$mapp - - qcObj1_result <- qcObj1(Y, sampname, chr, ref, mapp, gc, cov_thresh = c(cov_thresh_from, cov_thresh_to), - length_thresh = c(length_thresh_from, length_thresh_to), mapp_thresh, - gc_thresh = c(gc_thresh_from, gc_thresh_to)) - Y_qc <- qcObj1_result$Y_qc - sampname_qc <- qcObj1_result$sampname_qc - ref_qc <- qcObj1_result$ref_qc - colnames(Y_qc) <- sampname_qc - for(sample in colnames(Y_qc)) { - new_cov_table_qc_rows <- cbind(sample, rownames(Y_qc), chr, start(ref_qc), end(ref_qc), Y_qc[,sample]) - cov_table_qc <- rbind(cov_table_qc, new_cov_table_qc_rows) - } - } - cov_table_qc <- as.data.frame(cov_table_qc) - cov_table_qc[,"pos_min"] <- strtoi(cov_table_qc[,"pos_min"]) - cov_table_qc[,"pos_max"] <- strtoi(cov_table_qc[,"pos_max"]) - cov_table_qc[,"target_id"] <- strtoi(cov_table_qc[,"target_id"]) - cov_table_qc[,"read_count"] <- strtoi(cov_table_qc[,"read_count"]) - cov_table_qc + Y <- read.csv(input_cov_table) + sampname <- colnames(Y) + targets <- read.delim(input_bed) + rownames(Y) <- 1:nrow(Y) + rownames(targets) <- 1:nrow(targets) + ref <- IRanges(start = targets[,"st_bp"], end = targets[,"ed_bp"]) + gcmapp1_result <- gcmapp1(targets[1,"chr"], ref) + gc <- gcmapp1_result$gc + mapp <- gcmapp1_result$mapp + qcObj1_result <- qcObj1(Y, sampname, targets[1,"chr"], ref, mapp, gc, cov_thresh = c(cov_thresh_from, cov_thresh_to), + length_thresh = c(length_thresh_from, length_thresh_to), mapp_thresh, + gc_thresh = c(gc_thresh_from, gc_thresh_to)) + Y_qc <- qcObj1_result$Y_qc + sampname_qc <- qcObj1_result$sampname_qc + ref_qc <- qcObj1_result$ref_qc + colnames(Y_qc) <- sampname_qc + write.csv(Y_qc, output_cov_table, row.names=F, quote=F) + write.table(targets[rownames(Y_qc),], output_bed, row.names=F, quote=F, sep="\t") } -# sample_name target_id chr pos_min pos_max read_count -#1 NA19012 193524 Y 25426932 25427053 0 -#2 NA19012 193525 Y 25431556 25431676 0 -#3 NA19012 193526 Y 25535089 25535239 0 -#4 NA19012 193527 Y 25537286 25537526 0 -#5 NA19012 193528 Y 25538793 25538913 0 diff --git a/airflow/dags/canoes.py b/airflow/dags/canoes.py new file mode 100755 index 0000000..fe08612 --- /dev/null +++ b/airflow/dags/canoes.py @@ -0,0 +1,59 @@ +from airflow import DAG +from airflow.operators.bash_operator import BashOperator +from airflow.models import Variable +from datetime import datetime, timedelta + +default_args = { + 'owner': 'biodatageeks', + 'depends_on_past': False, + 'start_date': datetime(2017, 10, 18), + 'email': ['team@biodatageeks.ii.pw.edu.pl'], + 'email_on_failure': False, + 'email_on_retry': False, + 'retries': 0 +} + +dag = DAG( + 'canoes', default_args=default_args, schedule_interval=None) + +############################################## +########## RUN RAW CANOES CNV CALLER ########## +############################################## + +### target qc parameters +mapp_thresh = '0.9' +cov_thresh_from = '20' +cov_thresh_to = '4000' +length_thresh_from = '20' +length_thresh_to = '2000' +gc_thresh_from = '20' +gc_thresh_to = '80' +raw_cov_table = 'raw_cov_table.csv' +qc_cov_table = 'qc_cov_table.csv' +raw_bed = 'raw_bed.bed' +qc_bed = 'qc_bed.bed' + +### select reference sample set parameters +select_method = 'exomedepth' # "canoes", "codex" or "exomedepth" +num_refs = '30' +reference_sample_set_file = 'reference_sample_set.csv' + +### canoes parameters +output_calls_file = 'calls.csv' + +run_canoes_caller_cmd= " \ +docker pull biodatageeks/cnv-opt-target-qc; \ +docker run --rm -v /tmp:/tmp -w=\"/tmp\" biodatageeks/cnv-opt-target-qc Rscript -e \"library(\'TARGET.QC\');run_TARGET.QC(" + mapp_thresh + "," + cov_thresh_from + "," + cov_thresh_to + "," + length_thresh_from + "," + length_thresh_to + "," + gc_thresh_from + "," + gc_thresh_to + ",'" + raw_cov_table + "','" + qc_cov_table + "','" + raw_bed + "','" + qc_bed + "')\"; \ +docker pull biodatageeks/cnv-opt-reference-sample-set-selector; \ +docker run --rm -v /tmp:/tmp -w=\"/tmp\" biodatageeks/cnv-opt-reference-sample-set-selector Rscript -e \"library(\'REFERENCE.SAMPLE.SET.SELECTOR\');run_REFERENCE.SAMPLE.SET.SELECTOR('" + select_method + "'," + num_refs + ",'" + qc_cov_table + "','" + qc_bed + "','" + reference_sample_set_file + "')\"; \ +docker pull biodatageeks/cnv-opt-canoescov; \ +docker run --rm -v /tmp:/tmp -w=\"/tmp\" biodatageeks/cnv-opt-canoescov Rscript -e \"library(\'CANOESCOV\');run_CANOESCOV('" + qc_cov_table + "','" + qc_bed + "','" + reference_sample_set_file + "','" + output_calls_file + "')\"; \ +" + +run_canoes_caller_task= BashOperator ( + bash_command=run_canoes_caller_cmd, + task_id='run_canoes_caller_task', + dag=dag +) + +run_canoes_caller_task diff --git a/airflow/dags/codex.py b/airflow/dags/codex.py new file mode 100755 index 0000000..210b3bb --- /dev/null +++ b/airflow/dags/codex.py @@ -0,0 +1,62 @@ +from airflow import DAG +from airflow.operators.bash_operator import BashOperator +from airflow.models import Variable +from datetime import datetime, timedelta + +default_args = { + 'owner': 'biodatageeks', + 'depends_on_past': False, + 'start_date': datetime(2017, 10, 18), + 'email': ['team@biodatageeks.ii.pw.edu.pl'], + 'email_on_failure': False, + 'email_on_retry': False, + 'retries': 0 +} + +dag = DAG( + 'codex', default_args=default_args, schedule_interval=None) + +############################################## +########## RUN RAW CODEX CNV CALLER ########## +############################################## + +### target qc parameters +mapp_thresh = '0.9' +cov_thresh_from = '20' +cov_thresh_to = '4000' +length_thresh_from = '20' +length_thresh_to = '2000' +gc_thresh_from = '20' +gc_thresh_to = '80' +raw_cov_table = 'raw_cov_table.csv' +qc_cov_table = 'qc_cov_table.csv' +raw_bed = 'raw_bed.bed' +qc_bed = 'qc_bed.bed' + +### select reference sample set parameters +select_method = 'exomedepth' # "canoes", "codex" or "exomedepth" +num_refs = '30' +reference_sample_set_file = 'reference_sample_set.csv' + +### codex parameters +k_from = '1' +k_to = '3' +lmax = '200' +output_calls_file = 'calls.csv' + +run_codex_caller_cmd= " \ +docker pull biodatageeks/cnv-opt-target-qc; \ +docker run --rm -v /tmp:/tmp -w=\"/tmp\" biodatageeks/cnv-opt-target-qc Rscript -e \"library(\'TARGET.QC\');run_TARGET.QC(" + mapp_thresh + "," + cov_thresh_from + "," + cov_thresh_to + "," + length_thresh_from + "," + length_thresh_to + "," + gc_thresh_from + "," + gc_thresh_to + ",'" + raw_cov_table + "','" + qc_cov_table + "','" + raw_bed + "','" + qc_bed + "')\"; \ +docker pull biodatageeks/cnv-opt-reference-sample-set-selector; \ +docker run --rm -v /tmp:/tmp -w=\"/tmp\" biodatageeks/cnv-opt-reference-sample-set-selector Rscript -e \"library(\'REFERENCE.SAMPLE.SET.SELECTOR\');run_REFERENCE.SAMPLE.SET.SELECTOR('" + select_method + "'," + num_refs + ",'" + qc_cov_table + "','" + qc_bed + "','" + reference_sample_set_file + "')\"; \ +docker pull biodatageeks/cnv-opt-codexcov; \ +docker run --rm -v /tmp:/tmp -w=\"/tmp\" biodatageeks/cnv-opt-codexcov Rscript -e \"library(\'CODEXCOV\');run_CODEXCOV(" + k_from + "," + k_to + "," + lmax + ",'" + qc_cov_table + "','" + qc_bed + "','" + reference_sample_set_file + "','" + output_calls_file + "')\"; \ +" + +run_codex_caller_task= BashOperator ( + bash_command=run_codex_caller_cmd, + task_id='run_codex_caller_task', + dag=dag +) + +run_codex_caller_task diff --git a/airflow/dags/exomedepth.py b/airflow/dags/exomedepth.py new file mode 100755 index 0000000..7952c2b --- /dev/null +++ b/airflow/dags/exomedepth.py @@ -0,0 +1,59 @@ +from airflow import DAG +from airflow.operators.bash_operator import BashOperator +from airflow.models import Variable +from datetime import datetime, timedelta + +default_args = { + 'owner': 'biodatageeks', + 'depends_on_past': False, + 'start_date': datetime(2017, 10, 18), + 'email': ['team@biodatageeks.ii.pw.edu.pl'], + 'email_on_failure': False, + 'email_on_retry': False, + 'retries': 0 +} + +dag = DAG( + 'exomedepth', default_args=default_args, schedule_interval=None) + +################################################### +########## RUN RAW EXOMEDEPTH CNV CALLER ########## +################################################### + +### target qc parameters +mapp_thresh = '0.9' +cov_thresh_from = '20' +cov_thresh_to = '4000' +length_thresh_from = '20' +length_thresh_to = '2000' +gc_thresh_from = '20' +gc_thresh_to = '80' +raw_cov_table = 'raw_cov_table.csv' +qc_cov_table = 'qc_cov_table.csv' +raw_bed = 'raw_bed.bed' +qc_bed = 'qc_bed.bed' + +### select reference sample set parameters +select_method = 'exomedepth' # "canoes", "codex" or "exomedepth" +num_refs = '30' +reference_sample_set_file = 'reference_sample_set.csv' + +### exomedepth parameters +output_calls_file = 'calls.csv' + +run_exomedepth_caller_cmd= " \ +docker pull biodatageeks/cnv-opt-target-qc; \ +docker run --rm -v /tmp:/tmp -w=\"/tmp\" biodatageeks/cnv-opt-target-qc Rscript -e \"library(\'TARGET.QC\');run_TARGET.QC(" + mapp_thresh + "," + cov_thresh_from + "," + cov_thresh_to + "," + length_thresh_from + "," + length_thresh_to + "," + gc_thresh_from + "," + gc_thresh_to + ",'" + raw_cov_table + "','" + qc_cov_table + "','" + raw_bed + "','" + qc_bed + "')\"; \ +docker pull biodatageeks/cnv-opt-reference-sample-set-selector; \ +docker run --rm -v /tmp:/tmp -w=\"/tmp\" biodatageeks/cnv-opt-reference-sample-set-selector Rscript -e \"library(\'REFERENCE.SAMPLE.SET.SELECTOR\');run_REFERENCE.SAMPLE.SET.SELECTOR('" + select_method + "'," + num_refs + ",'" + qc_cov_table + "','" + qc_bed + "','" + reference_sample_set_file + "')\"; \ +docker pull biodatageeks/cnv-opt-exomedepthcov; \ +docker run --rm -v /tmp:/tmp -w=\"/tmp\" biodatageeks/cnv-opt-exomedepthcov Rscript -e \"library(\'EXOMEDEPTHCOV\');run_EXOMEDEPTHCOV('" + qc_cov_table + "','" + qc_bed + "','" + reference_sample_set_file + "','" + output_calls_file + "')\"; \ +" + +run_exomedepth_caller_task= BashOperator ( + bash_command=run_exomedepth_caller_cmd, + task_id='run_exomedepth_caller_task', + dag=dag +) + +run_exomedepth_caller_task diff --git a/build.sh b/build.sh new file mode 100755 index 0000000..313e5cb --- /dev/null +++ b/build.sh @@ -0,0 +1,57 @@ +#!/bin/bash -x + +BUILD_MODE=$1 +#only build images modified in the last 10h (10*3600s) +MAX_COMMIT_TS_DIFF=36000 + +bump_version () { + incl=0.01 + version="0.00" + if [ "$(curl -L -s "https://registry.hub.docker.com/v2/repositories/${image}/tags" | jq -r ".detail")" == "Object not found" ]; then + version="0.01" + else + version=`curl -L -s "https://registry.hub.docker.com/v2/repositories/${image}/tags" | jq -r '.results[0].name '` + version=`echo $version + $incl | bc| awk '{printf "%.2f\n", $0}'` + fi + echo $version +} + + +find Docker -name "Dockerfile" | sed 's/\/Dockerfile//' | while read dir; +do + + image=`echo $dir| sed 's/^Docker/biodatageeks/'` + version=`if [ ! -e $dir/version ]; then bump_version $image; else tail -1 $dir/version; fi` + if [ -e $dir/version ]; then + ver=`tail -1 $dir/version`; + if [[ $OSTYPE != "darwin17" ]]; then + sed -i "s/{{COMPONENT_VERSION}}/${ver}/g" $dir/Dockerfile ; + else + sed -i '' "s/{{COMPONENT_VERSION}}/${ver}/g" $dir/Dockerfile ; + fi + fi + echo "Building image ${image}..." + diffTs=`echo "$(date +%s) - $(git log -n 1 --pretty=format:%at ${dir})" | bc` + if [ $diffTs -lt $MAX_COMMIT_TS_DIFF ]; then + cd $dir + if [[ ${image} == "biodatageeks/cnv-opt-cnv-simulator" ]]; then + echo "Rebuild of ${image} image forced..." + docker build --no-cache --build-arg CACHE_DATE=$(date +%Y-%m-%d:%H:%M:%S) -t $image:$version . + docker build --no-cache --build-arg CACHE_DATE=$(date +%Y-%m-%d:%H:%M:%S) -t $image:latest . + else + docker build -t $image:$version . + docker build -t $image:latest . + fi + if [[ ${BUILD_MODE} != "local" ]]; then + docker push docker.io/$image:latest + docker push docker.io/$image:$version + fi + ##revert COMPONENT_VERSION variable + if [ -e version ]; then ver=`tail -1 version`; sed -i '' "s/${ver}/{{COMPONENT_VERSION}}/g" Dockerfile ; fi + #keep only last 3 versions of an image locally (2+3 in tail part) + docker images $image | tail -n +5 | sed 's/ \{1,\}/:/g' | cut -f1,2 -d':' | xargs -i docker rmi {} + + cd ../.. + fi + +done