From ea559a77a1f0e8a02c7e38b20fb0a126467e9741 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Wiktor=20Ku=C5=9Bmirek?= <kusmirekwiktor@gmail.com>
Date: Mon, 19 Mar 2018 18:20:20 +0100
Subject: [PATCH 001/114] dockers for single R packages - init

---
 Docker/target-qc/Dockerfile | 18 +++++++++++++
 build.sh                    | 51 +++++++++++++++++++++++++++++++++++++
 2 files changed, 69 insertions(+)
 create mode 100644 Docker/target-qc/Dockerfile
 create mode 100755 build.sh

diff --git a/Docker/target-qc/Dockerfile b/Docker/target-qc/Dockerfile
new file mode 100644
index 0000000..1dbc44e
--- /dev/null
+++ b/Docker/target-qc/Dockerfile
@@ -0,0 +1,18 @@
+FROM biodatageeks/bdg-spark
+MAINTAINER biodatageeks <team@biodatageeks.ii.pw.edu.pl>
+
+
+RUN apt-get update && apt-get install --yes \
+				wget \
+                 && rm -rf /var/lib/apt/lists/*
+
+RUN wget https://repo1.maven.org/maven2/org/bdgenomics/adam/adam-distribution-spark2_2.11/0.23.0/adam-distribution-spark2_2.11-0.23.0-bin.tar.gz
+RUN tar -zxvf adam-distribution-spark2_2.11-0.23.0-bin.tar.gz
+RUN mv adam-distribution-spark2_2.11-0.23.0 adam && mv adam /tmp && rm -f adam-distribution-spark2_2.11-0.23.0-bin.tar.gz
+ENV PATH="/tmp/adam/bin:${PATH}"
+
+
+
+
+
+
diff --git a/build.sh b/build.sh
new file mode 100755
index 0000000..cae7f57
--- /dev/null
+++ b/build.sh
@@ -0,0 +1,51 @@
+#!/bin/bash -x
+
+BUILD_MODE=$1
+#only build images modified in the last 10h (10*3600s)
+MAX_COMMIT_TS_DIFF=36000
+
+bump_version () {
+  incl=0.01
+  version="0.00"
+  if [ "$(curl -L -s "https://registry.hub.docker.com/v2/repositories/${image}/tags" | jq -r ".detail")" == "Object not found" ]; then
+    version="0.01"
+  else
+    version=`curl -L -s "https://registry.hub.docker.com/v2/repositories/${image}/tags"  | jq -r '.results[0].name '`
+    version=`echo $version + $incl | bc| awk '{printf "%.2f\n", $0}'`
+  fi
+  echo $version
+}
+
+
+find Docker  -name "Dockerfile"  | sed 's/\/Dockerfile//' | while read dir;
+do
+
+  image=`echo $dir| sed 's/^Docker/biodatageeks/'`
+  version=`if [ ! -e $dir/version ]; then bump_version $image; else tail -1 $dir/version; fi`
+  if [ -e $dir/version ]; then
+    ver=`tail -1 $dir/version`;
+    if [[ $OSTYPE != "darwin17" ]]; then
+     sed -i "s/{{COMPONENT_VERSION}}/${ver}/g" $dir/Dockerfile ;
+    else
+     sed -i '' "s/{{COMPONENT_VERSION}}/${ver}/g" $dir/Dockerfile ;
+    fi
+  fi
+  echo "Building image ${image}..."
+  diffTs=`echo "$(date +%s) - $(git log -n 1 --pretty=format:%at ${dir})" | bc`
+  if [ $diffTs -lt $MAX_COMMIT_TS_DIFF ]; then
+    cd $dir
+    docker build  -t $image:$version .
+    docker build  -t $image:latest .
+    if [[ ${BUILD_MODE} != "local" ]]; then
+      docker push docker.io/$image:latest
+      docker push docker.io/$image:$version
+    fi
+    ##revert COMPONENT_VERSION variable
+    if [ -e version ]; then ver=`tail -1 version`; sed -i '' "s/${ver}/{{COMPONENT_VERSION}}/g" Dockerfile ; fi
+    #keep only last 3 versions of an image locally (2+3 in tail part)
+    docker images $image | tail -n +5 | sed 's/ \{1,\}/:/g' | cut -f1,2 -d':' | xargs -i docker rmi {}
+
+    cd ../..
+  fi
+
+done

From 723d22248e261164a60bc62325af1d12401e519c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Wiktor=20Ku=C5=9Bmirek?= <kusmirekwiktor@gmail.com>
Date: Mon, 19 Mar 2018 18:36:11 +0100
Subject: [PATCH 002/114] building docker images before running tests - only
 for speed up dev process

---
 Docker/reference-sample-set-selector/Dockerfile | 7 +++++++
 1 file changed, 7 insertions(+)
 create mode 100644 Docker/reference-sample-set-selector/Dockerfile

diff --git a/Docker/reference-sample-set-selector/Dockerfile b/Docker/reference-sample-set-selector/Dockerfile
new file mode 100644
index 0000000..5ff1007
--- /dev/null
+++ b/Docker/reference-sample-set-selector/Dockerfile
@@ -0,0 +1,7 @@
+FROM biodatageeks/bdg-spark
+MAINTAINER biodatageeks <team@biodatageeks.ii.pw.edu.pl>
+
+
+
+
+

From 357e36ce5299bc04fa1dae9edf2eaf7e56bae9ee Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Wiktor=20Ku=C5=9Bmirek?= <kusmirekwiktor@gmail.com>
Date: Mon, 19 Mar 2018 18:39:36 +0100
Subject: [PATCH 003/114] bugfix in Jenkinsfile

---
 Docker/target-qc/Dockerfile | 11 -----------
 Jenkinsfile                 | 15 +++++++++++++++
 build.sh                    | 14 +++++++-------
 3 files changed, 22 insertions(+), 18 deletions(-)

diff --git a/Docker/target-qc/Dockerfile b/Docker/target-qc/Dockerfile
index 1dbc44e..5ff1007 100644
--- a/Docker/target-qc/Dockerfile
+++ b/Docker/target-qc/Dockerfile
@@ -2,17 +2,6 @@ FROM biodatageeks/bdg-spark
 MAINTAINER biodatageeks <team@biodatageeks.ii.pw.edu.pl>
 
 
-RUN apt-get update && apt-get install --yes \
-				wget \
-                 && rm -rf /var/lib/apt/lists/*
-
-RUN wget https://repo1.maven.org/maven2/org/bdgenomics/adam/adam-distribution-spark2_2.11/0.23.0/adam-distribution-spark2_2.11-0.23.0-bin.tar.gz
-RUN tar -zxvf adam-distribution-spark2_2.11-0.23.0-bin.tar.gz
-RUN mv adam-distribution-spark2_2.11-0.23.0 adam && mv adam /tmp && rm -f adam-distribution-spark2_2.11-0.23.0-bin.tar.gz
-ENV PATH="/tmp/adam/bin:${PATH}"
-
-
-
 
 
 
diff --git a/Jenkinsfile b/Jenkinsfile
index 4d46cc4..4b999d4 100644
--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -2,6 +2,13 @@ pipeline {
     agent any
        stages {
 
+        stage('Building Docker images') {
+                    steps {
+                        echo 'Building Docker images....'
+                        sh './build.sh'
+                    }
+                }
+
         stage('Test R code') {
                     steps {
                         echo 'Testing R code....'
@@ -52,6 +59,14 @@ pipeline {
                             }
 
                 }
+
+        stage('Building Docker images') {
+                    steps {
+                        echo 'Building Docker images....'
+                        sh './build.sh'
+                    }
+                }
+
          stage('Publish to Nexus snapshots and copying assembly fat jar to the edge server') {
                    when {
                          branch 'master'
diff --git a/build.sh b/build.sh
index cae7f57..73ef363 100755
--- a/build.sh
+++ b/build.sh
@@ -36,16 +36,16 @@ do
     cd $dir
     docker build  -t $image:$version .
     docker build  -t $image:latest .
-    if [[ ${BUILD_MODE} != "local" ]]; then
-      docker push docker.io/$image:latest
-      docker push docker.io/$image:$version
-    fi
+#    if [[ ${BUILD_MODE} != "local" ]]; then
+#      docker push docker.io/$image:latest
+#      docker push docker.io/$image:$version
+#    fi
     ##revert COMPONENT_VERSION variable
-    if [ -e version ]; then ver=`tail -1 version`; sed -i '' "s/${ver}/{{COMPONENT_VERSION}}/g" Dockerfile ; fi
+#    if [ -e version ]; then ver=`tail -1 version`; sed -i '' "s/${ver}/{{COMPONENT_VERSION}}/g" Dockerfile ; fi
     #keep only last 3 versions of an image locally (2+3 in tail part)
-    docker images $image | tail -n +5 | sed 's/ \{1,\}/:/g' | cut -f1,2 -d':' | xargs -i docker rmi {}
+#    docker images $image | tail -n +5 | sed 's/ \{1,\}/:/g' | cut -f1,2 -d':' | xargs -i docker rmi {}
 
-    cd ../..
+#    cd ../..
   fi
 
 done

From 7cd7620560b1288d783eb6c4f7fdd5788f30ab79 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Wiktor=20Ku=C5=9Bmirek?= <kusmirekwiktor@gmail.com>
Date: Mon, 19 Mar 2018 18:41:31 +0100
Subject: [PATCH 004/114] build.sh file without pushing dockers images

---
 Jenkinsfile | 2 +-
 build.sh    | 6 +++---
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/Jenkinsfile b/Jenkinsfile
index 4b999d4..4c88e48 100644
--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -60,7 +60,7 @@ pipeline {
 
                 }
 
-        stage('Building Docker images') {
+        stage('Build Docker images') {
                     steps {
                         echo 'Building Docker images....'
                         sh './build.sh'
diff --git a/build.sh b/build.sh
index 73ef363..7bd1419 100755
--- a/build.sh
+++ b/build.sh
@@ -41,11 +41,11 @@ do
 #      docker push docker.io/$image:$version
 #    fi
     ##revert COMPONENT_VERSION variable
-#    if [ -e version ]; then ver=`tail -1 version`; sed -i '' "s/${ver}/{{COMPONENT_VERSION}}/g" Dockerfile ; fi
+    if [ -e version ]; then ver=`tail -1 version`; sed -i '' "s/${ver}/{{COMPONENT_VERSION}}/g" Dockerfile ; fi
     #keep only last 3 versions of an image locally (2+3 in tail part)
-#    docker images $image | tail -n +5 | sed 's/ \{1,\}/:/g' | cut -f1,2 -d':' | xargs -i docker rmi {}
+    docker images $image | tail -n +5 | sed 's/ \{1,\}/:/g' | cut -f1,2 -d':' | xargs -i docker rmi {}
 
-#    cd ../..
+    cd ../..
   fi
 
 done

From f72a2ba71e447e22d5599ef47cf8defb7dbb755f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Wiktor=20Ku=C5=9Bmirek?= <kusmirekwiktor@gmail.com>
Date: Mon, 19 Mar 2018 18:44:02 +0100
Subject: [PATCH 005/114] pushing images in build.sh file

---
 build.sh | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/build.sh b/build.sh
index 7bd1419..cae7f57 100755
--- a/build.sh
+++ b/build.sh
@@ -36,10 +36,10 @@ do
     cd $dir
     docker build  -t $image:$version .
     docker build  -t $image:latest .
-#    if [[ ${BUILD_MODE} != "local" ]]; then
-#      docker push docker.io/$image:latest
-#      docker push docker.io/$image:$version
-#    fi
+    if [[ ${BUILD_MODE} != "local" ]]; then
+      docker push docker.io/$image:latest
+      docker push docker.io/$image:$version
+    fi
     ##revert COMPONENT_VERSION variable
     if [ -e version ]; then ver=`tail -1 version`; sed -i '' "s/${ver}/{{COMPONENT_VERSION}}/g" Dockerfile ; fi
     #keep only last 3 versions of an image locally (2+3 in tail part)

From 5de7effb6a6e91a239eb359057ff8b7b161c4fb6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Wiktor=20Ku=C5=9Bmirek?= <kusmirekwiktor@gmail.com>
Date: Mon, 19 Mar 2018 19:12:26 +0100
Subject: [PATCH 006/114] rename docker images

---
 .../Dockerfile                                                    | 0
 Docker/{target-qc => cnv-opt-target-qc}/Dockerfile                | 0
 2 files changed, 0 insertions(+), 0 deletions(-)
 rename Docker/{reference-sample-set-selector => cnv-opt-reference-sample-set-selector}/Dockerfile (100%)
 rename Docker/{target-qc => cnv-opt-target-qc}/Dockerfile (100%)

diff --git a/Docker/reference-sample-set-selector/Dockerfile b/Docker/cnv-opt-reference-sample-set-selector/Dockerfile
similarity index 100%
rename from Docker/reference-sample-set-selector/Dockerfile
rename to Docker/cnv-opt-reference-sample-set-selector/Dockerfile
diff --git a/Docker/target-qc/Dockerfile b/Docker/cnv-opt-target-qc/Dockerfile
similarity index 100%
rename from Docker/target-qc/Dockerfile
rename to Docker/cnv-opt-target-qc/Dockerfile

From a0cc870a05fc43676841fef2159d591dcb262608 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Wiktor=20Ku=C5=9Bmirek?= <kusmirekwiktor@gmail.com>
Date: Tue, 20 Mar 2018 14:27:37 +0100
Subject: [PATCH 007/114] draft of docker containers for CODEX, Canoes and
 Exomedepth

---
 Docker/cnv-opt-canoes/Dockerfile     | 17 +++++++++++++++++
 Docker/cnv-opt-codex/Dockerfile      | 14 ++++++++++++++
 Docker/cnv-opt-exomedepth/Dockerfile | 21 +++++++++++++++++++++
 Docker/cnv-opt-target-qc/Dockerfile  |  8 ++++++--
 4 files changed, 58 insertions(+), 2 deletions(-)
 create mode 100644 Docker/cnv-opt-canoes/Dockerfile
 create mode 100644 Docker/cnv-opt-codex/Dockerfile
 create mode 100644 Docker/cnv-opt-exomedepth/Dockerfile

diff --git a/Docker/cnv-opt-canoes/Dockerfile b/Docker/cnv-opt-canoes/Dockerfile
new file mode 100644
index 0000000..6667651
--- /dev/null
+++ b/Docker/cnv-opt-canoes/Dockerfile
@@ -0,0 +1,17 @@
+FROM ubuntu:xenial
+MAINTAINER biodatageeks <team@biodatageeks.ii.pw.edu.pl>
+
+RUN apt-get update
+RUN apt-get install -y software-properties-common
+RUN apt-key adv --keyserver keyserver.ubuntu.com --recv-keys E298A3A825C0D65DFD57CBB651716619E084DAB9
+RUN add-apt-repository 'deb [arch=amd64,i386] https://cran.rstudio.com/bin/linux/ubuntu xenial/'
+RUN apt-get install -y apt-transport-https
+
+RUN apt-get update && \
+    apt-get upgrade -y && \
+    apt-get install -y r-base libssl-dev libssh2-1-dev libxml2-dev libcurl4-openssl-dev libpq-dev
+
+RUN Rscript -e "install.packages('nnls', repos = 'http://cran.us.r-project.org')"
+RUN Rscript -e "install.packages('Hmisc', repos = 'http://cran.us.r-project.org')"
+RUN Rscript -e "install.packages('mgcv', repos = 'http://cran.us.r-project.org')"
+RUN Rscript -e "install.packages('plyr', repos = 'http://cran.us.r-project.org')"
diff --git a/Docker/cnv-opt-codex/Dockerfile b/Docker/cnv-opt-codex/Dockerfile
new file mode 100644
index 0000000..6f56eb5
--- /dev/null
+++ b/Docker/cnv-opt-codex/Dockerfile
@@ -0,0 +1,14 @@
+FROM ubuntu:xenial
+MAINTAINER biodatageeks <team@biodatageeks.ii.pw.edu.pl>
+
+RUN apt-get update
+RUN apt-get install -y software-properties-common
+RUN apt-key adv --keyserver keyserver.ubuntu.com --recv-keys E298A3A825C0D65DFD57CBB651716619E084DAB9
+RUN add-apt-repository 'deb [arch=amd64,i386] https://cran.rstudio.com/bin/linux/ubuntu xenial/'
+RUN apt-get install -y apt-transport-https
+
+RUN apt-get update && \
+    apt-get upgrade -y && \
+    apt-get install -y r-base libssl-dev libssh2-1-dev libxml2-dev libcurl4-openssl-dev libpq-dev
+
+RUN Rscript -e "source('https://bioconductor.org/biocLite.R');biocLite('CODEX')"
diff --git a/Docker/cnv-opt-exomedepth/Dockerfile b/Docker/cnv-opt-exomedepth/Dockerfile
new file mode 100644
index 0000000..fd16d85
--- /dev/null
+++ b/Docker/cnv-opt-exomedepth/Dockerfile
@@ -0,0 +1,21 @@
+FROM ubuntu:xenial
+MAINTAINER biodatageeks <team@biodatageeks.ii.pw.edu.pl>
+
+RUN apt-get update
+RUN apt-get install -y software-properties-common
+RUN apt-key adv --keyserver keyserver.ubuntu.com --recv-keys E298A3A825C0D65DFD57CBB651716619E084DAB9
+RUN add-apt-repository 'deb [arch=amd64,i386] https://cran.rstudio.com/bin/linux/ubuntu xenial/'
+RUN apt-get install -y apt-transport-https
+
+RUN apt-get update && \
+    apt-get upgrade -y && \
+    apt-get install -y r-base libssl-dev libssh2-1-dev libxml2-dev libcurl4-openssl-dev libpq-dev
+
+RUN Rscript -e "source('https://bioconductor.org/biocLite.R');biocLite('Biostrings')"
+RUN Rscript -e "source('https://bioconductor.org/biocLite.R');biocLite('IRanges')"
+RUN Rscript -e "source('https://bioconductor.org/biocLite.R');biocLite('Rsamtools')"
+RUN Rscript -e "source('https://bioconductor.org/biocLite.R');biocLite('GenomicRanges')"
+RUN Rscript -e "source('https://bioconductor.org/biocLite.R');biocLite('RCurl')"
+RUN Rscript -e "source('https://bioconductor.org/biocLite.R');biocLite('GenomicAlignments')"
+RUN Rscript -e "install.packages('ExomeDepth', repos = 'http://cran.us.r-project.org')"
+
diff --git a/Docker/cnv-opt-target-qc/Dockerfile b/Docker/cnv-opt-target-qc/Dockerfile
index 5ff1007..ab19c2f 100644
--- a/Docker/cnv-opt-target-qc/Dockerfile
+++ b/Docker/cnv-opt-target-qc/Dockerfile
@@ -1,7 +1,11 @@
-FROM biodatageeks/bdg-spark
+FROM ubuntu:xenial
 MAINTAINER biodatageeks <team@biodatageeks.ii.pw.edu.pl>
 
+RUN apt-get update && \
+    apt-get upgrade -y && \
+    apt-get install -y r-base libssl-dev libssh2-1-dev libxml2-dev libcurl4-openssl-dev libpq-dev
 
+RUN Rscript -e "install.packages('devtools', repos='http://cran.cnr.berkeley.edu'); devtools::install_github('hadley/testthat')"
 
-
+RUN Rscript -e "source('https://bioconductor.org/biocLite.R');biocLite('CODEX')"
 

From be88e856fc390c701bb7400816bae17fbba5d72f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Wiktor=20Ku=C5=9Bmirek?= <kusmirekwiktor@gmail.com>
Date: Tue, 20 Mar 2018 16:33:18 +0100
Subject: [PATCH 008/114] CANOES package

---
 R/CANOES/DESCRIPTION          |  26 ++
 R/CANOES/NAMESPACE            |   2 +
 R/CANOES/R/functions_CANOES.R | 685 ++++++++++++++++++++++++++++++++++
 R/CANOES/R/run_CANOES.R       |  32 ++
 4 files changed, 745 insertions(+)
 create mode 100644 R/CANOES/DESCRIPTION
 create mode 100644 R/CANOES/NAMESPACE
 create mode 100644 R/CANOES/R/functions_CANOES.R
 create mode 100644 R/CANOES/R/run_CANOES.R

diff --git a/R/CANOES/DESCRIPTION b/R/CANOES/DESCRIPTION
new file mode 100644
index 0000000..314f419
--- /dev/null
+++ b/R/CANOES/DESCRIPTION
@@ -0,0 +1,26 @@
+Package: CANOESCOV
+Title: CANOES Package With Interface To External Coverage File
+Version: 0.0.1
+Authors@R: c(
+    person("Tomasz", "Gambin", email = "tgambin@gmail.com", role = c("aut", "cre")),
+    person("Marek", "Wiewiórka", email = "marek.wiewiorka@gmail.com", role = c("aut")),
+    person("Wiktor", "Kuśmirek", email = "kusmirekwiktor@gmail.com", role = c("aut")))
+Description: An extended implementation of the CANOES package in R. It extends
+    the original implementation by using an external coverage file, which should
+    speed up calculations when running the application with multiple sets of
+    input parameters.
+Depends:
+    R (>= 3.2.3),
+    devtools (>= 1.13.2),
+    DBI (== 0.8),
+    optparse (== 1.4.4),
+    IRanges (>= 2.0.0),
+    plyr (>= 1.8.4),
+    nnls (>= 1.4.0),
+    Hmisc (>= 4.0.0),
+    mgcv (>= 1.8.0),
+    REFERENCE.SAMPLE.SET.SELECTOR (>= 0.0.1)
+License: GPL-3
+Encoding: UTF-8
+LazyData: true
+RoxygenNote: 6.0.1.9000
diff --git a/R/CANOES/NAMESPACE b/R/CANOES/NAMESPACE
new file mode 100644
index 0000000..884a631
--- /dev/null
+++ b/R/CANOES/NAMESPACE
@@ -0,0 +1,2 @@
+# Generated by roxygen2: fake comment so roxygen2 overwrites silently.
+exportPattern("^[^\\.]")
diff --git a/R/CANOES/R/functions_CANOES.R b/R/CANOES/R/functions_CANOES.R
new file mode 100644
index 0000000..b3077b4
--- /dev/null
+++ b/R/CANOES/R/functions_CANOES.R
@@ -0,0 +1,685 @@
+# Constants
+NUM.ABNORMAL.STATES=2  # presumably DELETION and DUPLICATION -- TODO confirm against the HMM helpers
+NUM.STATES=3  # presumably DELETION, NORMAL and DUPLICATION -- TODO confirm
+DELETION=1  # state index; GenotypeCNVs passes it to GetModifiedLikelihood
+NORMAL=2
+DUPLICATION=3  # state index; GenotypeCNVs passes it to GetModifiedLikelihood
+
+# PlotCNV
+#     Plots count data for targets of interest
+#     highlights sample of interest in red, 
+#     highlights area of interest with a black line
+#     highlights probe locations with black dots
+# Arguments:
+#   counts: 
+#     count matrix, with column "target" with target numbers 
+#     and sample data in columns 6:end
+#   sample.name:
+#     sample of interest (will be highlighted in red in figure)
+#     (should correspond to a column in counts)
+#   targets:
+#     targets of interest in the form start.target..end.target
+#   offset:
+#     number of targets to add on either end (default=1)
+# Returns: 
+#   returns nothing
+PlotCNV <- function(counts, sample.name, targets, offset=1){
+  sample.name <- as.character(sample.name)
+  if (!sample.name %in% names(counts)){stop("No column for sample ", sample.name, " in counts matrix")}
+  if (length(setdiff("target", names(counts)[1:5])) > 0){  # "target" must be one of the first five columns
+    stop("counts matrix must have column named target")
+  }
+  t <- as.character(targets)
+  start.target <- as.numeric(unlist(strsplit(t, "..", fixed=TRUE))[1])  # targets has the form "start..end"
+  end.target <- as.numeric(unlist(strsplit(t, "..", fixed=TRUE))[2])
+  if (!start.target %in% counts$target){
+    stop("no data for start.target in counts matrix")
+  }
+  if (!end.target %in% counts$target){
+    stop("no data for end.target in counts matrix")
+  }
+  if ((start.target - offset) %in% counts$target){  # widen the window only when the flanking target exists
+    start.target <- start.target - offset
+  }
+  if ((end.target + offset) %in% counts$target){
+    end.target <- end.target + offset
+  }
+  ref.sample.names <- setdiff(as.character(names(counts)[-seq(1,5)]),
+                              sample.name)  # every sample column except the sample of interest
+  data <- subset(counts, target >= start.target & target <= end.target)
+  sample.data <- data[, sample.name]
+  means <- apply(data[, ref.sample.names], 1, mean)
+  sd <- sqrt(apply(data[, ref.sample.names], 1, var))
+  refs.z.scores <- matrix(NA, nrow(data), length(ref.sample.names))
+  sample.z.score <- numeric(length = nrow(data))
+  for (i in seq(1, dim(data)[1])){
+    refs.z.scores[i, ] <- as.numeric((data[i, ref.sample.names] - means[i]) /
+                                       max(0.000001, sd[i]))  # floor the sd to avoid dividing by zero
+    sample.z.score[i] <- (sample.data[i] - means[i]) / max(0.000001, sd[i])
+  }
+  ylim <- max(abs(refs.z.scores), abs(sample.z.score))
+  plot(seq(-6, 6), seq(-6, 6),
+       xlim=c(data[1, "start"], data[dim(data)[1], "start"]),
+       ylim=c(-ylim - 0.1, ylim + 0.1), type="n", xlab="", ylab="Z-score")  # empty canvas; lines are added below
+  for (i in seq(1, length(ref.sample.names))){
+    lines(data[, "start"], refs.z.scores[, i], col="#2f4f4f85")
+  }
+  lines(data[, "start"], sample.z.score, col="red", lwd=3)
+  points(data[, "start"], rep(-ylim - 0.05, length(data[, "start"])), pch=20)
+  lines( c(data[1 + offset, "start"], data[nrow(data) - offset, "end"]) ,
+         c(ylim+0.2, ylim+0.2), lwd=2)
+  title(main=paste("Sample ", sample.name, ", ",
+                   data$chromosome[1], ":",  # first plotted row; a target id is not a row index into counts
+                   data$start[1], "-", data$end[nrow(data)], sep=""))
+}
+
+# CallCNVs
+#     Calls CNVs in sample of interest
+# Arguments:
+#   sample.name:
+#     sample to call CNVs in (should correspond to a column in counts)
+#   counts: 
+#     count matrix, first five columns should be 
+#       target: consecutive numbers for targets (integer)
+#       chromosome: chromosome number (integer-valued) 
+#         (support for sex chromosomes to come)
+#       start: start position of probe (integer)
+#       end: end position of probe (integer)
+#       gc: gc content (real between 0 and 1)
+#       subsequent columns should include counts for each probe for samples
+#   p:
+#     average rate of occurrence of CNVs (real) default is 1e-08
+#   D:
+#     expected distance between targets in a CNV (integer) default is 70,000
+#   Tnum:
+#     expected number of targets in a CNV (integer) default is 6
+#   numrefs
+#     maximum number of reference samples to use (integer) default is 30
+#     the weighted variance calculations will take a long time if too 
+#     many reference samples are used
+# Returns: 
+#   data frame with the following columns:
+#      SAMPLE: name of sample
+#      CNV: DEL or DUP
+#      INTERVAL: CNV coordinates in the form chr:start-stop
+#      KB: length of CNV in kilobases
+#      CHR: chromosome
+#      MID_BP: middle base pair of CNV
+#      TARGETS: target numbers of CNV in the form start..stop
+#      NUM_TARG: how many targets are in the CNV
+#      Q_SOME: a Phred-scaled quality score for the CNV
+CallCNVs <- function(sample.name, counts, p=1e-08, Tnum=6, D=70000, numrefs=30, get.dfs=FALSE, homdel.mean=0.2){
+  if (!sample.name %in% names(counts)){stop("No column for sample ", sample.name, " in counts matrix")}
+  if (length(setdiff(names(counts)[1:5], c("target", "chromosome", "start", "end", "gc"))) > 0){
+    stop("First five columns of counts matrix must be target, chromosome, start, end, gc")
+  }
+  if (length(setdiff(unique(counts$chromosome), seq(1:22))) > 0) {
+    # remove sex chromosomes
+    cat("Trying to remove sex chromosomes and 'chr' prefixes\n")
+    counts <- subset(counts, !chromosome %in% c("chrX", "chrY", "X", "Y"))
+    if (sum(grepl("chr", counts$chromosome))==length(counts$chromosome)){
+      counts$chromosome <- gsub("chr", "", counts$chromosome)
+    }
+    counts$chromosome <- as.numeric(counts$chromosome)
+    if (length(setdiff(unique(counts$chromosome), seq(1:22))) > 0)
+      stop("chromosome must take value in range 1-22 (support for sex chromosomes to come)")
+  }
+  library(plyr)
+  counts <- arrange(counts, chromosome, start)
+  if (p <= 0){
+    stop("parameter p must be positive")
+  }
+  if (Tnum <= 0){
+    stop("parameter Tnum must be positive")
+  }
+  if (D <= 0){
+    stop("parameter D must be positive")
+  }
+  if (numrefs <= 0){
+    stop("parameter numrefs must be positive")
+  }
+  sample.names <- colnames(counts)[-seq(1,5)]
+  # find mean coverage of probes
+  mean.counts <- mean(apply(counts[, sample.names], 2, mean))
+  # normalize counts; round so we can use negative binomial
+  counts[, sample.names] <- apply(counts[, sample.names], 2,
+        function(x, mean.counts)
+                 round(x * mean.counts / mean(x)), mean.counts)
+  # calculate correlation of read count across samples
+  cov <- cor(counts[, sample.names], counts[, sample.names])
+  reference.samples <- setdiff(sample.names, sample.name)
+  covariances <- cov[sample.name, reference.samples]
+  reference.samples <- names(sort(covariances,
+          decreasing=TRUE)[1:min(numrefs, length(covariances))])  # keep the numrefs most correlated samples
+  sample.mean.counts <- mean(counts[, sample.name])
+  sample.sumcounts <- apply(counts[, reference.samples], 2, sum)
+  # normalize reference samples to sample of interest
+  counts[, reference.samples] <- apply(counts[, reference.samples], 2,
+        function(x, sample.mean.counts)
+                round(x * sample.mean.counts /
+                mean(x)), sample.mean.counts)
+  # select reference samples and weightings using non-negative least squares
+  b <- counts[, sample.name]
+  A <- as.matrix(counts[, reference.samples])
+  library(nnls)
+  # the unused full-matrix nnls fit was dropped; only the subsampled fits below feed the weights
+  est <- matrix(0, nrow=50, ncol=length(reference.samples))
+  set.seed(1)  # fixed seed so the row subsamples, and hence the weights, are reproducible
+  for (i in 1:50){
+    d <- sample(nrow(A), min(500, nrow(A)))
+    est[i, ] <- nnls(A[d, ], b[d])$x
+  }
+  weights <- colMeans(est)
+  sample.weights <- weights / sum(weights)
+  library(Hmisc)
+  # calculate weighted mean of read count
+  # this is used to calculate emission probabilities
+  counts$mean <- apply(counts[, reference.samples],
+                       1, wtd.mean, sample.weights)
+  targets <- counts$target
+  # exclude probes with all zero counts
+  nonzero.rows <- counts$mean > 0
+  nonzero.rows.df <- data.frame(target=counts$target,
+                                nonzero.rows=nonzero.rows)
+
+  counts <- counts[nonzero.rows, ]
+  # get the distances between consecutive probes
+  distances <- GetDistances(counts)
+  # estimate the read count variance at each probe
+  var.estimate <- EstimateVariance(counts, reference.samples,
+                                               sample.weights)
+  emission.probs <- EmissionProbs(counts[, sample.name],
+                        counts$mean, var.estimate$var.estimate,
+                        counts[, "target"])
+  if (get.dfs){  # internal mode used by GenotypeCNVs: return the HMM inputs without calling CNVs
+    return(list(emission.probs=emission.probs, distances=distances))
+  }
+  # call CNVs with the Viterbi algorithm
+  viterbi.state <- Viterbi(emission.probs, distances, p, Tnum, D)
+  # format the CNVs
+  cnvs <- PrintCNVs(sample.name, viterbi.state,
+                         counts)
+  # if there aren't too many CNVs, calculate the Q_SOME
+  if (nrow(cnvs) > 0 && nrow(cnvs) <= 50){
+    qualities <- GenotypeCNVs(cnvs, sample.name, counts, p, Tnum, D, numrefs,
+                          emission.probs=emission.probs,
+                          distances=distances)
+    for (i in seq_len(nrow(cnvs))){
+      cnvs$Q_SOME[i] <- ifelse(cnvs$CNV[i]=="DEL", qualities[i, "SQDel"],
+                               qualities[i, "SQDup"])
+    }
+  }
+  data <- as.data.frame(cbind(counts$target, counts$mean, var.estimate$var.estimate, counts[, sample.name]))
+  names(data) <- c("target", "countsmean", "varestimate", "sample")
+  if (nrow(cnvs) > 0){
+    cnvs <- CalcCopyNumber(data, cnvs, homdel.mean)  # homdel.mean is only forwarded to CalcCopyNumber
+  }
+  return(cnvs)
+}
+
+# GenotypeCNVs
+#     Genotype CNVs in sample of interest
+# Arguments:
+#   xcnvs
+#     data frame with the following columns, and one row for each
+#     CNV to genotype
+#      INTERVAL: CNV coordinates in the form chr:start-stop
+#      TARGETS: target numbers of CNV in the form start..stop
+#               these should correspond to the target numbers in counts
+#   sample.name:
+#     sample to genotype CNVs in (should correspond to a column in counts)
+#   counts: 
+#     count matrix, first five columns should be 
+#       target: consecutive numbers for targets (integer)
+#       chromosome: chromosome number (integer-valued) 
+#         (support for sex chromosomes to come)
+#       start: start position of probe (integer)
+#       end: end position of probe (integer)
+#       gc: gc content (real between 0 and 1)
+#       subsequent columns should include counts for each probe for samples
+#   p:
+#     average rate of occurrence of CNVs (real) default is 1e-08
+#   D:
+#     expected distance between targets in a CNV (integer) default is 70,000
+#   Tnum:
+#     expected number of targets in a CNV (integer) default is 6
+#   numrefs
+#     maximum number of reference samples to use (integer) default is 30
+#     the weighted variance calculations will take a long time if too 
+#     many reference samples are used
+#   emission.probs and distances are for internal use only
+# Returns: 
+#   data frame with the following columns and one row for each genotyped CNV:
+#      INTERVAL: CNV coordinates in the form chr:start-stop
+#      NQDel: a Phred-scaled quality score that sample.name has no deletion
+#             in the interval
+#      SQDel: a Phred-scaled quality score that sample.name has a deletion
+#             in the interval
+#      NQDup and SQDup: same, but for a duplication
+GenotypeCNVs <- function(xcnvs, sample.name, counts, p=1e-08, Tnum=6,
+                    D=70000, numrefs=30,
+                    emission.probs=NULL,
+                    distances=NULL){
+  if (!sample.name %in% names(counts)){stop("No column for sample ", sample.name, " in counts matrix")}
+  if (length(setdiff(names(counts)[1:5], c("target", "chromosome", "start", "end", "gc"))) > 0){
+    stop("First five columns of counts matrix must be target, chromosome, start, end, gc")
+  }
+  if (length(setdiff(unique(counts$chromosome), seq(1:22))) > 0) {
+    # remove sex chromosomes
+    cat("Trying to remove sex chromosomes and 'chr' prefixes\n")
+    counts <- subset(counts, !chromosome %in% c("chrX", "chrY", "X", "Y"))
+    if (sum(grepl("chr", counts$chromosome))==length(counts$chromosome)){
+      counts$chromosome <- gsub("chr", "", counts$chromosome)
+    }
+    counts$chromosome <- as.numeric(counts$chromosome)
+    if (length(setdiff(unique(counts$chromosome), seq(1:22))) > 0)
+      stop("chromosome must take value in range 1-22 (support for sex chromosomes to come)")
+  }
+  library(plyr)
+  counts <- arrange(counts, chromosome, start)
+  if (p <= 0){
+    stop("parameter p must be positive")
+  }
+  if (Tnum <= 0){
+    stop("parameter Tnum must be positive")
+  }
+  if (D <= 0){
+    stop("parameter D must be positive")
+  }
+  if (numrefs <= 0){
+    stop("parameter numrefs must be positive")
+  }
+  num.cnvs <- nrow(xcnvs)
+  cnv.intervals <- as.character(xcnvs$INTERVAL)
+  # if no emission probs matrix is passed in, generate a new one
+  if (is.null(emission.probs)){
+    l <- CallCNVs(sample.name, counts, p, Tnum=Tnum, D=D, numrefs=numrefs, get.dfs=TRUE)  # honour the caller's settings
+    emission.probs <- l[['emission.probs']]
+    distances <- l[['distances']]
+  }
+  forward.m <- GetForwardMatrix(emission.probs, distances, p, Tnum, D)
+  backward.m <- GetBackwardMatrix(emission.probs, distances, p, Tnum, D)
+  qualities <- matrix(0, nrow=num.cnvs, ncol=5,
+                      dimnames=list(cnv.intervals,
+                                    c("INTERVAL", "NQDel", "SQDel", "NQDup", "SQDup")))
+  for (i in seq_len(num.cnvs)){  # seq_len: no iterations when xcnvs has no rows
+    interval <- as.character(xcnvs[i, "INTERVAL"])
+    targets <- as.numeric(strsplit(as.character(xcnvs[i, "TARGETS"]), ".", fixed=TRUE)[[1]][c(1,3)])  # "a..b" splits into a, "", b
+    left.target <- targets[1]
+    right.target <- targets[2]
+    likelihoods <- GetModifiedLikelihood(forward.m, backward.m,
+                                         emission.probs, distances,
+                                         left.target, right.target,
+                                         c(DUPLICATION, DELETION), p, Tnum, D)
+    modified.likelihood <- likelihoods[1];
+    unmodified.likelihood <- likelihoods[2]
+    Prob.All.Normal <- exp(modified.likelihood - unmodified.likelihood)
+    likelihoods <- GetModifiedLikelihood(forward.m, backward.m,
+                                         emission.probs, distances,
+                                         left.target, right.target, DELETION, p, Tnum, D)
+    modified.likelihood <- likelihoods[1];
+    unmodified.likelihood <- likelihoods[2]
+    Prob.No.Deletion <- exp(modified.likelihood - unmodified.likelihood)
+    likelihoods <- GetModifiedLikelihood(forward.m, backward.m,
+                                         emission.probs, distances,
+                                         left.target, right.target, DUPLICATION, p, Tnum, D)
+    modified.likelihood <- likelihoods[1];
+    unmodified.likelihood <- likelihoods[2]
+    Prob.No.Duplication <- exp(modified.likelihood - unmodified.likelihood)
+    # Check if probabilities greater than 1 are numerical error or bug
+    Phred <- function(prob){
+      return(round(min(99, -10 * log10(1 - prob))))  # Phred scale, capped at 99
+    }
+    qualities[i, "NQDel"] <- Phred(Prob.No.Deletion)
+    qualities[i, "SQDel"] <- Phred(Prob.No.Duplication - Prob.All.Normal)
+    qualities[i, "NQDup"] <- Phred(Prob.No.Duplication)
+    qualities[i, "SQDup"] <- Phred(Prob.No.Deletion - Prob.All.Normal)
+    qualities[i, "INTERVAL"] <- interval  # turns the whole matrix into character; scores are converted back below
+  }
+  qualities <- as.data.frame(qualities, stringsAsFactors=FALSE)
+  qualities$NQDel <- as.integer(qualities$NQDel)
+  qualities$NQDup <- as.integer(qualities$NQDup)
+  qualities$SQDel <- as.integer(qualities$SQDel)
+  qualities$SQDup <- as.integer(qualities$SQDup)
+  return(qualities)
+}
+
+# Returns a data frame (columns target, distance) with the distance from the
+# previous target to each target: 0 for the very first target, and a very
+# big number for the first target of every later chromosome, which resets
+# the HMM at each chromosome boundary. Rows are assumed to be ordered by
+# chromosome, then start.
+GetDistances <- function(counts){
+  # diff() returns numeric(0) for a single target, so a one-row input gives
+  # the single distance 0 instead of a length mismatch
+  # a change of chromosome adds a huge offset to the base-pair difference
+  chromosome.jump <- 1000000000000 * diff(counts[, "chromosome"])
+  base.jump <- diff(counts[, "start"])
+  distances <- c(0, base.jump + chromosome.jump)
+  return(data.frame(target=counts[, "target"], distance=distances))
+}
+
+# Per-target variance estimate: the largest of a GAM fit (on mean and gc),
+# the weighted sample variance and 1.01 x the mean (just above Poisson).
+EstimateVariance <- function(counts, ref.sample.names, sample.weights){
+  library(Hmisc)
+  ref.counts <- counts[, ref.sample.names]
+  counts$var <- apply(ref.counts, 1, wtd.var, sample.weights, normwt=TRUE)
+  set.seed(1)
+  fit.rows <- counts[sample(nrow(counts), min(36000, nrow(counts))), ]
+  library(mgcv)
+  # the Gamma family needs strictly positive responses, so nudge zeros up
+  fit.rows$var[fit.rows$var == 0] <- 0.1
+  fit <- gam(var ~ s(mean) + s(gc), family=Gamma(link=log), data=fit.rows)
+  fitted.var <- predict(fit, counts, type="response")
+  data.frame(target=counts$target,
+             var.estimate=pmax(fitted.var, counts$var, counts$mean * 1.01))
+}
+
+# Computes log emission probabilities of the observed read counts under the
+# deletion, normal and duplication states (negative binomial model).
+#   test.counts   read counts of the test sample, one per target
+#   target.means  expected read count of each target in the normal state
+#   var.estimate  estimated read-count variance of each target
+#   targets       target identifiers, stored in the "target" column
+# Returns a matrix with columns target, delprob, normalprob, dupprob.
+EmissionProbs <- function(test.counts, target.means,
+                                      var.estimate, targets){
+  num.targets <- length(test.counts)
+  prob.cols <- c("delprob", "normalprob", "dupprob")
+  # expected counts for the deletion, normal and duplication states
+  state.target.means <- cbind(target.means * 1/2, target.means,
+                              target.means * 3/2)
+  # negative binomial size implied by the predicted variance, scaled per state
+  size <- target.means ^ 2 / (var.estimate - target.means)
+  state.sizes <- cbind(size / 2, size, size * 3 / 2)
+  emission.probs <- matrix(NA, num.targets, 4)
+  colnames(emission.probs) <- c("target", prob.cols)
+  emission.probs[, "target"] <- targets
+  for (s in seq_along(prob.cols)){
+    emission.probs[, prob.cols[s]] <- dnbinom(
+      test.counts, mu=state.target.means[, s],
+      size=state.sizes[, s], log=TRUE)
+  }
+  # An extreme read count can make every state's log probability infinite.
+  # Only the three probability columns are inspected: the target column is
+  # always finite, so including it would hide every such row.
+  row.all.inf <- which(apply(emission.probs[, prob.cols, drop=FALSE], 1,
+                             function(x){all(is.infinite(x))}))
+  # assign such rows to a state by comparing the count with the state means
+  for (i in row.all.inf){
+    if (test.counts[i] >= state.target.means[i, 3]){
+      emission.probs[i, 2:4] <- c(-Inf, -Inf, -0.01)
+    } else if (test.counts[i] <= state.target.means[i, 1]){
+      emission.probs[i, 2:4] <- c(-0.01, -Inf, -Inf)
+    } else {
+      emission.probs[i, 2:4] <- c(-Inf, -0.01, -Inf)
+    }
+  }
+  return(emission.probs)
+}
+
+# Viterbi algorithm: returns a data frame (target, viterbi.state) with the
+# most likely state of every target. Column 1 of emission.probs.matrix holds
+# the target ids, columns 2:4 the log emission probabilities of the states.
+Viterbi <- function(emission.probs.matrix, distances, p, Tnum, D){
+  targets <- emission.probs.matrix[, 1]
+  # drop=FALSE keeps a one-target input as a 1 x 3 matrix
+  emission.probs.matrix <- as.matrix(emission.probs.matrix[, 2:4, drop=FALSE])
+  num.exons <- nrow(emission.probs.matrix)
+  viterbi.matrix <- matrix(NA, nrow=num.exons, ncol=NUM.STATES)
+  viterbi.pointers <- matrix(NA, nrow=num.exons, ncol=NUM.STATES)
+  initial.state <- log(c(0.0075 / NUM.ABNORMAL.STATES, 1 - 0.0075, 0.0075 / NUM.ABNORMAL.STATES))
+  viterbi.matrix[1, ] <- initial.state + emission.probs.matrix[1, ]
+  for (i in seq_len(num.exons)[-1]) {  # empty when there is one target
+    temp.matrix <- viterbi.matrix[i - 1, ] + GetTransitionMatrix(distances$distance[i], p, Tnum, D)
+    viterbi.matrix[i, ] <- apply(temp.matrix, 2, max) + emission.probs.matrix[i, ]
+    viterbi.pointers[i, ] <- apply(temp.matrix, 2, which.max)
+  }
+  viterbi.states <- vector(length = num.exons)
+  viterbi.states[num.exons] <- which.max(viterbi.matrix[num.exons, ])
+  for (i in rev(seq_len(num.exons - 1))) {
+    viterbi.states[i] <- viterbi.pointers[i + 1, viterbi.states[i + 1]]
+  }
+  return(data.frame(target=targets, viterbi.state=viterbi.states))
+}
+
+# Returns the log-space transition matrix for a target that lies `distance`
+# bases after the previous one. Rows are the state we come from, columns
+# the state we go to, both in the order deletion, normal, duplication.
+#   p     probability of moving from normal to either abnormal state
+#   Tnum  1 / Tnum is the abnormal -> normal probability at distance 0
+#   D     scale of the exponential decay exp(-distance / D)
+GetTransitionMatrix <- function(distance, p, Tnum, D){
+  decay <- exp(-distance/D)
+  leave.rate <- 1 / Tnum
+  # leaving an abnormal state: stay, return to normal, or flip to the
+  # other abnormal state
+  stay.abnormal <- decay * (1 - leave.rate) + (1 - decay) * p
+  back.to.normal <- decay * leave.rate + (1 - decay) * (1 - 2 * p)
+  flip.abnormal <- (1 - decay) * p
+  from.deletion <- c(stay.abnormal, back.to.normal, flip.abnormal)
+  from.duplication <- c(flip.abnormal, back.to.normal, stay.abnormal)
+  # leaving the normal state does not depend on the distance
+  from.normal <- c(p, 1 - 2 * p, p)
+  log(matrix(c(from.deletion, from.normal, from.duplication),
+             NUM.STATES, NUM.STATES, byrow=TRUE))
+}
+
+# adds log-space probabilities a and b: log(exp(a) + exp(b)) =
+# max(a, b) + log1p(exp(-|a - b|)); an infinite input returns the other
+AddTwoProbabilities <- function(x, y){
+  if (is.infinite(x)) return (y)
+  if (is.infinite(y)) return (x)
+  return(max(x, y) + log1p(exp(-abs(x - y))))
+}
+
+# adds multiple log-space probabilities (a single value is returned as is)
+SumProbabilities <- function(x){
+  sum.probs <- x[1]
+  for (i in seq_along(x)[-1]){
+    sum.probs <- AddTwoProbabilities(sum.probs, x[i])
+  }
+  return(sum.probs)
+}
+
+# data log-likelihood at target x: log-space sum over states of forward +
+# backward log probabilities (should not depend on which target is used)
+GetLikelihood <- function(forward.matrix, backward.matrix, x){
+  joint.by.state <- forward.matrix[x, ] + backward.matrix[x, ]
+  SumProbabilities(joint.by.state)
+}
+
+# get the forward probabilities (one row per target, one column per state)
+GetForwardMatrix <- function(emission.probs.matrix, distances, p, Tnum, D){
+  # drop=FALSE keeps a one-target input as a 1 x 3 matrix
+  emission.probs.matrix <- as.matrix(emission.probs.matrix[, 2:4, drop=FALSE])
+  num.exons <- nrow(emission.probs.matrix)
+  forward.matrix <- matrix(NA, nrow=num.exons, ncol=NUM.STATES)
+  initial.state <- log(c(0.0075 / NUM.ABNORMAL.STATES, 1 - 0.0075, 0.0075 / NUM.ABNORMAL.STATES))
+  forward.matrix[1, ] <- initial.state + emission.probs.matrix[1, ]
+  for (i in seq_len(num.exons)[-1]){  # no iterations for a single target
+    # temp.matrix[j, k]: log probability that we were in state j and are
+    # now in state k (ignoring the emission of the current target)
+    temp.matrix <- forward.matrix[i - 1, ] + GetTransitionMatrix(distances$distance[i], p, Tnum, D)
+    # sum over the previous state, then add the current emission
+    forward.matrix[i, ] <- apply(temp.matrix, 2, SumProbabilities) + emission.probs.matrix[i, ]
+  }
+  return(forward.matrix)
+}
+
+# get the backward probabilities (one row per target, one column per state)
+GetBackwardMatrix <- function(emission.probs.matrix, distances,
+                                  p, Tnum, D){
+  # drop=FALSE keeps a one-target input as a 1 x 3 matrix
+  emission.probs.matrix <- as.matrix(emission.probs.matrix[, 2:4, drop=FALSE])
+  num.exons <- nrow(emission.probs.matrix)
+  backward.matrix <- matrix(NA, nrow=num.exons, ncol=NUM.STATES)
+  backward.matrix[num.exons, ] <- rep(0, NUM.STATES)
+  for (i in rev(seq_len(num.exons - 1))){  # empty for a single target
+    # temp.matrix[j, k]: move from state j to k, emit at i + 1, then the rest
+    temp.matrix <- GetTransitionMatrix(distances$distance[i+1], p, Tnum, D) +
+      matrix(backward.matrix[i + 1, ], 3, 3, byrow=TRUE) +
+      matrix(emission.probs.matrix[i+1, ], 3, 3, byrow=TRUE)
+    backward.matrix[i, ] <- apply(temp.matrix, 1, SumProbabilities)
+  }
+  return(backward.matrix)
+}
+
+# Returns c(modified, unmodified) log-likelihoods of the data, the modified one
+# ruling out disallowed.states at every target from start.target to end.target.
+GetModifiedLikelihood <- function(forward.matrix, backward.matrix, emission.probs.matrix, distances, 
+                                      start.target, end.target, disallowed.states, p, Tnum, D){
+  targets <- emission.probs.matrix[, 1]
+  emission.probs.matrix <- as.matrix(emission.probs.matrix[, 2:4])
+  # there may be missing targets in this sample, we genotype the largest stretch of
+  # targets that lie in the CNV (left.target/right.target are row positions)
+  left.target <- min(which(targets >= start.target))
+  right.target <- max(which(targets <= end.target))
+  num.exons <- dim(emission.probs.matrix)[1]
+  unmodified.likelihood <- GetLikelihood(forward.matrix, 
+                                             backward.matrix, min(right.target + 1, num.exons))
+  # right.target or left.target may be empty
+  # (min()/max() of an empty which() give Inf/-Inf with a warning)
+  #if (right.target >= left.target) return(c(NA, unmodified.likelihood))
+  stopifnot(right.target >= left.target)
+  modified.emission.probs.matrix <- emission.probs.matrix
+  modified.emission.probs.matrix[left.target:right.target, 
+                                 disallowed.states] <- -Inf
+  # (-Inf is log 0: those states get zero emission probability in the region)
+  # if the start target is the first target we need to recalculate the
+  # forward probabilities
+  # for that target, using the modified emission probabilities
+  if (left.target == 1){
+    initial.state <- log(c(0.0075 / NUM.ABNORMAL.STATES, 1 - 0.0075, 0.0075 / NUM.ABNORMAL.STATES))
+    forward.matrix[1, ] <- initial.state + modified.emission.probs.matrix[1, ]
+    left.target <- left.target + 1
+  } 
+  for (i in seq(left.target, min(right.target + 1, num.exons))){
+    # compute matrix with probability we were in state j and are now in state k
+    # in temp.matrix[j, k] (ignoring emission of current token)
+    temp.matrix <- forward.matrix[i - 1, ] + GetTransitionMatrix(distances$distance[i], p, Tnum, D)
+    # find the probability that we are in each of the three states
+    sum.probs <- apply(temp.matrix, 2, SumProbabilities) 
+    if (!i == (right.target + 1)){  # i.e. i != right.target + 1: inside the region
+      forward.matrix[i, ] <- sum.probs + modified.emission.probs.matrix[i, ]
+    } else{  # first target after the region keeps its unmodified emissions
+      forward.matrix[i, ] <- sum.probs + emission.probs.matrix[i, ]
+    }
+  }  
+  # find the modified likelihood at the same target as the unmodified one
+  modified.likelihood <- GetLikelihood(forward.matrix, backward.matrix, min(right.target + 1, num.exons))
+  return(c(modified.likelihood, unmodified.likelihood))
+}
+
+# one-row summary of a single CNV; cnv.targets$target holds the row indices
+# (into counts) of its targets, state 3 means "DUP" and anything else "DEL"
+SummarizeCNVs <- function(cnv.targets, counts, sample.name, state){
+  first <- min(cnv.targets$target)
+  last <- max(cnv.targets$target)
+  chrom <- counts[first, "chromosome"]
+  start.base <- counts[first, "start"]
+  end.base <- counts[last, "end"]
+  first.id <- counts[first, "target"]
+  last.id <- counts[last, "target"]
+  span <- end.base - start.base
+  data.frame(sample.name=sample.name,
+             cnv.type=ifelse(state==3, "DUP", "DEL"),
+             cnv.interval=paste0(chrom, ":", start.base, "-", end.base),
+             cnv.kbs=span / 1000, cnv.chromosome=chrom,
+             cnv.midbp=round(span / 2) + start.base,
+             cnv.targets=paste0(first.id, "..", last.id),
+             num.targets=last.id - first.id + 1)
+}
+
+# Collects the CNV calls of one sample from its Viterbi path.
+#   test.sample.name  name reported in the SAMPLE column
+#   viterbi.state     data frame with a viterbi.state column (1 = deletion,
+#                     3 = duplication), one row per target
+#   nonzero.counts    per-target data passed on to SummarizeCNVs
+# Runs of consecutive targets in the same abnormal state form one call.
+# Prints the number of calls and returns a data frame with columns SAMPLE,
+# CNV, INTERVAL, KB, CHR, MID_BP, TARGETS, NUM_TARG, MLCN (set to 0) and
+# Q_SOME (set to NA); it has zero rows when nothing was called.
+PrintCNVs <- function(test.sample.name, viterbi.state,
+                      nonzero.counts){
+  # labels each run of consecutive integers with its own group number
+  consecutiveGroups <- function(sequence){
+    return(cumsum(c(1, diff(sequence) != 1)))
+  }
+  deletions.df <- NULL
+  duplications.df <- NULL
+  for (state in c(1, 3)){
+    cnv.targets <- which(viterbi.state$viterbi.state == state)
+    if (length(cnv.targets) > 0){
+      groups <- consecutiveGroups(cnv.targets)
+      library(plyr)
+      cnvs.temp.df <- ddply(data.frame(target=cnv.targets, group=groups),
+                            "group", SummarizeCNVs, nonzero.counts, test.sample.name,
+                            state)
+      if (state == 1){
+        deletions.df <- cnvs.temp.df
+      } else {
+        duplications.df <- cnvs.temp.df
+      }
+    }
+  }
+  # nrow() of a table that was never built is NULL, which counts as zero
+  num.deletions <- max(0, nrow(deletions.df))
+  num.duplications <- max(0, nrow(duplications.df))
+  num.calls <- num.deletions + num.duplications
+  cat(num.calls, "CNVs called in sample", test.sample.name, "\n")
+  if (num.calls == 0){
+    df <- data.frame(SAMPLE=character(0), CNV=character(0), INTERVAL=character(0),
+                     KB=numeric(0), CHR=character(0),
+                     MID_BP=numeric(), TARGETS=character(0), NUM_TARG=numeric(0), Q_SOME=numeric(0), MLCN=numeric(0))
+    return(df)
+  }
+  # plain if/else on scalars instead of ifelse() with assignments inside
+  if (num.deletions > 0 && num.duplications > 0){
+    cnvs.df <- rbind(deletions.df, duplications.df)
+  } else if (num.deletions > 0){
+    cnvs.df <- deletions.df
+  } else {
+    cnvs.df <- duplications.df
+  }
+  # the constant 0 bound on as the last column is named MLCN below
+  xcnv <- cbind(cnvs.df[, c("sample.name", "cnv.type", "cnv.interval",
+                      "cnv.kbs", "cnv.chromosome", "cnv.midbp",
+                      "cnv.targets", "num.targets")], 0)
+  colnames(xcnv) <- c("SAMPLE", "CNV", "INTERVAL", "KB", "CHR", "MID_BP", "TARGETS",
+                      "NUM_TARG", "MLCN")
+  xcnv$Q_SOME <- NA
+  return(xcnv)
+}
+
+# Sets cnvs$MLCN to the most likely copy number (0-6) of each CNV, using the
+# target, countsmean, varestimate and sample columns of data; homdel.mean is
+# the Poisson mean at copy number 0. A result of 2 becomes 1 for "DEL" calls
+# and 3 otherwise. Returns cnvs (unchanged when it has no rows).
+CalcCopyNumber <- function(data, cnvs, homdel.mean){
+  for (i in seq_len(nrow(cnvs))){  # seq_len: no iterations for zero rows
+    cnv <- cnvs[i, ]
+    targets <- as.numeric(unlist(strsplit(as.character(cnv$TARGETS), "..", fixed=TRUE)))
+    cnv.data <- subset(data, target >= targets[1] & target <= targets[2])
+    # expected counts for copy numbers 1..6 (multiples of half the mean)
+    state.target.means <- outer(cnv.data$countsmean, 1:6) / 2
+    size <- cnv.data$countsmean ^ 2 / (cnv.data$varestimate - cnv.data$countsmean)
+    emission.probs <- matrix(NA, nrow(cnv.data), 7)
+    colnames(emission.probs) <- c("C0", "C1", "C2", "C3", "C4", "C5", "C6")
+    # copy number 0 is modelled as Poisson, the others as negative binomial
+    emission.probs[, 1] <- dpois(cnv.data$sample, homdel.mean, log=TRUE)
+    for (s in 1:6){
+      emission.probs[, s+1] <- dnbinom(cnv.data$sample, mu=state.target.means[, s],
+                                       size=size * s/2, log=TRUE)
+    }
+    # sum the log probabilities over the targets and pick the best state
+    ml.state <- which.max(colSums(emission.probs)) - 1
+    if (ml.state==2){
+      ml.state <- ifelse(cnv$CNV=="DEL", 1, 3)
+    }
+    cnvs$MLCN[i] <- ml.state
+  }
+  return(cnvs)
+}
+
diff --git a/R/CANOES/R/run_CANOES.R b/R/CANOES/R/run_CANOES.R
new file mode 100644
index 0000000..65ddb36
--- /dev/null
+++ b/R/CANOES/R/run_CANOES.R
@@ -0,0 +1,32 @@
+# Demo: reads gc.txt and canoes.reads.txt, calls CNVs in samples S1..S26,
+# plots every call to CNVplots.pdf and genotypes all calls in sample S2.
+Test <- function(){
+  # read in the data
+  gc <- read.table("gc.txt")$V2
+  canoes.reads <- read.table("canoes.reads.txt")
+  # rename the columns of canoes.reads
+  sample.names <- paste0("S", 1:26)
+  names(canoes.reads) <- c("chromosome", "start", "end", sample.names)
+  # add consecutive target ids and the gc values as leading columns
+  target <- seq_len(nrow(canoes.reads))
+  canoes.reads <- cbind(target, gc, canoes.reads)
+  # call CNVs in each sample, one list element per sample
+  xcnv.list <- vector('list', length(sample.names))
+  for (i in seq_along(sample.names)){
+    xcnv.list[[i]] <- CallCNVs(sample.names[i], canoes.reads)
+  }
+  # combine the results into one data frame
+  xcnvs <- do.call('rbind', xcnv.list)
+  # inspect the first two CNV calls
+  print(head(xcnvs, 2))
+  # plot all the CNV calls to a pdf
+  pdf("CNVplots.pdf")
+  for (i in seq_len(nrow(xcnvs))){  # no plots when nothing was called
+     PlotCNV(canoes.reads, xcnvs[i, "SAMPLE"], xcnvs[i, "TARGETS"])
+  }
+  dev.off()
+  # genotype all the CNVs calls made above in sample S2
+  genotyping.S2 <- GenotypeCNVs(xcnvs, "S2", canoes.reads)
+  # inspect the genotype scores for the first two CNV calls
+  print(head(genotyping.S2, 2))
+}

From 0edbf3686e6dd7a011b1896c1b0c5846f6294b72 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Wiktor=20Ku=C5=9Bmirek?= <kusmirekwiktor@gmail.com>
Date: Tue, 20 Mar 2018 16:36:44 +0100
Subject: [PATCH 009/114] CANOES package in Jenkinsfile

---
 Jenkinsfile          |  1 +
 R/CANOES/DESCRIPTION | 14 +++-----------
 2 files changed, 4 insertions(+), 11 deletions(-)

diff --git a/Jenkinsfile b/Jenkinsfile
index 4c88e48..dca1340 100644
--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -29,6 +29,7 @@ pipeline {
                                  sh "cd R && R CMD build CODEXCOV/ && curl -v --user ${NEXUS_USER}:${NEXUS_PASS} --upload-file CODEXCOV_0.0.1.tar.gz http://zsibio.ii.pw.edu.pl/nexus/repository/r-zsibio/src/contrib/CODEXCOV_0.0.1.tar.gz"
                                  sh "cd R && R CMD build EXOMEDEPTHCOV/ && curl -v --user ${NEXUS_USER}:${NEXUS_PASS} --upload-file EXOMEDEPTHCOV_0.0.1.tar.gz http://zsibio.ii.pw.edu.pl/nexus/repository/r-zsibio/src/contrib/EXOMEDEPTHCOV_0.0.1.tar.gz"
                                  sh "cd R && R CMD build CANOESCOV/ && curl -v --user ${NEXUS_USER}:${NEXUS_PASS} --upload-file CANOESCOV_0.0.1.tar.gz http://zsibio.ii.pw.edu.pl/nexus/repository/r-zsibio/src/contrib/CANOESCOV_0.0.1.tar.gz"
+                                 sh "cd R && R CMD build CANOES/ && curl -v --user ${NEXUS_USER}:${NEXUS_PASS} --upload-file CANOES_0.0.1.tar.gz http://zsibio.ii.pw.edu.pl/nexus/repository/r-zsibio/src/contrib/CANOES_0.0.1.tar.gz"
                                  sh "cd R && R CMD build CNVCALLER.RUNNER/ && curl -v --user ${NEXUS_USER}:${NEXUS_PASS} --upload-file CNVCALLER.RUNNER_0.0.1.tar.gz http://zsibio.ii.pw.edu.pl/nexus/repository/r-zsibio/src/contrib/CNVCALLER.RUNNER_0.0.1.tar.gz"
                                  sh "cd R && R CMD build CNVCALLER.EVALUATOR/ && curl -v --user ${NEXUS_USER}:${NEXUS_PASS} --upload-file CNVCALLER.EVALUATOR_0.0.1.tar.gz http://zsibio.ii.pw.edu.pl/nexus/repository/r-zsibio/src/contrib/CNVCALLER.EVALUATOR_0.0.1.tar.gz"
                              }
diff --git a/R/CANOES/DESCRIPTION b/R/CANOES/DESCRIPTION
index 314f419..7f4bc54 100644
--- a/R/CANOES/DESCRIPTION
+++ b/R/CANOES/DESCRIPTION
@@ -1,25 +1,17 @@
 Package: CANOESCOV
-Title: CANOES Package With Interface To External Coverage File
+Title: CANOES Package
 Version: 0.0.1
 Authors@R: c(
     person("Tomasz", "Gambin", email = "tgambin@gmail.com", role = c("aut", "cre")),
     person("Marek", "Wiewiórka", email = "marek.wiewiorka@gmail.com", role = c("aut")),
     person("Wiktor", "Kuśmirek", email = "kusmirekwiktor@gmail.com", role = c("aut")))
-Description: An extended implementation of the CANOES package in R. It extends 
-    original implementation by using external coverage file, which should
-    speed up calculations for running application with multiple sets of input
-    parameters.
+Description: An implementation of the CANOES package in R.
 Depends:
     R (>= 3.2.3),
-    devtools (>= 1.13.2),
-    DBI (== 0.8),
-    optparse (== 1.4.4),
-    IRanges (>= 2.0.0),
     plyr (>= 1.8.4),
     nnls (>= 1.4.0),
     Hmisc (>= 4.0.0),
-    mgcv (>= 1.8.0),
-    REFERENCE.SAMPLE.SET.SELECTOR (>= 0.0.1)
+    mgcv (>= 1.8.0)
 License: GPL-3
 Encoding: UTF-8
 LazyData: true

From 28e60cdf6dbf047ccb86a26ff31e906287b2071d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Wiktor=20Ku=C5=9Bmirek?= <kusmirekwiktor@gmail.com>
Date: Tue, 20 Mar 2018 17:10:40 +0100
Subject: [PATCH 010/114] bugfix

---
 R/CANOES/DESCRIPTION | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/R/CANOES/DESCRIPTION b/R/CANOES/DESCRIPTION
index 7f4bc54..0824435 100644
--- a/R/CANOES/DESCRIPTION
+++ b/R/CANOES/DESCRIPTION
@@ -1,4 +1,4 @@
-Package: CANOESCOV
+Package: CANOES
 Title: CANOES Package
 Version: 0.0.1
 Authors@R: c(

From f3bb5c97bcba8dc057bd350932c9c1c9f480e387 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Wiktor=20Ku=C5=9Bmirek?= <kusmirekwiktor@gmail.com>
Date: Tue, 20 Mar 2018 17:33:29 +0100
Subject: [PATCH 011/114] Canoes added to docker container

---
 Docker/cnv-opt-canoes/Dockerfile | 1 +
 1 file changed, 1 insertion(+)

diff --git a/Docker/cnv-opt-canoes/Dockerfile b/Docker/cnv-opt-canoes/Dockerfile
index 6667651..b404cab 100644
--- a/Docker/cnv-opt-canoes/Dockerfile
+++ b/Docker/cnv-opt-canoes/Dockerfile
@@ -15,3 +15,4 @@ RUN Rscript -e "install.packages('nnls', repos = 'http://cran.us.r-project.org')
 RUN Rscript -e "install.packages('Hmisc', repos = 'http://cran.us.r-project.org')"
 RUN Rscript -e "install.packages('mgcv', repos = 'http://cran.us.r-project.org')"
 RUN Rscript -e "install.packages('plyr', repos = 'http://cran.us.r-project.org')"
+RUN Rscript -e "install.packages('CANOES', repos = 'http://zsibio.ii.pw.edu.pl/nexus/repository/r-all')"

From 7a925405474bd21cf1c6a11fbfac67567eb2828f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Wiktor=20Ku=C5=9Bmirek?= <kusmirekwiktor@gmail.com>
Date: Thu, 22 Mar 2018 16:20:15 +0100
Subject: [PATCH 012/114] TARGET.QC package in docker

---
 Docker/cnv-opt-target-qc/Dockerfile | 9 +++++++--
 R/TARGET.QC/R/run_TARGET.QC.R       | 2 --
 2 files changed, 7 insertions(+), 4 deletions(-)

diff --git a/Docker/cnv-opt-target-qc/Dockerfile b/Docker/cnv-opt-target-qc/Dockerfile
index ab19c2f..cccf0e0 100644
--- a/Docker/cnv-opt-target-qc/Dockerfile
+++ b/Docker/cnv-opt-target-qc/Dockerfile
@@ -1,11 +1,16 @@
 FROM ubuntu:xenial
 MAINTAINER biodatageeks <team@biodatageeks.ii.pw.edu.pl>
 
+RUN apt-get update
+RUN apt-get install -y software-properties-common
+RUN apt-key adv --keyserver keyserver.ubuntu.com --recv-keys E298A3A825C0D65DFD57CBB651716619E084DAB9
+RUN add-apt-repository 'deb [arch=amd64,i386] https://cran.rstudio.com/bin/linux/ubuntu xenial/'
+RUN apt-get install -y apt-transport-https
+
 RUN apt-get update && \
     apt-get upgrade -y && \
     apt-get install -y r-base libssl-dev libssh2-1-dev libxml2-dev libcurl4-openssl-dev libpq-dev
 
-RUN Rscript -e "install.packages('devtools', repos='http://cran.cnr.berkeley.edu'); devtools::install_github('hadley/testthat')"
-
 RUN Rscript -e "source('https://bioconductor.org/biocLite.R');biocLite('CODEX')"
 
+RUN Rscript -e "install.packages('TARGET.QC', repos = 'http://zsibio.ii.pw.edu.pl/nexus/repository/r-all')"
diff --git a/R/TARGET.QC/R/run_TARGET.QC.R b/R/TARGET.QC/R/run_TARGET.QC.R
index 23eed55..be7b841 100644
--- a/R/TARGET.QC/R/run_TARGET.QC.R
+++ b/R/TARGET.QC/R/run_TARGET.QC.R
@@ -13,8 +13,6 @@ run_TARGET.QC <- function(mapp_thresh,
   #length_thresh_to <- 2000
   #gc_thresh_from <- 20
   #gc_thresh_to <- 80
-  #K_from <- 1
-  #K_to <- 9
   #lmax <- 200
   sampname <- unique(cov_table[,"sample_name"])
   targets <- cov_table[,c("target_id", "chr", "pos_min", "pos_max")]

From 28c33116848c102d384ba05db77b69bc3a7b8cbf Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Wiktor=20Ku=C5=9Bmirek?= <kusmirekwiktor@gmail.com>
Date: Thu, 22 Mar 2018 19:07:30 +0100
Subject: [PATCH 013/114] switch off R tests to speed up development

---
 Jenkinsfile | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/Jenkinsfile b/Jenkinsfile
index dca1340..ce4967f 100644
--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -9,7 +9,7 @@ pipeline {
                     }
                 }
 
-        stage('Test R code') {
+        /*stage('Test R code') {
                     steps {
                         echo 'Testing R code....'
                         sh 'docker run -i --rm --network="host" -e CNV_OPT_PSQL_USER="cnv-opt" -e CNV_OPT_PSQL_PASSWORD="zsibio321" -e CNV_OPT_PSQL_DRV_URL="http://zsibio.ii.pw.edu.pl/nexus/repository/zsi-bio-raw/common/jdbc/postgresql-42.1.1.jar" -e CNV_OPT_PSQL_CONN_URL="jdbc:postgresql://cdh00.ii.pw.edu.pl:15432/cnv-opt" -w="/tmp" -v $(pwd | sed "s|/var/jenkins_home|/data/home/jenkins|g")/R:/tmp zsibio.ii.pw.edu.pl:50009/zsi-bio-toolset Rscript tests/run_tests.R'
@@ -19,7 +19,7 @@ pipeline {
                         junit '**R/tests/*.xml'
                       }
                     }
-         }
+         }*/
 
          stage('Build R package') {
                              steps {

From e515fefa952cecc9bdbacfdc6bbe2e2ce69c33fa Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Wiktor=20Ku=C5=9Bmirek?= <kusmirekwiktor@gmail.com>
Date: Thu, 22 Mar 2018 19:07:51 +0100
Subject: [PATCH 014/114] TARGET.QC package with reading and writing coverage
 table

---
 R/TARGET.QC/R/run_TARGET.QC.R | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/R/TARGET.QC/R/run_TARGET.QC.R b/R/TARGET.QC/R/run_TARGET.QC.R
index be7b841..ca7e3be 100644
--- a/R/TARGET.QC/R/run_TARGET.QC.R
+++ b/R/TARGET.QC/R/run_TARGET.QC.R
@@ -5,7 +5,8 @@ run_TARGET.QC <- function(mapp_thresh,
                           length_thresh_to,
                           gc_thresh_from,
                           gc_thresh_to,
-                          cov_table){
+                          input_cov_table,
+                          output_cov_table){
   #mapp_thresh <- 0.9
   #cov_thresh_from <- 20
   #cov_thresh_to <- 4000
@@ -14,6 +15,7 @@ run_TARGET.QC <- function(mapp_thresh,
   #gc_thresh_from <- 20
   #gc_thresh_to <- 80
   #lmax <- 200
+  cov_table <- read.csv(input_cov_table)
   sampname <- unique(cov_table[,"sample_name"])
   targets <- cov_table[,c("target_id", "chr", "pos_min", "pos_max")]
   targets <- targets[!duplicated(targets[,"target_id"]),]
@@ -50,7 +52,7 @@ run_TARGET.QC <- function(mapp_thresh,
   cov_table_qc[,"pos_max"] <- strtoi(cov_table_qc[,"pos_max"])
   cov_table_qc[,"target_id"] <- strtoi(cov_table_qc[,"target_id"])
   cov_table_qc[,"read_count"] <- strtoi(cov_table_qc[,"read_count"])
-  cov_table_qc
+  write.csv(cov_table_qc, output_cov_table)
 }
 
 #  sample_name target_id chr  pos_min  pos_max read_count

From 905ed26d4eae0c2fb4cd37230236f45e5f042214 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Wiktor=20Ku=C5=9Bmirek?= <kusmirekwiktor@gmail.com>
Date: Thu, 22 Mar 2018 19:38:28 +0100
Subject: [PATCH 015/114] bugfixes in TARGET.QC package

---
 Docker/cnv-opt-target-qc/Dockerfile | 14 +-------------
 R/TARGET.QC/R/run_TARGET.QC.R       |  2 +-
 2 files changed, 2 insertions(+), 14 deletions(-)

diff --git a/Docker/cnv-opt-target-qc/Dockerfile b/Docker/cnv-opt-target-qc/Dockerfile
index cccf0e0..8391b91 100644
--- a/Docker/cnv-opt-target-qc/Dockerfile
+++ b/Docker/cnv-opt-target-qc/Dockerfile
@@ -1,16 +1,4 @@
-FROM ubuntu:xenial
+FROM biodatageeks/cnv-opt-codex
 MAINTAINER biodatageeks <team@biodatageeks.ii.pw.edu.pl>
 
-RUN apt-get update
-RUN apt-get install -y software-properties-common
-RUN apt-key adv --keyserver keyserver.ubuntu.com --recv-keys E298A3A825C0D65DFD57CBB651716619E084DAB9
-RUN add-apt-repository 'deb [arch=amd64,i386] https://cran.rstudio.com/bin/linux/ubuntu xenial/'
-RUN apt-get install -y apt-transport-https
-
-RUN apt-get update && \
-    apt-get upgrade -y && \
-    apt-get install -y r-base libssl-dev libssh2-1-dev libxml2-dev libcurl4-openssl-dev libpq-dev
-
-RUN Rscript -e "source('https://bioconductor.org/biocLite.R');biocLite('CODEX')"
-
 RUN Rscript -e "install.packages('TARGET.QC', repos = 'http://zsibio.ii.pw.edu.pl/nexus/repository/r-all')"
diff --git a/R/TARGET.QC/R/run_TARGET.QC.R b/R/TARGET.QC/R/run_TARGET.QC.R
index ca7e3be..cef44ee 100644
--- a/R/TARGET.QC/R/run_TARGET.QC.R
+++ b/R/TARGET.QC/R/run_TARGET.QC.R
@@ -52,7 +52,7 @@ run_TARGET.QC <- function(mapp_thresh,
   cov_table_qc[,"pos_max"] <- strtoi(cov_table_qc[,"pos_max"])
   cov_table_qc[,"target_id"] <- strtoi(cov_table_qc[,"target_id"])
   cov_table_qc[,"read_count"] <- strtoi(cov_table_qc[,"read_count"])
-  write.csv(cov_table_qc, output_cov_table)
+  write.csv(cov_table_qc, output_cov_table, row.names=F, quote=F)
 }
 
 #  sample_name target_id chr  pos_min  pos_max read_count

From 53dcff69eeb4d892bf0bda4fb1c77f6fb73ea685 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Wiktor=20Ku=C5=9Bmirek?= <kusmirekwiktor@gmail.com>
Date: Thu, 22 Mar 2018 19:49:45 +0100
Subject: [PATCH 016/114] building dockers without cache - to reload R packages
 while development

---
 build.sh | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/build.sh b/build.sh
index cae7f57..f2dba85 100755
--- a/build.sh
+++ b/build.sh
@@ -34,8 +34,10 @@ do
   diffTs=`echo "$(date +%s) - $(git log -n 1 --pretty=format:%at ${dir})" | bc`
   if [ $diffTs -lt $MAX_COMMIT_TS_DIFF ]; then
     cd $dir
-    docker build  -t $image:$version .
-    docker build  -t $image:latest .
+    #docker build -t $image:$version .
+    #docker build -t $image:latest .
+    docker build --no-cache -t $image:$version .
+    docker build --no-cache -t $image:latest .
     if [[ ${BUILD_MODE} != "local" ]]; then
       docker push docker.io/$image:latest
       docker push docker.io/$image:$version

From 40292ef35ea48bde0f08be2e1f6862d62f3c4390 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Wiktor=20Ku=C5=9Bmirek?= <kusmirekwiktor@gmail.com>
Date: Thu, 22 Mar 2018 20:36:09 +0100
Subject: [PATCH 017/114] proper order of column in TARGET.QC results

---
 R/TARGET.QC/R/run_TARGET.QC.R | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/R/TARGET.QC/R/run_TARGET.QC.R b/R/TARGET.QC/R/run_TARGET.QC.R
index cef44ee..664c997 100644
--- a/R/TARGET.QC/R/run_TARGET.QC.R
+++ b/R/TARGET.QC/R/run_TARGET.QC.R
@@ -52,12 +52,8 @@ run_TARGET.QC <- function(mapp_thresh,
   cov_table_qc[,"pos_max"] <- strtoi(cov_table_qc[,"pos_max"])
   cov_table_qc[,"target_id"] <- strtoi(cov_table_qc[,"target_id"])
   cov_table_qc[,"read_count"] <- strtoi(cov_table_qc[,"read_count"])
+  cov_table_qc <- cbind(cov_table_qc[,"chr"], cov_table_qc[,"sample_name"], cov_table_qc[,"pos_min"], cov_table_qc[,"pos_max"], cov_table_qc[,"read_count"], cov_table_qc[,"target_id"])
+  colnames(cov_table_qc) <- c("chr", "sample_name", "pos_min", "pos_max", "read_count", "target_id")
   write.csv(cov_table_qc, output_cov_table, row.names=F, quote=F)
 }
 
-#  sample_name target_id chr  pos_min  pos_max read_count
-#1     NA19012    193524   Y 25426932 25427053          0
-#2     NA19012    193525   Y 25431556 25431676          0
-#3     NA19012    193526   Y 25535089 25535239          0
-#4     NA19012    193527   Y 25537286 25537526          0
-#5     NA19012    193528   Y 25538793 25538913          0

From 3a7951d95f8f82ffea28618201ba8cf88a146d38 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Wiktor=20Ku=C5=9Bmirek?= <kusmirekwiktor@gmail.com>
Date: Thu, 22 Mar 2018 20:49:43 +0100
Subject: [PATCH 018/114] bugfix: drop cbind column reordering in run_TARGET.QC

---
 R/TARGET.QC/R/run_TARGET.QC.R | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/R/TARGET.QC/R/run_TARGET.QC.R b/R/TARGET.QC/R/run_TARGET.QC.R
index 664c997..42ff0e6 100644
--- a/R/TARGET.QC/R/run_TARGET.QC.R
+++ b/R/TARGET.QC/R/run_TARGET.QC.R
@@ -52,8 +52,7 @@ run_TARGET.QC <- function(mapp_thresh,
   cov_table_qc[,"pos_max"] <- strtoi(cov_table_qc[,"pos_max"])
   cov_table_qc[,"target_id"] <- strtoi(cov_table_qc[,"target_id"])
   cov_table_qc[,"read_count"] <- strtoi(cov_table_qc[,"read_count"])
-  cov_table_qc <- cbind(cov_table_qc[,"chr"], cov_table_qc[,"sample_name"], cov_table_qc[,"pos_min"], cov_table_qc[,"pos_max"], cov_table_qc[,"read_count"], cov_table_qc[,"target_id"])
-  colnames(cov_table_qc) <- c("chr", "sample_name", "pos_min", "pos_max", "read_count", "target_id")
+  colnames(cov_table_qc) <- c("sample_name", "target_id", "chr", "pos_min", "pos_max", "read_count")
   write.csv(cov_table_qc, output_cov_table, row.names=F, quote=F)
 }
 

From 1e5f202af2e4c7be3b30bf45e0cfb2c5c20a9800 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Wiktor=20Ku=C5=9Bmirek?= <kusmirekwiktor@gmail.com>
Date: Fri, 23 Mar 2018 12:57:19 +0100
Subject: [PATCH 019/114] dockerfile for REFERENCE.SAMPLE.SET.SELECTOR

---
 Docker/cnv-opt-reference-sample-set-selector/Dockerfile | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/Docker/cnv-opt-reference-sample-set-selector/Dockerfile b/Docker/cnv-opt-reference-sample-set-selector/Dockerfile
index 5ff1007..7e82158 100644
--- a/Docker/cnv-opt-reference-sample-set-selector/Dockerfile
+++ b/Docker/cnv-opt-reference-sample-set-selector/Dockerfile
@@ -1,7 +1,6 @@
-FROM biodatageeks/bdg-spark
+FROM biodatageeks/cnv-opt-codex
 MAINTAINER biodatageeks <team@biodatageeks.ii.pw.edu.pl>
 
+RUN Rscript -e "install.packages('ExomeDepth', repos = 'http://cran.us.r-project.org')"
 
-
-
-
+RUN Rscript -e "install.packages('REFERENCE.SAMPLE.SET.SELECTOR', repos = 'http://zsibio.ii.pw.edu.pl/nexus/repository/r-all')"

From 86022de887a13f44678ad24a93a29619be13c3fd Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Wiktor=20Ku=C5=9Bmirek?= <kusmirekwiktor@gmail.com>
Date: Fri, 23 Mar 2018 14:47:28 +0100
Subject: [PATCH 020/114] new interface to run_REFERENCE.SAMPLE.SET.SELECTOR
 method

---
 .../functions_REFERENCE.SAMPLE.SET.SELECTOR.R | 14 +++++-
 .../R/run_REFERENCE.SAMPLE.SET.SELECTOR.R     | 44 ++++++++++++++-----
 2 files changed, 45 insertions(+), 13 deletions(-)

diff --git a/R/REFERENCE.SAMPLE.SET.SELECTOR/R/functions_REFERENCE.SAMPLE.SET.SELECTOR.R b/R/REFERENCE.SAMPLE.SET.SELECTOR/R/functions_REFERENCE.SAMPLE.SET.SELECTOR.R
index 6ac3736..70c9fe4 100644
--- a/R/REFERENCE.SAMPLE.SET.SELECTOR/R/functions_REFERENCE.SAMPLE.SET.SELECTOR.R
+++ b/R/REFERENCE.SAMPLE.SET.SELECTOR/R/functions_REFERENCE.SAMPLE.SET.SELECTOR.R
@@ -1,4 +1,15 @@
-library(ExomeDepth)
+
+coverageObj1 <- function(cov_table, sampname, targets_for_chr){
+  Y <- matrix(data=as.integer(0), nrow = nrow(targets_for_chr), ncol = 0)
+  for(sample in sampname) {
+    cov_targets_for_sample <- cov_table[cov_table[,"sample_name"] == sample,]
+    cov_targets_for_sample <- cov_targets_for_sample[with(cov_targets_for_sample, order(target_id)), ]
+    Y <- cbind(Y, cov_targets_for_sample[,"read_count"])
+  }
+  colnames(Y) <- sampname
+  rownames(Y) <- targets_for_chr[,"target_id"]
+  return(list(Y=Y))
+}
 
 canoes_method <- function(investigated_sample, Y, num_refs){
   if (num_refs == 0) {
@@ -14,6 +25,7 @@ canoes_method <- function(investigated_sample, Y, num_refs){
 }
 
 exomedepth_method <- function(investigated_sample, Y, num_refs, target_length){
+  library(ExomeDepth)
   samples <- colnames(Y)
   reference_samples <- setdiff(samples, investigated_sample)
   reference_set <- select.reference.set(test.counts = Y[,investigated_sample],
diff --git a/R/REFERENCE.SAMPLE.SET.SELECTOR/R/run_REFERENCE.SAMPLE.SET.SELECTOR.R b/R/REFERENCE.SAMPLE.SET.SELECTOR/R/run_REFERENCE.SAMPLE.SET.SELECTOR.R
index f025b95..cc38ad3 100644
--- a/R/REFERENCE.SAMPLE.SET.SELECTOR/R/run_REFERENCE.SAMPLE.SET.SELECTOR.R
+++ b/R/REFERENCE.SAMPLE.SET.SELECTOR/R/run_REFERENCE.SAMPLE.SET.SELECTOR.R
@@ -1,16 +1,36 @@
-run_REFERENCE.SAMPLE.SET.SELECTOR <- function(investigated_sample,
-                                              Y,
+run_REFERENCE.SAMPLE.SET.SELECTOR <- function(input_cov_table,
                                               select_method,
                                               num_refs,
-                                              target_length){
-  if(select_method == "canoes") {
-    reference_samples <- canoes_method(investigated_sample, Y, num_refs)$reference_samples
-  } else if(select_method == "codex") {
-    #reference_samples <- codex_method(investigated_sample, Y, num_refs)$reference_samples
-  } else if(select_method == "exomedepth") {
-    reference_samples <- exomedepth_method(investigated_sample, Y, num_refs, target_length)$reference_samples
-  } else if(select_method == "clamms") {
-    #reference_samples <- clamms_method(investigated_sample, Y, num_refs)$reference_samples
+                                              output_reference_file){
+
+  cov_table <- read.csv(input_cov_table)
+  sampname <- unique(cov_table[,"sample_name"])
+  targets <- cov_table[,c("target_id", "chr", "pos_min", "pos_max")]
+  targets <- targets[!duplicated(targets[,"target_id"]),]
+  targets <- targets[with(targets, order(target_id)), ]
+  target_length <- targets[,"pos_max"] - targets[,"pos_min"]
+  Y <- coverageObj1(cov_table, sampname, targets)$Y
+  reference_samples <- list()
+
+  for(i in 1:length(sampname)) {
+    investigated_sample <- as.character(sampname[i])
+    if(select_method == "canoes") {
+      reference_samples_for_investigated_sample <- canoes_method(investigated_sample, Y, num_refs)$reference_samples
+      reference_samples[[i]] <- c(investigated_sample, reference_samples_for_investigated_sample)
+    } else if(select_method == "codex") {
+      #reference_samples_for_investigated_sample <- codex_method(investigated_sample, Y, num_refs)$reference_samples
+      #reference_samples[[i]] <- c(investigated_sample, reference_samples_for_investigated_sample)
+    } else if(select_method == "exomedepth") {
+      reference_samples_for_investigated_sample <- exomedepth_method(investigated_sample, Y, num_refs, target_length)$reference_samples
+      reference_samples[[i]] <- c(investigated_sample, reference_samples_for_investigated_sample)
+    } else if(select_method == "clamms") {
+      #reference_samples_for_investigated_sample <- clamms_method(investigated_sample, Y, num_refs)$reference_samples
+      #reference_samples[[i]] <- c(investigated_sample, reference_samples_for_investigated_sample)
+    }
   }
-  reference_samples
+  resultant_string <- ''
+  for(i in 1:length(reference_samples)) {
+    resultant_string <- paste(resultant_string, paste(reference_samples[[i]], collapse=","), '\n', sep="")
+  }
+  write(resultant_string, output_reference_file)
 }

From da94fbde2b8f48df0e9042cea8fd7aa55cfd5770 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Wiktor=20Ku=C5=9Bmirek?= <kusmirekwiktor@gmail.com>
Date: Fri, 23 Mar 2018 15:11:33 +0100
Subject: [PATCH 021/114] change argument order in
 run_REFERENCE.SAMPLE.SET.SELECTOR function

---
 .../R/run_REFERENCE.SAMPLE.SET.SELECTOR.R                     | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/R/REFERENCE.SAMPLE.SET.SELECTOR/R/run_REFERENCE.SAMPLE.SET.SELECTOR.R b/R/REFERENCE.SAMPLE.SET.SELECTOR/R/run_REFERENCE.SAMPLE.SET.SELECTOR.R
index cc38ad3..c4492df 100644
--- a/R/REFERENCE.SAMPLE.SET.SELECTOR/R/run_REFERENCE.SAMPLE.SET.SELECTOR.R
+++ b/R/REFERENCE.SAMPLE.SET.SELECTOR/R/run_REFERENCE.SAMPLE.SET.SELECTOR.R
@@ -1,6 +1,6 @@
-run_REFERENCE.SAMPLE.SET.SELECTOR <- function(input_cov_table,
-                                              select_method,
+run_REFERENCE.SAMPLE.SET.SELECTOR <- function(select_method,
                                               num_refs,
+                                              input_cov_table,
                                               output_reference_file){
 
   cov_table <- read.csv(input_cov_table)

From c75ff28705cbb72e79bc76a6589ee2464d6b1a66 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Wiktor=20Ku=C5=9Bmirek?= <kusmirekwiktor@gmail.com>
Date: Fri, 23 Mar 2018 18:48:08 +0100
Subject: [PATCH 022/114] draft of DAG for CODEX cnv caller

---
 airflow/dags/codex.py | 52 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 52 insertions(+)
 create mode 100755 airflow/dags/codex.py

diff --git a/airflow/dags/codex.py b/airflow/dags/codex.py
new file mode 100755
index 0000000..55ee94b
--- /dev/null
+++ b/airflow/dags/codex.py
@@ -0,0 +1,52 @@
+from airflow import DAG
+from airflow.operators.bash_operator import BashOperator
+from airflow.models import Variable
+from datetime import datetime, timedelta
+
+default_args = {
+    'owner': 'biodatageeks',
+    'depends_on_past': False,
+    'start_date': datetime(2017, 10, 18),
+    'email': ['team@biodatageeks.ii.pw.edu.pl'],
+    'email_on_failure': False,
+    'email_on_retry': False,
+    'retries': 0
+}
+
+dag = DAG(
+    'codex', default_args=default_args, schedule_interval=None)
+
+##############################################
+########## RUN RAW CODEX CNV CALLER ##########
+##############################################
+
+### target qc parameters
+mapp_thresh = '0.9'
+cov_thresh_from = '20'
+cov_thresh_to = '4000'
+length_thresh_from = '20'
+length_thresh_to = '2000'
+gc_thresh_from = '20'
+gc_thresh_to = '80'
+raw_cov_table = 'input_cov_table.csv'
+qc_cov_table = 'output_cov_table.csv'
+
+### select reference sample set parameters
+select_method = 'exomedepth' # "canoes", "codex" or "exomedepth"
+num_refs = '30'
+reference_sample_set_file = 'reference_sample_set.csv'
+
+run_codex_caller_cmd= " \
+docker pull biodatageeks/cnv-opt-target-qc; \
+docker run --rm -v /tmp:/tmp -w=\"/tmp\" biodatageeks/cnv-opt-target-qc Rscript -e \"library(\'TARGET.QC\');run_TARGET.QC(" + mapp_thresh + "," + cov_thresh_from + "," + cov_thresh_to + "," + length_thresh_from + "," + length_thresh_to + "," + gc_thresh_from + "," + gc_thresh_to + ",'" + raw_cov_table + "','" + qc_cov_table + "')\"; \
+docker pull biodatageeks/cnv-opt-reference-sample-set-selector; \
+docker run --rm -v /tmp:/tmp -w=\"/tmp\" biodatageeks/cnv-opt-reference-sample-set-selector Rscript -e \"library(\'REFERENCE.SAMPLE.SET.SELECTOR\');run_REFERENCE.SAMPLE.SET.SELECTOR('" + select_method + "'," + num_refs + ",'" + qc_cov_table + "','" + reference_sample_set_file + "')\"; \
+"
+
+run_codex_caller_task= BashOperator (
+    bash_command=run_codex_caller_cmd,
+    task_id='run_codex_caller_task',
+    dag=dag
+)
+
+run_codex_caller_task

From 3e8801676f171cd4461458f14ff24ac57b569102 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Wiktor=20Ku=C5=9Bmirek?= <kusmirekwiktor@gmail.com>
Date: Fri, 23 Mar 2018 23:53:30 +0100
Subject: [PATCH 023/114] EXOMEDEPTH package in docker

---
 R/EXOMEDEPTHCOV/DESCRIPTION                 |  1 -
 R/EXOMEDEPTHCOV/R/functions_EXOMEDEPTHCOV.R |  2 +-
 R/EXOMEDEPTHCOV/R/run_EXOMEDEPTHCOV.R       | 34 ++++++++++-----------
 3 files changed, 17 insertions(+), 20 deletions(-)

diff --git a/R/EXOMEDEPTHCOV/DESCRIPTION b/R/EXOMEDEPTHCOV/DESCRIPTION
index d4a3025..8305596 100644
--- a/R/EXOMEDEPTHCOV/DESCRIPTION
+++ b/R/EXOMEDEPTHCOV/DESCRIPTION
@@ -16,7 +16,6 @@ Depends:
     optparse (== 1.4.4),
     IRanges (>= 2.0.0),
     ExomeDepth (>= 1.1.10),
-    REFERENCE.SAMPLE.SET.SELECTOR (>= 0.0.1)
 License: GPL-3
 Encoding: UTF-8
 LazyData: true
diff --git a/R/EXOMEDEPTHCOV/R/functions_EXOMEDEPTHCOV.R b/R/EXOMEDEPTHCOV/R/functions_EXOMEDEPTHCOV.R
index 2b38d6b..550b4d0 100644
--- a/R/EXOMEDEPTHCOV/R/functions_EXOMEDEPTHCOV.R
+++ b/R/EXOMEDEPTHCOV/R/functions_EXOMEDEPTHCOV.R
@@ -1,6 +1,6 @@
 library(ExomeDepth)
 
-coverageObj1 <- function(cov_table, sampname, targets_for_chr, chr){
+coverageObj1 <- function(cov_table, sampname, targets_for_chr){
   Y <- matrix(data=as.integer(0), nrow = nrow(targets_for_chr), ncol = 0)
   for(sample in sampname) {
     cov_targets_for_sample <- cov_table[cov_table[,"sample_name"] == sample,]
diff --git a/R/EXOMEDEPTHCOV/R/run_EXOMEDEPTHCOV.R b/R/EXOMEDEPTHCOV/R/run_EXOMEDEPTHCOV.R
index 295306e..f187712 100644
--- a/R/EXOMEDEPTHCOV/R/run_EXOMEDEPTHCOV.R
+++ b/R/EXOMEDEPTHCOV/R/run_EXOMEDEPTHCOV.R
@@ -1,15 +1,17 @@
 library(ExomeDepth)
 library(methods)
 
-run_EXOMEDEPTHCOV <- function(reference_set_select_method,
-                              num_of_samples_in_reference_set,
-                              cov_table){
+run_EXOMEDEPTHCOV <- function(input_cov_table,
+                              reference_sample_set_file,
+                              output_calls_file){
 
+  con <- file(reference_sample_set_file, open='r')
+  reference_sample_set <- readLines(con)
+  cov_table <- read.csv(input_cov_table)
   sampname <- unique(cov_table[,"sample_name"])
   targets <- cov_table[,c("target_id", "chr", "pos_min", "pos_max")]
   targets <- targets[!duplicated(targets[,"target_id"]),]
   targets <- targets[with(targets, order(target_id)), ]
-  
   calls <- data.frame(matrix(nrow=0, ncol=13))
   chrs <- c(1:22, "X", "Y", paste0("chr",c(1:22, "X", "Y")))
   library(IRanges)
@@ -19,20 +21,16 @@ run_EXOMEDEPTHCOV <- function(reference_set_select_method,
     if (length(ref) == 0) {    # 0 elements for specified chromosome in bed
       next()
     }
-    Y <- coverageObj1(cov_table, sampname, targets_for_chr, chr)$Y
+    Y <- coverageObj1(cov_table, sampname, targets_for_chr)$Y
 
-    for (actual_sample_id in 1:length(sampname)) {
-      actual_sample <- sampname[actual_sample_id]
-      ## ----reference.selection-------------------------------------------------
-      target_length <- c()
-      for (i in 1:nrow(Y)) {
-        target_length <- c(target_length, width(ref[i]))
+    for (i in 1:length(reference_sample_set)) {
+      if (reference_sample_set[[i]] == '') {
+        next()
       }
-      reference_samples <- run_REFERENCE.SAMPLE.SET.SELECTOR(actual_sample,
-                                                             Y,
-                                                             reference_set_select_method,
-                                                             num_of_samples_in_reference_set,
-                                                             target_length)
+      samples <- unlist(strsplit(reference_sample_set[[i]], ','))
+      print(samples)
+      actual_sample <- samples[1]
+      reference_samples <- samples[-1]
 
       ## ----construct.ref-------------------------------------------------------
       my.matrix <- as.matrix(Y[,reference_samples])
@@ -42,7 +40,7 @@ run_EXOMEDEPTHCOV <- function(reference_set_select_method,
 
       ## ----build.complete------------------------------------------------------
       all.exons <- new('ExomeDepth',
-                       test = Y[,actual_sample_id],
+                       test = Y[,actual_sample],
                        reference = my.reference.selected,
                        formula = 'cbind(test, reference) ~ 1')
 
@@ -86,5 +84,5 @@ run_EXOMEDEPTHCOV <- function(reference_set_select_method,
   colnames(calls)[colnames(calls) == 'reads.observed'] <- 'raw_cov'
   colnames(calls)[colnames(calls) == 'reads.ratio'] <- 'copy_no'
   calls[colnames(calls) == 'copy_no'] <- round(calls[colnames(calls) == 'raw_cov'] / (calls[colnames(calls) == 'norm_cov'] / 2))
-  calls
+  write.csv(calls, output_calls_file, row.names=F)
 }

From 91c7e3c52c31cfb848c67fc619c64894d8aa48e7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Wiktor=20Ku=C5=9Bmirek?= <kusmirekwiktor@gmail.com>
Date: Sat, 24 Mar 2018 13:18:32 +0100
Subject: [PATCH 024/114] Dockerfile for EXOMEDEPTHCOV package

---
 Docker/cnv-opt-exomedepthcov/Dockerfile | 5 +++++
 1 file changed, 5 insertions(+)
 create mode 100644 Docker/cnv-opt-exomedepthcov/Dockerfile

diff --git a/Docker/cnv-opt-exomedepthcov/Dockerfile b/Docker/cnv-opt-exomedepthcov/Dockerfile
new file mode 100644
index 0000000..28448f8
--- /dev/null
+++ b/Docker/cnv-opt-exomedepthcov/Dockerfile
@@ -0,0 +1,5 @@
+FROM biodatageeks/cnv-opt-exomedepth
+MAINTAINER biodatageeks <team@biodatageeks.ii.pw.edu.pl>
+
+RUN Rscript -e "install.packages('EXOMEDEPTHCOV', repos = 'http://zsibio.ii.pw.edu.pl/nexus/repository/r-all')"
+

From 862aabecde56668e4b07f803a416372305623795 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Wiktor=20Ku=C5=9Bmirek?= <kusmirekwiktor@gmail.com>
Date: Mon, 26 Mar 2018 15:57:19 +0200
Subject: [PATCH 025/114] first version of CODEXCOV package in docker image

---
 Docker/cnv-opt-canoescov/Dockerfile   | 18 ++++++
 Docker/cnv-opt-codexcov/Dockerfile    | 14 +++++
 R/CODEXCOV/CODEXCOV.Rproj             | 16 -----
 R/CODEXCOV/R/run_CODEXCOV.R           | 84 +++++++++++++--------------
 R/CODEXCOV/man/coverageObj1.Rd        | 14 -----
 R/CODEXCOV/man/gcmapp1.Rd             | 14 -----
 R/CODEXCOV/man/normObj1.Rd            | 14 -----
 R/CODEXCOV/man/normObj2.Rd            | 14 -----
 R/CODEXCOV/man/qcObj1.Rd              | 15 -----
 R/CODEXCOV/man/run_CODEXCOV.Rd        | 14 -----
 R/CODEXCOV/man/segment1.Rd            | 14 -----
 R/EXOMEDEPTHCOV/R/run_EXOMEDEPTHCOV.R |  1 -
 12 files changed, 71 insertions(+), 161 deletions(-)
 create mode 100644 Docker/cnv-opt-canoescov/Dockerfile
 create mode 100644 Docker/cnv-opt-codexcov/Dockerfile
 delete mode 100755 R/CODEXCOV/CODEXCOV.Rproj
 delete mode 100644 R/CODEXCOV/man/coverageObj1.Rd
 delete mode 100644 R/CODEXCOV/man/gcmapp1.Rd
 delete mode 100644 R/CODEXCOV/man/normObj1.Rd
 delete mode 100644 R/CODEXCOV/man/normObj2.Rd
 delete mode 100644 R/CODEXCOV/man/qcObj1.Rd
 delete mode 100644 R/CODEXCOV/man/run_CODEXCOV.Rd
 delete mode 100644 R/CODEXCOV/man/segment1.Rd

diff --git a/Docker/cnv-opt-canoescov/Dockerfile b/Docker/cnv-opt-canoescov/Dockerfile
new file mode 100644
index 0000000..b404cab
--- /dev/null
+++ b/Docker/cnv-opt-canoescov/Dockerfile
@@ -0,0 +1,18 @@
+FROM ubuntu:xenial
+MAINTAINER biodatageeks <team@biodatageeks.ii.pw.edu.pl>
+
+RUN apt-get update
+RUN apt-get install -y software-properties-common
+RUN apt-key adv --keyserver keyserver.ubuntu.com --recv-keys E298A3A825C0D65DFD57CBB651716619E084DAB9
+RUN add-apt-repository 'deb [arch=amd64,i386] https://cran.rstudio.com/bin/linux/ubuntu xenial/'
+RUN apt-get install -y apt-transport-https
+
+RUN apt-get update && \
+    apt-get upgrade -y && \
+    apt-get install -y r-base libssl-dev libssh2-1-dev libxml2-dev libcurl4-openssl-dev libpq-dev
+
+RUN Rscript -e "install.packages('nnls', repos = 'http://cran.us.r-project.org')"
+RUN Rscript -e "install.packages('Hmisc', repos = 'http://cran.us.r-project.org')"
+RUN Rscript -e "install.packages('mgcv', repos = 'http://cran.us.r-project.org')"
+RUN Rscript -e "install.packages('plyr', repos = 'http://cran.us.r-project.org')"
+RUN Rscript -e "install.packages('CANOES', repos = 'http://zsibio.ii.pw.edu.pl/nexus/repository/r-all')"
diff --git a/Docker/cnv-opt-codexcov/Dockerfile b/Docker/cnv-opt-codexcov/Dockerfile
new file mode 100644
index 0000000..6f56eb5
--- /dev/null
+++ b/Docker/cnv-opt-codexcov/Dockerfile
@@ -0,0 +1,14 @@
+FROM ubuntu:xenial
+MAINTAINER biodatageeks <team@biodatageeks.ii.pw.edu.pl>
+
+RUN apt-get update
+RUN apt-get install -y software-properties-common
+RUN apt-key adv --keyserver keyserver.ubuntu.com --recv-keys E298A3A825C0D65DFD57CBB651716619E084DAB9
+RUN add-apt-repository 'deb [arch=amd64,i386] https://cran.rstudio.com/bin/linux/ubuntu xenial/'
+RUN apt-get install -y apt-transport-https
+
+RUN apt-get update && \
+    apt-get upgrade -y && \
+    apt-get install -y r-base libssl-dev libssh2-1-dev libxml2-dev libcurl4-openssl-dev libpq-dev
+
+RUN Rscript -e "source('https://bioconductor.org/biocLite.R');biocLite('CODEX')"
diff --git a/R/CODEXCOV/CODEXCOV.Rproj b/R/CODEXCOV/CODEXCOV.Rproj
deleted file mode 100755
index d848a9f..0000000
--- a/R/CODEXCOV/CODEXCOV.Rproj
+++ /dev/null
@@ -1,16 +0,0 @@
-Version: 1.0
-
-RestoreWorkspace: No
-SaveWorkspace: No
-AlwaysSaveHistory: Default
-
-EnableCodeIndexing: Yes
-Encoding: UTF-8
-
-AutoAppendNewline: Yes
-StripTrailingWhitespace: Yes
-
-BuildType: Package
-PackageUseDevtools: Yes
-PackageInstallArgs: --no-multiarch --with-keep.source
-PackageRoxygenize: rd,collate,namespace
diff --git a/R/CODEXCOV/R/run_CODEXCOV.R b/R/CODEXCOV/R/run_CODEXCOV.R
index 78929de..246aa22 100644
--- a/R/CODEXCOV/R/run_CODEXCOV.R
+++ b/R/CODEXCOV/R/run_CODEXCOV.R
@@ -9,10 +9,13 @@
 run_CODEXCOV <- function(K_from,
                          K_to,
                          lmax,
-                         reference_set_select_method,
-                         num_of_samples_in_reference_set,
-                         cov_table){
-  
+                         input_cov_table,
+                         reference_sample_set_file,
+                         output_calls_file){
+
+  con <- file(reference_sample_set_file, open='r')
+  reference_sample_set <- readLines(con)
+  cov_table <- read.csv(input_cov_table)
   sampname <- unique(cov_table[,"sample_name"])
   sampname <- as.character(sampname)
   targets <- cov_table[,c("target_id", "chr", "pos_min", "pos_max")]
@@ -28,6 +31,7 @@ run_CODEXCOV <- function(K_from,
     if (length(ref) == 0) {    # 0 elements for specified chromosome in bed
       next()
     }
+
     ###################################################
     ### code chunk number 4: coverageObj1
     ###################################################
@@ -39,47 +43,37 @@ run_CODEXCOV <- function(K_from,
     gcmapp1_result <- gcmapp1(chr, ref)
     gc <- gcmapp1_result$gc
 
-    ###################################################
-    ### code chunk number 7: normObj1
-    ###################################################
-    normObj_result <- normObj1(Y, gc, K = K_from:K_to)
-    Yhat <- normObj_result$Yhat
-    AIC <- normObj_result$AIC
-    BIC <- normObj_result$BIC
-    RSS <- normObj_result$RSS
-    K <- normObj_result$K
-    
-    ###################################################
-    ### code chunk number 8: normObj2 (eval = FALSE)
-    ###################################################
-    ## normObj_result <- normObj2(Y, gc, K = 1:9, normal_index=seq(1,45,2))
-    ## Yhat <- normObj_result$Yhat
-    ## AIC <- normObj_result$AIC
-    ## BIC <- normObj_result$BIC
-    ## RSS <- normObj_result$RSS
-    ## K <- normObj_result$K
-    
-    ###################################################
-    ### code chunk number 9: choiceofK (eval = FALSE)
-    ###################################################
-    #choiceofK(AIC, BIC, RSS, K, filename = paste("choiceofK_", chr, ".pdf", sep = ""))
-    
-    ###################################################
-    ### code chunk number 10: fig1
-    ###################################################
-    #plot(K, RSS, type = "b", xlab = "Number of latent variables")
-    #plot(K, AIC, type = "b", xlab = "Number of latent variables")
-    #plot(K, BIC, type = "b", xlab = "Number of latent variables")
-    
-    ###################################################
-    ### code chunk number 11: segment1
-    ###################################################
-    finalcallIt <- segment1(Y, Yhat, K[which.max(BIC)], K, sampname,
-                            ref, chr, lmax, mode = "integer")$finalcall
-    if (nrow(finalcall)==0){finalcall <- matrix(nrow=0, ncol=ncol(finalcallIt))} 
-    finalcall <- rbind(finalcall, finalcallIt)
-  
+    for (i in 1:length(reference_sample_set)) {
+      if (reference_sample_set[[i]] == '') {
+        next()
+      }
+      samples <- unlist(strsplit(reference_sample_set[[i]], ','))
+      actual_sample <- samples[1]
+      reference_samples <- samples[-1]
+      samples <- samples[order(samples[,1]),]
+      Y_subset <- Y[,samples]
+
+      ###################################################
+      ### code chunk number 7: normObj1
+      ###################################################
+      normObj_result <- normObj1(Y_subset, gc, K = K_from:K_to)
+      Yhat <- normObj_result$Yhat
+      AIC <- normObj_result$AIC
+      BIC <- normObj_result$BIC
+      RSS <- normObj_result$RSS
+      K <- normObj_result$K
+
+      ###################################################
+      ### code chunk number 11: segment1
+      ###################################################
+      finalcallIt <- segment1(Y_subset, Yhat, K[which.max(BIC)], K, samples,
+                              ref, chr, lmax, mode = "integer")$finalcall
+      finalcallIt <- finalcallIt[finalcallIt[,"sample_name"] == actual_sample,]
+      if (nrow(finalcall)==0){finalcall <- matrix(nrow=0, ncol=ncol(finalcallIt))}
+      finalcall <- rbind(finalcall, finalcallIt)
+      print(finalcall)
+    }
   }
   finalcall <- unify_calls_format(finalcall)$finalcall
-  finalcall
+  write.csv(finalcall, output_calls_file, row.names=F)
 }
diff --git a/R/CODEXCOV/man/coverageObj1.Rd b/R/CODEXCOV/man/coverageObj1.Rd
deleted file mode 100644
index f6c6b1c..0000000
--- a/R/CODEXCOV/man/coverageObj1.Rd
+++ /dev/null
@@ -1,14 +0,0 @@
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/functions_CODEXCOV.R
-\name{coverageObj1}
-\alias{coverageObj1}
-\title{Function Dexcription}
-\usage{
-coverageObj1(cov_file, sampname)
-}
-\description{
-Function description.
-}
-\examples{
-coverageObj1
-}
diff --git a/R/CODEXCOV/man/gcmapp1.Rd b/R/CODEXCOV/man/gcmapp1.Rd
deleted file mode 100644
index 2fa53f1..0000000
--- a/R/CODEXCOV/man/gcmapp1.Rd
+++ /dev/null
@@ -1,14 +0,0 @@
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/functions_CODEXCOV.R
-\name{gcmapp1}
-\alias{gcmapp1}
-\title{Function Dexcription}
-\usage{
-gcmapp1(chr, ref)
-}
-\description{
-Function description.
-}
-\examples{
-coverageObj1
-}
diff --git a/R/CODEXCOV/man/normObj1.Rd b/R/CODEXCOV/man/normObj1.Rd
deleted file mode 100644
index 66b0a96..0000000
--- a/R/CODEXCOV/man/normObj1.Rd
+++ /dev/null
@@ -1,14 +0,0 @@
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/functions_CODEXCOV.R
-\name{normObj1}
-\alias{normObj1}
-\title{Function Dexcription}
-\usage{
-normObj1(Y_qc, gc_qc, K)
-}
-\description{
-Function description.
-}
-\examples{
-coverageObj1
-}
diff --git a/R/CODEXCOV/man/normObj2.Rd b/R/CODEXCOV/man/normObj2.Rd
deleted file mode 100644
index 4a10d47..0000000
--- a/R/CODEXCOV/man/normObj2.Rd
+++ /dev/null
@@ -1,14 +0,0 @@
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/functions_CODEXCOV.R
-\name{normObj2}
-\alias{normObj2}
-\title{Function Dexcription}
-\usage{
-normObj2(Y_qc, gc_qc, K, normal_index)
-}
-\description{
-Function description.
-}
-\examples{
-coverageObj1
-}
diff --git a/R/CODEXCOV/man/qcObj1.Rd b/R/CODEXCOV/man/qcObj1.Rd
deleted file mode 100644
index 806a098..0000000
--- a/R/CODEXCOV/man/qcObj1.Rd
+++ /dev/null
@@ -1,15 +0,0 @@
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/functions_CODEXCOV.R
-\name{qcObj1}
-\alias{qcObj1}
-\title{Function Dexcription}
-\usage{
-qcObj1(Y, sampname, chr, ref, mapp, gc, cov_thresh, length_thresh, mapp_thresh,
-  gc_thresh)
-}
-\description{
-Function description.
-}
-\examples{
-coverageObj1
-}
diff --git a/R/CODEXCOV/man/run_CODEXCOV.Rd b/R/CODEXCOV/man/run_CODEXCOV.Rd
deleted file mode 100644
index f80759e..0000000
--- a/R/CODEXCOV/man/run_CODEXCOV.Rd
+++ /dev/null
@@ -1,14 +0,0 @@
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/run_CODEXCOV.R
-\name{run_CODEXCOV}
-\alias{run_CODEXCOV}
-\title{Function Dexcription}
-\usage{
-run_CODEXCOV(cov_file, sampname)
-}
-\description{
-Function description.
-}
-\examples{
-run_codexcov
-}
diff --git a/R/CODEXCOV/man/segment1.Rd b/R/CODEXCOV/man/segment1.Rd
deleted file mode 100644
index c7cf654..0000000
--- a/R/CODEXCOV/man/segment1.Rd
+++ /dev/null
@@ -1,14 +0,0 @@
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/functions_CODEXCOV.R
-\name{segment1}
-\alias{segment1}
-\title{Function Dexcription}
-\usage{
-segment1(Y_qc, Yhat, optK, K, sampname_qc, ref_qc, chr, lmax, mode)
-}
-\description{
-Function description.
-}
-\examples{
-coverageObj1
-}
diff --git a/R/EXOMEDEPTHCOV/R/run_EXOMEDEPTHCOV.R b/R/EXOMEDEPTHCOV/R/run_EXOMEDEPTHCOV.R
index f187712..a8e8e3c 100644
--- a/R/EXOMEDEPTHCOV/R/run_EXOMEDEPTHCOV.R
+++ b/R/EXOMEDEPTHCOV/R/run_EXOMEDEPTHCOV.R
@@ -28,7 +28,6 @@ run_EXOMEDEPTHCOV <- function(input_cov_table,
         next()
       }
       samples <- unlist(strsplit(reference_sample_set[[i]], ','))
-      print(samples)
       actual_sample <- samples[1]
       reference_samples <- samples[-1]
 

From 626f23abdd385804cfb8d3a58f7d06bef9582cd2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Wiktor=20Ku=C5=9Bmirek?= <kusmirekwiktor@gmail.com>
Date: Mon, 26 Mar 2018 16:50:41 +0200
Subject: [PATCH 026/114] finished dag for ExomeDepth, draft of dag for CANOES

---
 airflow/dags/canoes.py     | 52 ++++++++++++++++++++++++++++++++++
 airflow/dags/exomedepth.py | 57 ++++++++++++++++++++++++++++++++++++++
 2 files changed, 109 insertions(+)
 create mode 100755 airflow/dags/canoes.py
 create mode 100755 airflow/dags/exomedepth.py

diff --git a/airflow/dags/canoes.py b/airflow/dags/canoes.py
new file mode 100755
index 0000000..7c74842
--- /dev/null
+++ b/airflow/dags/canoes.py
@@ -0,0 +1,52 @@
+from airflow import DAG
+from airflow.operators.bash_operator import BashOperator
+from airflow.models import Variable
+from datetime import datetime, timedelta
+
+default_args = {
+    'owner': 'biodatageeks',
+    'depends_on_past': False,
+    'start_date': datetime(2017, 10, 18),
+    'email': ['team@biodatageeks.ii.pw.edu.pl'],
+    'email_on_failure': False,
+    'email_on_retry': False,
+    'retries': 0
+}
+
+dag = DAG(
+    'canoes', default_args=default_args, schedule_interval=None)
+
+##############################################
+########## RUN RAW CANOES CNV CALLER ##########
+##############################################
+
+### target qc parameters
+mapp_thresh = '0.9'
+cov_thresh_from = '20'
+cov_thresh_to = '4000'
+length_thresh_from = '20'
+length_thresh_to = '2000'
+gc_thresh_from = '20'
+gc_thresh_to = '80'
+raw_cov_table = 'input_cov_table.csv'
+qc_cov_table = 'output_cov_table.csv'
+
+### select reference sample set parameters
+select_method = 'exomedepth' # "canoes", "codex" or "exomedepth"
+num_refs = '30'
+reference_sample_set_file = 'reference_sample_set.csv'
+
+run_canoes_caller_cmd= " \
+docker pull biodatageeks/cnv-opt-target-qc; \
+docker run --rm -v /tmp:/tmp -w=\"/tmp\" biodatageeks/cnv-opt-target-qc Rscript -e \"library(\'TARGET.QC\');run_TARGET.QC(" + mapp_thresh + "," + cov_thresh_from + "," + cov_thresh_to + "," + length_thresh_from + "," + length_thresh_to + "," + gc_thresh_from + "," + gc_thresh_to + ",'" + raw_cov_table + "','" + qc_cov_table + "')\"; \
+docker pull biodatageeks/cnv-opt-reference-sample-set-selector; \
+docker run --rm -v /tmp:/tmp -w=\"/tmp\" biodatageeks/cnv-opt-reference-sample-set-selector Rscript -e \"library(\'REFERENCE.SAMPLE.SET.SELECTOR\');run_REFERENCE.SAMPLE.SET.SELECTOR('" + select_method + "'," + num_refs + ",'" + qc_cov_table + "','" + reference_sample_set_file + "')\"; \
+"
+
+run_canoes_caller_task= BashOperator (
+    bash_command=run_canoes_caller_cmd,
+    task_id='run_canoes_caller_task',
+    dag=dag
+)
+
+run_canoes_caller_task
diff --git a/airflow/dags/exomedepth.py b/airflow/dags/exomedepth.py
new file mode 100755
index 0000000..e8629be
--- /dev/null
+++ b/airflow/dags/exomedepth.py
@@ -0,0 +1,57 @@
+from airflow import DAG
+from airflow.operators.bash_operator import BashOperator
+from airflow.models import Variable
+from datetime import datetime, timedelta
+
+default_args = {
+    'owner': 'biodatageeks',
+    'depends_on_past': False,
+    'start_date': datetime(2017, 10, 18),
+    'email': ['team@biodatageeks.ii.pw.edu.pl'],
+    'email_on_failure': False,
+    'email_on_retry': False,
+    'retries': 0
+}
+
+dag = DAG(
+    'exomedepth', default_args=default_args, schedule_interval=None)
+
+###################################################
+########## RUN RAW EXOMEDEPTH CNV CALLER ##########
+###################################################
+
+### target qc parameters
+mapp_thresh = '0.9'
+cov_thresh_from = '20'
+cov_thresh_to = '4000'
+length_thresh_from = '20'
+length_thresh_to = '2000'
+gc_thresh_from = '20'
+gc_thresh_to = '80'
+raw_cov_table = 'input_cov_table.csv'
+qc_cov_table = 'output_cov_table.csv'
+
+### select reference sample set parameters
+select_method = 'exomedepth' # "canoes", "codex" or "exomedepth"
+num_refs = '30'
+reference_sample_set_file = 'reference_sample_set.csv'
+
+### exomedepth parameters
+output_calls_file = 'calls.csv'
+
+run_exomedepth_caller_cmd= " \
+docker pull biodatageeks/cnv-opt-target-qc; \
+docker run --rm -v /tmp:/tmp -w=\"/tmp\" biodatageeks/cnv-opt-target-qc Rscript -e \"library(\'TARGET.QC\');run_TARGET.QC(" + mapp_thresh + "," + cov_thresh_from + "," + cov_thresh_to + "," + length_thresh_from + "," + length_thresh_to + "," + gc_thresh_from + "," + gc_thresh_to + ",'" + raw_cov_table + "','" + qc_cov_table + "')\"; \
+docker pull biodatageeks/cnv-opt-reference-sample-set-selector; \
+docker run --rm -v /tmp:/tmp -w=\"/tmp\" biodatageeks/cnv-opt-reference-sample-set-selector Rscript -e \"library(\'REFERENCE.SAMPLE.SET.SELECTOR\');run_REFERENCE.SAMPLE.SET.SELECTOR('" + select_method + "'," + num_refs + ",'" + qc_cov_table + "','" + reference_sample_set_file + "')\"; \
+docker pull biodatageeks/cnv-opt-exomedepthcov; \
+docker run --rm -v /tmp:/tmp -w=\"/tmp\" biodatageeks/cnv-opt-exomedepthcov Rscript -e \"library(\'EXOMEDEPTHCOV\');run_EXOMEDEPTHCOV('" + qc_cov_table + "','" + reference_sample_set_file + "','" + output_calls_file + "')\"; \
+"
+
+run_exomedepth_caller_task= BashOperator (
+    bash_command=run_exomedepth_caller_cmd,
+    task_id='run_exomedepth_caller_task',
+    dag=dag
+)
+
+run_exomedepth_caller_task

From 7a7beef8b0bd82de81413bad1aeb84882b7888cb Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Wiktor=20Ku=C5=9Bmirek?= <kusmirekwiktor@gmail.com>
Date: Mon, 26 Mar 2018 16:51:44 +0200
Subject: [PATCH 027/114] draft of CANOESCOV with new interface

---
 Docker/cnv-opt-canoescov/Dockerfile |  18 +-
 Docker/cnv-opt-codexcov/Dockerfile  |  14 +-
 R/CANOESCOV/DESCRIPTION             |   3 +-
 R/CANOESCOV/R/functions_CANOESCOV.R | 487 +---------------------------
 R/CANOESCOV/R/run_CANOESCOV.R       |  27 +-
 R/CODEXCOV/DESCRIPTION              |   3 +-
 6 files changed, 30 insertions(+), 522 deletions(-)

diff --git a/Docker/cnv-opt-canoescov/Dockerfile b/Docker/cnv-opt-canoescov/Dockerfile
index b404cab..44080ff 100644
--- a/Docker/cnv-opt-canoescov/Dockerfile
+++ b/Docker/cnv-opt-canoescov/Dockerfile
@@ -1,18 +1,4 @@
-FROM ubuntu:xenial
+FROM biodatageeks/cnv-opt-canoes
 MAINTAINER biodatageeks <team@biodatageeks.ii.pw.edu.pl>
 
-RUN apt-get update
-RUN apt-get install -y software-properties-common
-RUN apt-key adv --keyserver keyserver.ubuntu.com --recv-keys E298A3A825C0D65DFD57CBB651716619E084DAB9
-RUN add-apt-repository 'deb [arch=amd64,i386] https://cran.rstudio.com/bin/linux/ubuntu xenial/'
-RUN apt-get install -y apt-transport-https
-
-RUN apt-get update && \
-    apt-get upgrade -y && \
-    apt-get install -y r-base libssl-dev libssh2-1-dev libxml2-dev libcurl4-openssl-dev libpq-dev
-
-RUN Rscript -e "install.packages('nnls', repos = 'http://cran.us.r-project.org')"
-RUN Rscript -e "install.packages('Hmisc', repos = 'http://cran.us.r-project.org')"
-RUN Rscript -e "install.packages('mgcv', repos = 'http://cran.us.r-project.org')"
-RUN Rscript -e "install.packages('plyr', repos = 'http://cran.us.r-project.org')"
-RUN Rscript -e "install.packages('CANOES', repos = 'http://zsibio.ii.pw.edu.pl/nexus/repository/r-all')"
+RUN Rscript -e "install.packages('CANOESCOV', repos = 'http://zsibio.ii.pw.edu.pl/nexus/repository/r-all')"
diff --git a/Docker/cnv-opt-codexcov/Dockerfile b/Docker/cnv-opt-codexcov/Dockerfile
index 6f56eb5..258bbf8 100644
--- a/Docker/cnv-opt-codexcov/Dockerfile
+++ b/Docker/cnv-opt-codexcov/Dockerfile
@@ -1,14 +1,4 @@
-FROM ubuntu:xenial
+FROM biodatageeks/cnv-opt-codex
 MAINTAINER biodatageeks <team@biodatageeks.ii.pw.edu.pl>
 
-RUN apt-get update
-RUN apt-get install -y software-properties-common
-RUN apt-key adv --keyserver keyserver.ubuntu.com --recv-keys E298A3A825C0D65DFD57CBB651716619E084DAB9
-RUN add-apt-repository 'deb [arch=amd64,i386] https://cran.rstudio.com/bin/linux/ubuntu xenial/'
-RUN apt-get install -y apt-transport-https
-
-RUN apt-get update && \
-    apt-get upgrade -y && \
-    apt-get install -y r-base libssl-dev libssh2-1-dev libxml2-dev libcurl4-openssl-dev libpq-dev
-
-RUN Rscript -e "source('https://bioconductor.org/biocLite.R');biocLite('CODEX')"
+RUN Rscript -e "install.packages('CODEXCOV', repos = 'http://zsibio.ii.pw.edu.pl/nexus/repository/r-all')"
diff --git a/R/CANOESCOV/DESCRIPTION b/R/CANOESCOV/DESCRIPTION
index 314f419..0fa3115 100644
--- a/R/CANOESCOV/DESCRIPTION
+++ b/R/CANOESCOV/DESCRIPTION
@@ -18,8 +18,7 @@ Depends:
     plyr (>= 1.8.4),
     nnls (>= 1.4.0),
     Hmisc (>= 4.0.0),
-    mgcv (>= 1.8.0),
-    REFERENCE.SAMPLE.SET.SELECTOR (>= 0.0.1)
+    mgcv (>= 1.8.0)
 License: GPL-3
 Encoding: UTF-8
 LazyData: true
diff --git a/R/CANOESCOV/R/functions_CANOESCOV.R b/R/CANOESCOV/R/functions_CANOESCOV.R
index 873e894..b55c138 100644
--- a/R/CANOESCOV/R/functions_CANOESCOV.R
+++ b/R/CANOESCOV/R/functions_CANOESCOV.R
@@ -11,13 +11,6 @@ coverageObj1 <- function(cov_table, sampname, targets_for_chr, chr){
   return(list(Y=Y))
 }
 
-# Constants
-NUM.ABNORMAL.STATES=2
-NUM.STATES=3
-DELETION=1
-NORMAL=2
-DUPLICATION=3
-
 # CallCNVs
 #     Calls CNVs in sample of interest
 # Arguments:
@@ -53,7 +46,7 @@ DUPLICATION=3
 #      TARGETS: target numbers of CNV in the form start..stop
 #      NUM_TARG: how many targets are in the CNV
 #      Q_SOME: a Phred-scaled quality score for the CNV
-CallCNVs <- function(sample.name, counts, p=1e-08, Tnum=6, D=70000, reference_set_select_method='canoes', num_of_samples_in_reference_set=30, get.dfs=F, homdel.mean=0.2, target_length){
+CallCNVs <- function(sample.name, reference.samples, counts, p=1e-08, Tnum=6, D=70000, get.dfs=F, homdel.mean=0.2){
   if (!sample.name %in% names(counts)){stop("No column for sample ", sample.name, " in counts matrix")}
   if (length(setdiff(names(counts)[1:5], c("target", "chromosome", "start", "end", "gc"))) > 0){
     stop("First five columns of counts matrix must be target, chromosome, start, end, gc")
@@ -97,12 +90,12 @@ CallCNVs <- function(sample.name, counts, p=1e-08, Tnum=6, D=70000, reference_se
   #reference.samples <- names(sort(covariances, 
   #        decreasing=T)[1:min(numrefs, length(covariances))])
   Y <- data.matrix(counts[,6:ncol(counts)])
-  library('REFERENCE.SAMPLE.SET.SELECTOR')
-  reference.samples <- run_REFERENCE.SAMPLE.SET.SELECTOR(sample.name,
-                                                         Y,
-                                                         reference_set_select_method,
-                                                         num_of_samples_in_reference_set,
-                                                         target_length)
+  #library('REFERENCE.SAMPLE.SET.SELECTOR')
+  #reference.samples <- run_REFERENCE.SAMPLE.SET.SELECTOR(sample.name,
+  #                                                       Y,
+  #                                                       reference_set_select_method,
+  #                                                       num_of_samples_in_reference_set,
+  #                                                       target_length)
   sample.mean.counts <- mean(counts[, sample.name])
   sample.sumcounts <- apply(counts[, reference.samples], 2, sum)
   # normalize reference samples to sample of interest
@@ -168,469 +161,3 @@ CallCNVs <- function(sample.name, counts, p=1e-08, Tnum=6, D=70000, reference_se
   }
   return(cnvs)
 }
-
-# GenotypeCNVs
-#     Genotype CNVs in sample of interest
-# Arguments:
-#   xcnv
-#     data frame with the following columns, and one row for each
-#     CNV to genotype
-#      INTERVAL: CNV coordinates in the form chr:start-stop
-#      TARGETS: target numbers of CNV in the form start..stop
-#               these should correspond to the target numbers in counts
-#   sample.name:
-#     sample to genotype CNVs in (should correspond to a column in counts)
-#   counts: 
-#     count matrix, first five columns should be 
-#       target: consecutive numbers for targets (integer)
-#       chromosome: chromosome number (integer-valued) 
-#         (support for sex chromosomes to come)
-#       start: start position of probe (integer)
-#       end: end position of probe (integer)
-#       gc: gc content (real between 0 and 1)
-#       subsequent columns should include counts for each probe for samples
-#   p:
-#     average rate of occurrence of CNVs (real) default is 1e-08
-#   D:
-#     expected distance between targets in a CNV (integer) default is 70,000
-#   Tnum:
-#     expected number of targets in a CNV (integer) default is 6
-#   numrefs
-#     maximum number of reference samples to use (integer) default is 30
-#     the weighted variance calculations will take a long time if too 
-#     many reference samples are used
-#   emission.probs and distances are for internal use only
-# Returns: 
-#   data frame with the following columns and one row for each genotyped CNV:
-#      INTERVAL: CNV coordinates in the form chr:start-stop
-#      NQDEL: a Phred-scaled quality score that sample.name has no deletion 
-#             in the interval
-#      SQDEL: a Phred-scaled quality score that sample.name has a deletion 
-#             in the interval
-#      NQDUP and SQDUP: same, but for a duplication
-GenotypeCNVs <- function(xcnvs, sample.name, counts, p=1e-08, Tnum=6, 
-                    D=70000, numrefs=30,
-                    emission.probs=NULL, 
-                    distances=NULL){
-  if (!sample.name %in% names(counts)){stop("No column for sample ", sample.name, " in counts matrix")}
-  if (length(setdiff(names(counts)[1:5], c("target", "chromosome", "start", "end", "gc"))) > 0){
-    stop("First five columns of counts matrix must be target, chromosome, start, end, gc")
-  }
-  if (length(setdiff(unique(counts$chromosome), seq(1:22))) > 0) {
-    # remove sex chromosomes
-    cat("Trying to remove sex chromosomes and 'chr' prefixes\n")
-    counts <- subset(counts, !chromosome %in% c("chrX", "chrY", "X", "Y"))
-    if (sum(grepl("chr", counts$chromosome))==length(counts$chromosome)){
-      counts$chromosome <- gsub("chr", "", counts$chromosome)
-    }
-    counts$chromosome <- as.numeric(counts$chromosome)
-    if (length(setdiff(unique(counts$chromosome), seq(1:22))) > 0) 
-      stop("chromosome must take value in range 1-22 (support for sex chromosomes to come)")
-  }
-  library(plyr)
-  counts <- arrange(counts, chromosome, start)
-  if (p <= 0){
-    stop("parameter p must be positive")
-  }
-  if (Tnum <= 0){
-    stop("parameter Tnum must be positive")
-  }
-  if (D <= 0){
-    stop("parameter D must be positive")
-  }
-  if (numrefs <= 0){
-    stop("parameter numrefs must be positive")
-  }
-  num.cnvs <- nrow(xcnvs)
-  cnv.intervals <- as.character(xcnvs$INTERVAL)
-  # if no emission probs matrix is passed in, generate a new one
-  if (is.null(emission.probs)){
-    l <- CANOESCOV::CallCNVs(sample.name, counts, p, Tnum=6, D=70000, numrefs=30, get.dfs=T)
-    emission.probs <- l[['emission.probs']]
-    distances <- l[['distances']]
-  }
-  forward.m <- GetForwardMatrix(emission.probs, distances, p, Tnum, D)
-  backward.m <- GetBackwardMatrix(emission.probs, distances, p, Tnum, D)
-  qualities <- matrix(0, nrow=num.cnvs, ncol=5, 
-                      dimnames=list(cnv.intervals, 
-                                    c("INTERVAL", "NQDel", "SQDel", "NQDup", "SQDup")))
-  for (i in 1:num.cnvs){
-    interval <- as.character(xcnvs[i, "INTERVAL"])
-    targets <- as.numeric(strsplit(as.character(xcnvs[i, "TARGETS"]), ".", fixed=T)[[1]][c(1,3)])
-    left.target <- targets[1]
-    right.target <- targets[2]
-    likelihoods <- GetModifiedLikelihood(forward.m, backward.m, 
-                                         emission.probs, distances, 
-                                         left.target, right.target, 
-                                         c(DUPLICATION, DELETION), p, Tnum, D)
-    modified.likelihood <- likelihoods[1]; 
-    unmodified.likelihood <- likelihoods[2]
-    Prob.All.Normal <- exp(modified.likelihood - unmodified.likelihood)
-    likelihoods <- GetModifiedLikelihood(forward.m, backward.m, 
-                                         emission.probs, distances, 
-                                         left.target, right.target, DELETION, p, Tnum, D)
-    modified.likelihood <- likelihoods[1]; 
-    unmodified.likelihood <- likelihoods[2]
-    Prob.No.Deletion <- exp(modified.likelihood - unmodified.likelihood)
-    likelihoods <- GetModifiedLikelihood(forward.m, backward.m, 
-                                         emission.probs, distances, 
-                                         left.target, right.target, DUPLICATION, p, Tnum, D)
-    modified.likelihood <- likelihoods[1]; 
-    unmodified.likelihood <- likelihoods[2]
-    Prob.No.Duplication <- exp(modified.likelihood - unmodified.likelihood)
-    # Check if probabilities greater than 1 are numerical error or bug
-    Phred <- function(prob){
-      return(round(min(99, -10 * log10(1 - prob))))
-    }
-    qualities[i, "NQDel"] <- Phred(Prob.No.Deletion)       
-    qualities[i, "SQDel"] <- Phred(Prob.No.Duplication - Prob.All.Normal)
-    qualities[i, "NQDup"] <- Phred(Prob.No.Duplication)       
-    qualities[i, "SQDup"] <- Phred(Prob.No.Deletion - Prob.All.Normal)
-    qualities[i, "INTERVAL"] <- interval
-  }
-  qualities <- as.data.frame(qualities, stringsAsFactors=F)
-  qualities$NQDel <- as.integer(qualities$NQDel)
-  qualities$NQDup <- as.integer(qualities$NQDup)
-  qualities$SQDel <- as.integer(qualities$SQDel)
-  qualities$SQDup <- as.integer(qualities$SQDup)
-  return(qualities)
-}
-
-# returns data frame with distance to each target from the previous target 
-# (0 in the case of the first target on chromosome 1, a very big number
-# for the first target on each other chromosome--this resets the HMM
-# for each chromosome)
-GetDistances <- function(counts){
-  chromosome <- counts[, "chromosome"]
-  startbase <- counts[, "start"]
-  num.nonzero.exons <- length(startbase)
-  distances <- c(0, startbase[2:num.nonzero.exons] - 
-                   startbase[1:(num.nonzero.exons - 1)] + 
-                   1000000000000 * (chromosome[2:num.nonzero.exons] - 
-                                      chromosome[1:(num.nonzero.exons - 1)]))
-  return(data.frame(target=counts[, "target"], distance=distances))
-}
-
-EstimateVariance <- function(counts, ref.sample.names, sample.weights){
-  library(Hmisc)
-  counts$var <- apply(counts[, ref.sample.names], 1, wtd.var, sample.weights, normwt=T)
-  set.seed(1)
-  counts.subset <- counts[sample(nrow(counts), min(36000, nrow(counts))), ]
-  library(mgcv)
-  # can't do gamma regression with negative 
-  counts.subset$var[counts.subset$var==0] <- 0.1 
-  fit <- gam(var ~ s(mean) + s(gc), family=Gamma(link=log), data=counts.subset)
-  # we don't want variance less than Poisson
-  # we take maximum of genome-wide estimate, method of moments estimate
-  # and Poisson variance
-  v.estimate <- pmax(predict(fit, counts, type="response"), counts$var, 
-                     counts$mean * 1.01)
-  return(data.frame(target=counts$target, var.estimate=v.estimate))
-}
-
-EmissionProbs <- function(test.counts, target.means, 
-                                      var.estimate, targets){
-  num.targets <- length(test.counts)
-  # calculate the means for the deletion, normal and duplication states
-  state.target.means <- t(apply(data.frame(x=target.means), 1, function(x) c(x*1/2, x, x*3/2)))
-  # calculate the expected size (given the predicted variance)
-  size <- target.means ^ 2 / (var.estimate - target.means)
-  emission.probs <- matrix(NA, num.targets, 4)
-  colnames(emission.probs) <- c("target", "delprob", "normalprob", "dupprob")
-  # calculate the emission probabilities given the read count
-  size.del <- size
-  size.dup <- size
-  size.del <- size / 2
-  size.dup <- size * 3 / 2
-  emission.probs[, "delprob"] <- dnbinom(
-    test.counts,
-    mu=state.target.means[, 1],
-    size=size.del, log=T)
-  emission.probs[, "normalprob"] <- dnbinom(
-    test.counts,
-    mu=state.target.means[, 2],
-    size=size, log=T)
-  emission.probs[, "dupprob"] <- dnbinom(
-    test.counts,
-    mu=state.target.means[, 3],
-    size=size.dup, log=T)
-  emission.probs[, "target"] <- targets
-  # some values may be infinite as a result of extreme read count
-  row.all.inf <- which(apply(emission.probs, 1, function(x){all(is.infinite(x))}))
-  if (length(row.all.inf) > 0){
-    for (i in row.all.inf){
-      if (test.counts[i] >= state.target.means[i, 3]){
-        emission.probs[i, 2:4] <- c(-Inf, -Inf, -0.01)
-      }
-      else if (test.counts[i] <= state.target.means[i, 1]){
-        emission.probs[i, 2:4] <- c(-0.01, -Inf, -Inf)
-      }
-      else emission.probs[i, 2:4] <- c(-Inf, -0.01, -Inf)
-    }
-  }
-  return(emission.probs)
-}
-
-# Viterbi algorithm
-Viterbi <- function(emission.probs.matrix, distances, p, Tnum, D){
-  targets <- emission.probs.matrix[, 1]
-  emission.probs.matrix <- as.matrix(emission.probs.matrix[, 2:4])
-  num.exons <- dim(emission.probs.matrix)[1]
-  viterbi.matrix <- matrix(NA, nrow=num.exons, ncol=NUM.STATES)
-  viterbi.pointers <- matrix(NA, nrow=num.exons, ncol=NUM.STATES)
-  initial.state <- log(c(0.0075 / NUM.ABNORMAL.STATES, 1 - 0.0075, 0.0075 / NUM.ABNORMAL.STATES))
-  viterbi.matrix[1, ] <- initial.state + emission.probs.matrix[1,]
-  for (i in 2:num.exons) {
-    temp.matrix <- viterbi.matrix[i - 1, ] + GetTransitionMatrix(distances$distance[i], p, Tnum, D)
-    viterbi.matrix[i, ] <- apply(temp.matrix, 2, max)
-    emission.probs <- c(emission.probs.matrix[i,])
-    dim(emission.probs) <- c(NUM.STATES, 1)
-    viterbi.matrix[i, ] <- viterbi.matrix[i, ] + emission.probs
-    viterbi.pointers[i, ] <- apply(temp.matrix, 2, which.max)
-  }
-  viterbi.states = vector(length = num.exons)
-  viterbi.states[num.exons] = which.max(viterbi.matrix[num.exons, ])
-  for (i in (num.exons - 1):1) {
-    viterbi.states[i] <- viterbi.pointers[i + 1, viterbi.states[i + 1]]
-  }
-  return(data.frame(target=targets, viterbi.state=viterbi.states))
-}
-
-# returns a transition matrix
-#                              to state
-#                    deletion   normal    duplication
-#           deletion   
-#from state   normal
-#        duplication
-GetTransitionMatrix <- function(distance, p, Tnum, D){
-  q <- 1 / Tnum
-  f = exp(-distance/D)
-  prob.abnormal.abnormal <- f * (1 - q) + (1 - f) * p
-  prob.abnormal.normal <- f * q + (1 - f) * (1 - 2 * p)
-  prob.abnormal.diff.abnormal <- (1 - f) * p
-  prob.normal.normal <- 1 - 2 * p
-  prob.normal.abnormal <- p
-  transition.probs <- 
-    c(prob.abnormal.abnormal, prob.abnormal.normal, prob.abnormal.diff.abnormal, 
-      prob.normal.abnormal, prob.normal.normal, prob.normal.abnormal,
-      prob.abnormal.diff.abnormal, prob.abnormal.normal, prob.abnormal.abnormal)
-  transition.m = log(matrix(transition.probs, NUM.STATES, NUM.STATES, byrow=TRUE))
-  return(transition.m)
-}
-
-# adds two log-space probabilities using the identity
-# log (p1 + p2) = log p1 + log(1 + exp(log p2 - log p1))
-AddTwoProbabilities <- function(x, y){
-  if (is.infinite(x)) return (y)
-  if (is.infinite(y)) return (x)
-  sum.probs <- max(x, y) + log1p(exp(-abs(x - y)))
-}
-
-# adds multiple log-space probabilities
-SumProbabilities <- function(x){
-  sum.probs <- x[1]
-  for (i in 2:length(x)){
-    sum.probs <- AddTwoProbabilities(sum.probs, x[i])
-  }
-  return(sum.probs)
-}
-
-# finds the data likelihood by summing the product of the corresponding 
-# forward and backward probabilities at any token (should give the same value
-# regardless of the token)
-GetLikelihood <- function(forward.matrix, backward.matrix, x){
-  SumProbabilities(forward.matrix[x, ] + backward.matrix[x, ])
-}
-
-# get the forward probabilities
-GetForwardMatrix <- function(emission.probs.matrix, distances, p, Tnum, D){
-  emission.probs.matrix <- as.matrix(emission.probs.matrix[, 2:4])
-  num.exons <- dim(emission.probs.matrix)[1]
-  forward.matrix <- matrix(NA, nrow=num.exons, ncol=NUM.STATES)   # matrix to hold forward probabilities
-  initial.state <- log(c(0.0075 / NUM.ABNORMAL.STATES, 1 - 0.0075, 0.0075 / NUM.ABNORMAL.STATES))
-  forward.matrix[1, ] <- initial.state + emission.probs.matrix[1, ]
-  for (i in 2:num.exons){
-    # compute matrix with probability we were in state j and are now in state i
-    # in temp.matrix[j, i] (ignoring emission of current token)
-    temp.matrix <- forward.matrix[i - 1, ] + GetTransitionMatrix(distances$distance[i], p, Tnum, D)
-    # find the probability that we are in each of the three states
-    sum.probs <- apply(temp.matrix, 2, SumProbabilities)
-    forward.matrix[i, ] <- sum.probs + emission.probs.matrix[i, ]
-  }  
-  return(forward.matrix)  
-}
-
-# get the backward probabilities
-GetBackwardMatrix <- function(emission.probs.matrix, distances, 
-                                  p, Tnum, D){
-  emission.probs.matrix <- as.matrix(emission.probs.matrix[, 2:4])
-  num.exons <- dim(emission.probs.matrix)[1]
-  backward.matrix <- matrix(NA, nrow=num.exons, ncol=NUM.STATES)   # matrix to hold backward probabilities
-  initial.state <- log(c(0.0075 / NUM.ABNORMAL.STATES, 1 - 0.0075, 0.0075 / NUM.ABNORMAL.STATES))
-  backward.matrix[num.exons, ] <- rep(0, NUM.STATES)
-  for (i in (num.exons - 1):1){
-    temp.matrix <- GetTransitionMatrix(distances$distance[i+1], p, Tnum, D) + 
-      matrix(backward.matrix[i + 1, ], 3, 3, byrow=T) +
-      matrix(emission.probs.matrix[i+1, ], 3, 3, byrow=T)
-    backward.matrix[i, ] <- apply(temp.matrix, 1, SumProbabilities)
-  }  
-  final.prob <- backward.matrix[1, ] + emission.probs.matrix[1, ] + initial.state
-  return(backward.matrix)  
-}
-
-# find the likelihood of the data given that certain states are disallowed
-# between start target and end target
-GetModifiedLikelihood <- function(forward.matrix, backward.matrix, emission.probs.matrix, distances, 
-                                      start.target, end.target, disallowed.states, p, Tnum, D){
-  targets <- emission.probs.matrix[, 1]
-  emission.probs.matrix <- as.matrix(emission.probs.matrix[, 2:4])
-  # there may be missing targets in this sample, we genotype the largest stretch of 
-  # targets that lie in the CNV
-  left.target <- min(which(targets >= start.target))
-  right.target <- max(which(targets <= end.target))
-  num.exons <- dim(emission.probs.matrix)[1]
-  unmodified.likelihood <- GetLikelihood(forward.matrix, 
-                                             backward.matrix, min(right.target + 1, num.exons))
-  #right.target or left.target may be empty
-  
-  #if (right.target >= left.target) return(c(NA, unmodified.likelihood))
-  stopifnot(right.target >= left.target)
-  modified.emission.probs.matrix <- emission.probs.matrix
-  modified.emission.probs.matrix[left.target:right.target, 
-                                 disallowed.states] <- -Inf
-  
-  # if the start target is the first target we need to recalculate the 
-  # forward probabilities
-  # for that target, using the modified emission probabilities
-  if (left.target == 1){
-    initial.state <- log(c(0.0075 / NUM.ABNORMAL.STATES, 1 - 0.0075, 0.0075 / NUM.ABNORMAL.STATES))
-    forward.matrix[1, ] <- initial.state + modified.emission.probs.matrix[1, ]
-    left.target <- left.target + 1
-  } 
-  for (i in seq(left.target, min(right.target + 1, num.exons))){
-    # compute matrix with probability we were in state j and are now in state i
-    # in temp.matrix[j, i] (ignoring emission of current token)
-    temp.matrix <- forward.matrix[i - 1, ] + GetTransitionMatrix(distances$distance[i], p, Tnum, D)
-    # find the probability that we are in each of the three states
-    sum.probs <- apply(temp.matrix, 2, SumProbabilities) 
-    if (!i == (right.target + 1)){
-      forward.matrix[i, ] <- sum.probs + modified.emission.probs.matrix[i, ]
-    } else{
-      forward.matrix[i, ] <- sum.probs + emission.probs.matrix[i, ]
-    }
-  }  
-  # find the modified likelihood of the sequence
-  modified.likelihood <- GetLikelihood(forward.matrix, backward.matrix, min(right.target + 1, num.exons))
-  return(c(modified.likelihood, unmodified.likelihood))
-}
-
-SummarizeCNVs <- function(cnv.targets, counts, sample.name, state){
-  sample.name <- sample.name
-  cnv.type <- ifelse(state==3, "DUP", "DEL")
-  cnv.start <- min(cnv.targets$target)
-  cnv.end <- max(cnv.targets$target)
-  cnv.chromosome <- counts[cnv.start, "chromosome"]
-  cnv.start.base <- counts[cnv.start, "start"]
-  cnv.start.target <- counts[cnv.start, "target"]
-  cnv.end.base <- counts[cnv.end, "end"]
-  cnv.end.target <- counts[cnv.end, "target"]
-  cnv.kbs <- (cnv.end.base - cnv.start.base) / 1000
-  cnv.midbp <- round((cnv.end.base - cnv.start.base) / 2) + cnv.start.base
-  cnv.targets <- paste(cnv.start.target, "..", cnv.end.target, sep="")
-  cnv.interval <- paste(cnv.chromosome, ":", cnv.start.base, "-", cnv.end.base, sep="")
-  num.targets <- cnv.end.target - cnv.start.target + 1
-  return(data.frame(sample.name=sample.name, cnv.type=cnv.type, cnv.interval=cnv.interval, 
-                    cnv.kbs=cnv.kbs, cnv.chromosome=cnv.chromosome, 
-                    cnv.midbp=cnv.midbp, cnv.targets=cnv.targets, num.targets=num.targets))
-}
-
-PrintCNVs <- function(test.sample.name, viterbi.state, 
-                      nonzero.counts){  
-  consecutiveGroups <- function(sequence){
-    num <- length(sequence)
-    group <- 1
-    groups <- rep(0, num)
-    groups[1] <- group
-    if (num > 1){
-      for (i in 2:num){
-        if (!sequence[i] == (sequence[i - 1] + 1)) group <- group + 1
-        groups[i] <- group
-      }
-    }
-    return(groups)
-  }
-  num.duplications <- 0
-  num.deletions <- 0
-  for (state in c(1, 3)){
-    cnv.targets <- which(viterbi.state$viterbi.state == state)
-    if (!length(cnv.targets) == 0){
-      groups <- consecutiveGroups(cnv.targets)
-      library(plyr)
-      cnvs.temp.df <- ddply(data.frame(target=cnv.targets, group=groups), 
-                            "group", SummarizeCNVs, nonzero.counts, test.sample.name, 
-                            state)
-      if (state == 1){
-        deletions.df <- cnvs.temp.df
-        if (!is.null(dim(deletions.df))){
-          num.deletions <- dim(deletions.df)[1]
-        }
-      } else {
-        duplications.df <- cnvs.temp.df
-        if (!is.null(dim(duplications.df))){
-          num.duplications <- dim(duplications.df)[1]
-        }
-      }
-    }
-  }
-  num.calls <- num.deletions + num.duplications
-  cat(num.calls, "CNVs called in sample", test.sample.name, "\n")
-  if (num.deletions == 0 & num.duplications == 0){
-    df <- data.frame(SAMPLE=character(0), CNV=character(0), INTERVAL=character(0), 
-                     KB=numeric(0), CHR=character(0), 
-                     MID_BP=numeric(), TARGETS=character(0), NUM_TARG=numeric(0), Q_SOME=numeric(0), MLCN=numeric(0))
-    return(df)
-  }
-  if (num.deletions > 0 & num.duplications > 0){
-    cnvs.df <- rbind(deletions.df, duplications.df)
-  } else {
-    ifelse(num.deletions > 0, 
-           cnvs.df <- deletions.df, cnvs.df <- duplications.df)
-  }
-  xcnv <- cbind(cnvs.df[, c("sample.name", "cnv.type", "cnv.interval", 
-                      "cnv.kbs", "cnv.chromosome", "cnv.midbp", 
-                      "cnv.targets", "num.targets")], 0)
-  colnames(xcnv) <- c("SAMPLE", "CNV", "INTERVAL", "KB", "CHR", "MID_BP", "TARGETS",
-                      "NUM_TARG", "MLCN")
-  xcnv$Q_SOME <- NA
-  return(xcnv)
-}
-
-CalcCopyNumber <- function(data, cnvs, homdel.mean){
-  for (i in 1:nrow(cnvs)){
-    cnv <- cnvs[i, ]
-    targets <- as.numeric(unlist(strsplit(as.character(cnv$TARGETS), "..", fixed=T)))
-    cnv.data <- subset(data, target >= targets[1] & target <= targets[2])
-    state.target.means <- t(apply(data.frame(x=cnv.data$countsmean), 1, 
-                                  function(x) c(C1=x*1/2, C2=x, C3=x*3/2, 
-                                                C4=x * 2, C5=x * 5/2, C6=x*6/2)))
-    # calculate the expected size (given the predicted variance)
-    size <- cnv.data$countsmean ^ 2 / (cnv.data$varestimate - cnv.data$countsmean)
-    emission.probs <- matrix(NA, nrow(cnv.data), 7)
-    colnames(emission.probs) <- c("C0", "C1", "C2", "C3", "C4", "C5", "C6")
-    #colnames(emission.probs) <- c("target", "delprob", "normalprob", "dupprob")
-    # calculate the emission probabilities given the read count
-    emission.probs[, 1] <- dpois(cnv.data$sample, homdel.mean, log=T)
-    for (s in 1:6){
-      size.state <- size * s/2
-      emission.probs[, s+1] <- dnbinom(cnv.data$sample, mu=state.target.means[, s], 
-                                       size=size.state, log=T)
-    }
-    cs <- colSums(emission.probs)
-    ml.state <- which.max(cs) - 1
-    if (ml.state==2){
-      ml.state <- ifelse(cnv$CNV=="DEL", 1, 3)
-    }
-    cnvs$MLCN[i] <- ml.state
-  }  
-  return(cnvs)
-}
diff --git a/R/CANOESCOV/R/run_CANOESCOV.R b/R/CANOESCOV/R/run_CANOESCOV.R
index 980f6b1..f6742c7 100644
--- a/R/CANOESCOV/R/run_CANOESCOV.R
+++ b/R/CANOESCOV/R/run_CANOESCOV.R
@@ -1,10 +1,13 @@
 library(methods)
 library(CODEX)
 
-run_CANOESCOV <- function(reference_set_select_method,
-                          num_of_samples_in_reference_set,
-                          cov_table){
+run_CANOESCOV <- function(input_cov_table,
+                          reference_sample_set_file,
+                          output_calls_file){
 
+  con <- file(reference_sample_set_file, open='r')
+  reference_sample_set <- readLines(con)
+  cov_table <- read.csv(input_cov_table)
   sampname <- unique(cov_table[,"sample_name"])
   targets <- cov_table[,c("target_id", "chr", "pos_min", "pos_max")]
   targets <- targets[!duplicated(targets[,"target_id"]),]
@@ -37,12 +40,16 @@ run_CANOESCOV <- function(reference_set_select_method,
     colnames(canoes.reads) <- c("target", "gc", "chromosome", "start", "end", sampname)
     write.table(as.data.frame(canoes.reads),file="canoes.reads.csv", quote=F, sep=",",row.names=T,col.names=T)
     xcnv.list <- vector('list', length(sampname))
-    for (i in 1:length(sampname)){
-      xcnv.list[[i]] <- CANOESCOV::CallCNVs(sample.name=sampname[i],
-                                            counts=canoes.reads,
-                                            reference_set_select_method=reference_set_select_method,
-                                            num_of_samples_in_reference_set=num_of_samples_in_reference_set,
-                                            target_length=target_length)
+    for (i in 1:length(reference_sample_set)) {
+      if (reference_sample_set[[i]] == '') {
+        next()
+      }
+      samples <- unlist(strsplit(reference_sample_set[[i]], ','))
+      actual_sample <- samples[1]
+      reference_samples <- samples[-1]
+      xcnv.list[[i]] <- CANOESCOV::CallCNVs(sample.name=actual_sample,
+                                            reference.samples=reference_samples,
+                                            counts=canoes.reads)
     }
     xcnvs <- do.call('rbind', xcnv.list)
     if (nrow(calls)==0){calls <- matrix(nrow=0, ncol=ncol(xcnvs))} 
@@ -80,7 +87,7 @@ run_CANOESCOV <- function(reference_set_select_method,
   calls[colnames(calls) == 'ed_bp'] <- as.character(unlist(calls[colnames(calls) == 'ed_bp']))
   calls[colnames(calls) == 'st_exon'] <- as.character(unlist(calls[colnames(calls) == 'st_exon']))
   calls[colnames(calls) == 'ed_exon'] <- as.character(unlist(calls[colnames(calls) == 'ed_exon']))
-  calls
+  write.csv(calls, output_calls_file, row.names=F)
 }
 
 #   SAMPLE CNV             INTERVAL     KB CHR   MID_BP    TARGETS NUM_TARG MLCN Q_SOME
diff --git a/R/CODEXCOV/DESCRIPTION b/R/CODEXCOV/DESCRIPTION
index d9e2385..f6516a3 100755
--- a/R/CODEXCOV/DESCRIPTION
+++ b/R/CODEXCOV/DESCRIPTION
@@ -14,8 +14,7 @@ Depends:
     devtools (>= 1.13.2),
     DBI (== 0.8),
     optparse (== 1.4.4),
-    CODEX (>= 1.8.0),
-    REFERENCE.SAMPLE.SET.SELECTOR (>= 0.0.1)
+    CODEX (>= 1.8.0)
 License: GPL-3
 Encoding: UTF-8
 LazyData: true

From 14b19674f65e21de512c5bd3ca385b7f49d5cfe1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Wiktor=20Ku=C5=9Bmirek?= <kusmirekwiktor@gmail.com>
Date: Mon, 26 Mar 2018 17:44:04 +0200
Subject: [PATCH 028/114] bugfix

---
 R/CODEXCOV/R/run_CODEXCOV.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/R/CODEXCOV/R/run_CODEXCOV.R b/R/CODEXCOV/R/run_CODEXCOV.R
index 246aa22..a1ce428 100644
--- a/R/CODEXCOV/R/run_CODEXCOV.R
+++ b/R/CODEXCOV/R/run_CODEXCOV.R
@@ -50,7 +50,7 @@ run_CODEXCOV <- function(K_from,
       samples <- unlist(strsplit(reference_sample_set[[i]], ','))
       actual_sample <- samples[1]
       reference_samples <- samples[-1]
-      samples <- samples[order(samples[,1]),]
+      samples <- sort(samples)
       Y_subset <- Y[,samples]
 
       ###################################################

From 02e0c01e9f28612218bd8081887f5fb370aea86e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Wiktor=20Ku=C5=9Bmirek?= <kusmirekwiktor@gmail.com>
Date: Mon, 26 Mar 2018 18:05:50 +0200
Subject: [PATCH 029/114] finished dag file for CODEX cnv caller

---
 airflow/dags/codex.py | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/airflow/dags/codex.py b/airflow/dags/codex.py
index 55ee94b..1da827f 100755
--- a/airflow/dags/codex.py
+++ b/airflow/dags/codex.py
@@ -36,11 +36,19 @@
 num_refs = '30'
 reference_sample_set_file = 'reference_sample_set.csv'
 
+### codex parameters
+k_from = '1'
+k_to = '3'
+lmax = '200'
+output_calls_file = 'calls.csv'
+
 run_codex_caller_cmd= " \
 docker pull biodatageeks/cnv-opt-target-qc; \
 docker run --rm -v /tmp:/tmp -w=\"/tmp\" biodatageeks/cnv-opt-target-qc Rscript -e \"library(\'TARGET.QC\');run_TARGET.QC(" + mapp_thresh + "," + cov_thresh_from + "," + cov_thresh_to + "," + length_thresh_from + "," + length_thresh_to + "," + gc_thresh_from + "," + gc_thresh_to + ",'" + raw_cov_table + "','" + qc_cov_table + "')\"; \
 docker pull biodatageeks/cnv-opt-reference-sample-set-selector; \
 docker run --rm -v /tmp:/tmp -w=\"/tmp\" biodatageeks/cnv-opt-reference-sample-set-selector Rscript -e \"library(\'REFERENCE.SAMPLE.SET.SELECTOR\');run_REFERENCE.SAMPLE.SET.SELECTOR('" + select_method + "'," + num_refs + ",'" + qc_cov_table + "','" + reference_sample_set_file + "')\"; \
+docker pull biodatageeks/cnv-opt-codexcov; \
+docker run --rm -v /tmp:/tmp -w=\"/tmp\" biodatageeks/cnv-opt-codexcov Rscript -e \"library(\'CODEXCOV\');run_CODEXCOV(" + k_from + "," + k_to + "," + lmax + ",'" + qc_cov_table + "','" + reference_sample_set_file + "','" + output_calls_file + "')\"; \
 "
 
 run_codex_caller_task= BashOperator (

From 066be6833ec83ccb108bda1751d75ce853a73090 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Wiktor=20Ku=C5=9Bmirek?= <kusmirekwiktor@gmail.com>
Date: Mon, 26 Mar 2018 18:15:26 +0200
Subject: [PATCH 030/114] IRanges in docker for CANOESCOV package

---
 Docker/cnv-opt-canoescov/Dockerfile | 1 +
 1 file changed, 1 insertion(+)

diff --git a/Docker/cnv-opt-canoescov/Dockerfile b/Docker/cnv-opt-canoescov/Dockerfile
index 44080ff..402047a 100644
--- a/Docker/cnv-opt-canoescov/Dockerfile
+++ b/Docker/cnv-opt-canoescov/Dockerfile
@@ -1,4 +1,5 @@
 FROM biodatageeks/cnv-opt-canoes
 MAINTAINER biodatageeks <team@biodatageeks.ii.pw.edu.pl>
 
+RUN Rscript -e "source('https://bioconductor.org/biocLite.R');biocLite('IRanges')"
 RUN Rscript -e "install.packages('CANOESCOV', repos = 'http://zsibio.ii.pw.edu.pl/nexus/repository/r-all')"

From f9f4ffd5c65e390c55d250a70ad98479f40e6555 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Wiktor=20Ku=C5=9Bmirek?= <kusmirekwiktor@gmail.com>
Date: Mon, 26 Mar 2018 18:29:19 +0200
Subject: [PATCH 031/114] add getgc function to CANOESCOV package

---
 Docker/cnv-opt-canoescov/Dockerfile |  6 ++++++
 R/CANOESCOV/R/functions_CANOESCOV.R | 19 +++++++++++++++++++
 R/CANOESCOV/R/run_CANOESCOV.R       |  2 +-
 3 files changed, 26 insertions(+), 1 deletion(-)

diff --git a/Docker/cnv-opt-canoescov/Dockerfile b/Docker/cnv-opt-canoescov/Dockerfile
index 402047a..fa7683b 100644
--- a/Docker/cnv-opt-canoescov/Dockerfile
+++ b/Docker/cnv-opt-canoescov/Dockerfile
@@ -2,4 +2,10 @@ FROM biodatageeks/cnv-opt-canoes
 MAINTAINER biodatageeks <team@biodatageeks.ii.pw.edu.pl>
 
 RUN Rscript -e "source('https://bioconductor.org/biocLite.R');biocLite('IRanges')"
+RUN Rscript -e "source('https://bioconductor.org/biocLite.R');biocLite('BSgenome.Hsapiens.UCSC.hg19')"
+RUN Rscript -e "source('https://bioconductor.org/biocLite.R');biocLite('Biostrings')"
+RUN Rscript -e "source('https://bioconductor.org/biocLite.R');biocLite('Rsamtools')"
+RUN Rscript -e "source('https://bioconductor.org/biocLite.R');biocLite('GenomeInfoDb')"
+RUN Rscript -e "source('https://bioconductor.org/biocLite.R');biocLite('S4Vectors')"
+
 RUN Rscript -e "install.packages('CANOESCOV', repos = 'http://zsibio.ii.pw.edu.pl/nexus/repository/r-all')"
diff --git a/R/CANOESCOV/R/functions_CANOESCOV.R b/R/CANOESCOV/R/functions_CANOESCOV.R
index b55c138..bac5cc0 100644
--- a/R/CANOESCOV/R/functions_CANOESCOV.R
+++ b/R/CANOESCOV/R/functions_CANOESCOV.R
@@ -11,6 +11,25 @@ coverageObj1 <- function(cov_table, sampname, targets_for_chr, chr){
   return(list(Y=Y))
 }
 
+
+# from CODEX package
+getgc <- function(chr, ref) {
+    if (chr == "X" | chr == "x" | chr == "chrX" | chr == "chrx") {
+        chrtemp <- 23
+    } else if (chr == "Y" | chr == "y" | chr == "chrY" | chr == "chry") {
+        chrtemp <- 24
+    } else {
+        chrtemp <- as.numeric(mapSeqlevels(as.character(chr), "NCBI")[1])
+    }
+    if (length(chrtemp) == 0) 
+        message("Chromosome cannot be found in NCBI Homo sapiens database!")
+    chrm <- unmasked(Hsapiens[[chrtemp]])
+    seqs <- Views(chrm, ref)
+    af <- alphabetFrequency(seqs, baseOnly = TRUE, as.prob = TRUE)
+    gc <- round((af[, "G"] + af[, "C"]) * 100,2)
+    gc
+}
+
 # CallCNVs
 #     Calls CNVs in sample of interest
 # Arguments:
diff --git a/R/CANOESCOV/R/run_CANOESCOV.R b/R/CANOESCOV/R/run_CANOESCOV.R
index f6742c7..8d45c82 100644
--- a/R/CANOESCOV/R/run_CANOESCOV.R
+++ b/R/CANOESCOV/R/run_CANOESCOV.R
@@ -1,5 +1,5 @@
 library(methods)
-library(CODEX)
+#library(CODEX)
 
 run_CANOESCOV <- function(input_cov_table,
                           reference_sample_set_file,

From d510815f4a66cbfc490d3aa4649c9ca95f1fd264 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Wiktor=20Ku=C5=9Bmirek?= <kusmirekwiktor@gmail.com>
Date: Mon, 26 Mar 2018 19:30:27 +0200
Subject: [PATCH 032/114] missing libraries in getgc function

---
 R/CANOESCOV/R/functions_CANOESCOV.R | 30 +++++++++++++++--------------
 1 file changed, 16 insertions(+), 14 deletions(-)

diff --git a/R/CANOESCOV/R/functions_CANOESCOV.R b/R/CANOESCOV/R/functions_CANOESCOV.R
index bac5cc0..70ff41d 100644
--- a/R/CANOESCOV/R/functions_CANOESCOV.R
+++ b/R/CANOESCOV/R/functions_CANOESCOV.R
@@ -14,20 +14,22 @@ coverageObj1 <- function(cov_table, sampname, targets_for_chr, chr){
 
 # from CODEX package
 getgc <- function(chr, ref) {
-    if (chr == "X" | chr == "x" | chr == "chrX" | chr == "chrx") {
-        chrtemp <- 23
-    } else if (chr == "Y" | chr == "y" | chr == "chrY" | chr == "chry") {
-        chrtemp <- 24
-    } else {
-        chrtemp <- as.numeric(mapSeqlevels(as.character(chr), "NCBI")[1])
-    }
-    if (length(chrtemp) == 0) 
-        message("Chromosome cannot be found in NCBI Homo sapiens database!")
-    chrm <- unmasked(Hsapiens[[chrtemp]])
-    seqs <- Views(chrm, ref)
-    af <- alphabetFrequency(seqs, baseOnly = TRUE, as.prob = TRUE)
-    gc <- round((af[, "G"] + af[, "C"]) * 100,2)
-    gc
+  library(GenomeInfoDb)
+  library(BSgenome.Hsapiens.UCSC.hg19)
+  if (chr == "X" | chr == "x" | chr == "chrX" | chr == "chrx") {
+    chrtemp <- 23
+  } else if (chr == "Y" | chr == "y" | chr == "chrY" | chr == "chry") {
+    chrtemp <- 24
+  } else {
+    chrtemp <- as.numeric(mapSeqlevels(as.character(chr), "NCBI")[1])
+  }
+  if (length(chrtemp) == 0) 
+    message("Chromosome cannot be found in NCBI Homo sapiens database!")
+  chrm <- unmasked(Hsapiens[[chrtemp]])
+  seqs <- Views(chrm, ref)
+  af <- alphabetFrequency(seqs, baseOnly = TRUE, as.prob = TRUE)
+  gc <- round((af[, "G"] + af[, "C"]) * 100,2)
+  gc
 }
 
 # CallCNVs

From 43cec0bab5d5b63988d3a3ddcd88e80d7a87d65b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Wiktor=20Ku=C5=9Bmirek?= <kusmirekwiktor@gmail.com>
Date: Mon, 26 Mar 2018 21:15:24 +0200
Subject: [PATCH 033/114] bugfixes

---
 Jenkinsfile                         | 7 -------
 R/CANOESCOV/R/functions_CANOESCOV.R | 2 --
 R/CANOESCOV/R/run_CANOESCOV.R       | 7 ++++++-
 3 files changed, 6 insertions(+), 10 deletions(-)

diff --git a/Jenkinsfile b/Jenkinsfile
index ce4967f..fc11b6f 100644
--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -2,13 +2,6 @@ pipeline {
     agent any
        stages {
 
-        stage('Building Docker images') {
-                    steps {
-                        echo 'Building Docker images....'
-                        sh './build.sh'
-                    }
-                }
-
         /*stage('Test R code') {
                     steps {
                         echo 'Testing R code....'
diff --git a/R/CANOESCOV/R/functions_CANOESCOV.R b/R/CANOESCOV/R/functions_CANOESCOV.R
index 70ff41d..564f073 100644
--- a/R/CANOESCOV/R/functions_CANOESCOV.R
+++ b/R/CANOESCOV/R/functions_CANOESCOV.R
@@ -14,8 +14,6 @@ coverageObj1 <- function(cov_table, sampname, targets_for_chr, chr){
 
 # from CODEX package
 getgc <- function(chr, ref) {
-  library(GenomeInfoDb)
-  library(BSgenome.Hsapiens.UCSC.hg19)
   if (chr == "X" | chr == "x" | chr == "chrX" | chr == "chrx") {
     chrtemp <- 23
   } else if (chr == "Y" | chr == "y" | chr == "chrY" | chr == "chry") {
diff --git a/R/CANOESCOV/R/run_CANOESCOV.R b/R/CANOESCOV/R/run_CANOESCOV.R
index 8d45c82..eb1b243 100644
--- a/R/CANOESCOV/R/run_CANOESCOV.R
+++ b/R/CANOESCOV/R/run_CANOESCOV.R
@@ -1,5 +1,10 @@
 library(methods)
-#library(CODEX)
+library(IRanges)
+library(BSgenome.Hsapiens.UCSC.hg19)
+library(Biostrings)
+library(Rsamtools)
+library(GenomeInfoDb)
+library(S4Vectors)
 
 run_CANOESCOV <- function(input_cov_table,
                           reference_sample_set_file,

From f0a6ea0a5583bc11915022e834e0eba9a5e9c7ce Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Wiktor=20Ku=C5=9Bmirek?= <kusmirekwiktor@gmail.com>
Date: Mon, 26 Mar 2018 22:16:12 +0200
Subject: [PATCH 034/114] missing libraries

---
 R/CANOESCOV/R/functions_CANOESCOV.R | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/R/CANOESCOV/R/functions_CANOESCOV.R b/R/CANOESCOV/R/functions_CANOESCOV.R
index 564f073..70ff41d 100644
--- a/R/CANOESCOV/R/functions_CANOESCOV.R
+++ b/R/CANOESCOV/R/functions_CANOESCOV.R
@@ -14,6 +14,8 @@ coverageObj1 <- function(cov_table, sampname, targets_for_chr, chr){
 
 # from CODEX package
 getgc <- function(chr, ref) {
+  library(GenomeInfoDb)
+  library(BSgenome.Hsapiens.UCSC.hg19)
   if (chr == "X" | chr == "x" | chr == "chrX" | chr == "chrx") {
     chrtemp <- 23
   } else if (chr == "Y" | chr == "y" | chr == "chrY" | chr == "chry") {

From 43997d9ebf7ea83801b9981281d6449fae10a852 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Wiktor=20Ku=C5=9Bmirek?= <kusmirekwiktor@gmail.com>
Date: Mon, 26 Mar 2018 22:50:44 +0200
Subject: [PATCH 035/114] libraries in another place

---
 R/CANOESCOV/R/functions_CANOESCOV.R | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/R/CANOESCOV/R/functions_CANOESCOV.R b/R/CANOESCOV/R/functions_CANOESCOV.R
index 70ff41d..6b7fac3 100644
--- a/R/CANOESCOV/R/functions_CANOESCOV.R
+++ b/R/CANOESCOV/R/functions_CANOESCOV.R
@@ -68,6 +68,12 @@ getgc <- function(chr, ref) {
 #      NUM_TARG: how many targets are in the CNV
 #      Q_SOME: a Phred-scaled quality score for the CNV
 CallCNVs <- function(sample.name, reference.samples, counts, p=1e-08, Tnum=6, D=70000, get.dfs=F, homdel.mean=0.2){
+  library(IRanges)
+  library(BSgenome.Hsapiens.UCSC.hg19)
+  library(Biostrings)
+  library(Rsamtools)
+  library(GenomeInfoDb)
+  library(S4Vectors)
   if (!sample.name %in% names(counts)){stop("No column for sample ", sample.name, " in counts matrix")}
   if (length(setdiff(names(counts)[1:5], c("target", "chromosome", "start", "end", "gc"))) > 0){
     stop("First five columns of counts matrix must be target, chromosome, start, end, gc")

From 68507f078bdd8f847c0dcacaa8213353d8ee1d91 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Wiktor=20Ku=C5=9Bmirek?= <kusmirekwiktor@gmail.com>
Date: Tue, 27 Mar 2018 14:51:36 +0200
Subject: [PATCH 036/114] remove libraries from run_CANOESCOV function

---
 R/CANOESCOV/R/run_CANOESCOV.R | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/R/CANOESCOV/R/run_CANOESCOV.R b/R/CANOESCOV/R/run_CANOESCOV.R
index eb1b243..f85e619 100644
--- a/R/CANOESCOV/R/run_CANOESCOV.R
+++ b/R/CANOESCOV/R/run_CANOESCOV.R
@@ -1,10 +1,4 @@
 library(methods)
-library(IRanges)
-library(BSgenome.Hsapiens.UCSC.hg19)
-library(Biostrings)
-library(Rsamtools)
-library(GenomeInfoDb)
-library(S4Vectors)
 
 run_CANOESCOV <- function(input_cov_table,
                           reference_sample_set_file,

From 6e6cc476df17fbbeac45703e60f8891272f80aad Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Wiktor=20Ku=C5=9Bmirek?= <kusmirekwiktor@gmail.com>
Date: Tue, 27 Mar 2018 14:56:47 +0200
Subject: [PATCH 037/114] CODEX in CANOESCOV package (only for tests)

---
 Docker/cnv-opt-canoescov/Dockerfile | 1 +
 1 file changed, 1 insertion(+)

diff --git a/Docker/cnv-opt-canoescov/Dockerfile b/Docker/cnv-opt-canoescov/Dockerfile
index fa7683b..b138c36 100644
--- a/Docker/cnv-opt-canoescov/Dockerfile
+++ b/Docker/cnv-opt-canoescov/Dockerfile
@@ -7,5 +7,6 @@ RUN Rscript -e "source('https://bioconductor.org/biocLite.R');biocLite('Biostrin
 RUN Rscript -e "source('https://bioconductor.org/biocLite.R');biocLite('Rsamtools')"
 RUN Rscript -e "source('https://bioconductor.org/biocLite.R');biocLite('GenomeInfoDb')"
 RUN Rscript -e "source('https://bioconductor.org/biocLite.R');biocLite('S4Vectors')"
+RUN Rscript -e "source('https://bioconductor.org/biocLite.R');biocLite('CODEX')"
 
 RUN Rscript -e "install.packages('CANOESCOV', repos = 'http://zsibio.ii.pw.edu.pl/nexus/repository/r-all')"

From 1b82b56863b5abfd39ab0e74802f549fe35b9b76 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Wiktor=20Ku=C5=9Bmirek?= <kusmirekwiktor@gmail.com>
Date: Tue, 27 Mar 2018 15:37:57 +0200
Subject: [PATCH 038/114] tests for building CANOESCOV docker with CODEX
 dependencies

---
 Docker/cnv-opt-canoescov/Dockerfile | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/Docker/cnv-opt-canoescov/Dockerfile b/Docker/cnv-opt-canoescov/Dockerfile
index b138c36..27a19c2 100644
--- a/Docker/cnv-opt-canoescov/Dockerfile
+++ b/Docker/cnv-opt-canoescov/Dockerfile
@@ -1,12 +1,6 @@
 FROM biodatageeks/cnv-opt-canoes
 MAINTAINER biodatageeks <team@biodatageeks.ii.pw.edu.pl>
 
-RUN Rscript -e "source('https://bioconductor.org/biocLite.R');biocLite('IRanges')"
-RUN Rscript -e "source('https://bioconductor.org/biocLite.R');biocLite('BSgenome.Hsapiens.UCSC.hg19')"
-RUN Rscript -e "source('https://bioconductor.org/biocLite.R');biocLite('Biostrings')"
-RUN Rscript -e "source('https://bioconductor.org/biocLite.R');biocLite('Rsamtools')"
-RUN Rscript -e "source('https://bioconductor.org/biocLite.R');biocLite('GenomeInfoDb')"
-RUN Rscript -e "source('https://bioconductor.org/biocLite.R');biocLite('S4Vectors')"
 RUN Rscript -e "source('https://bioconductor.org/biocLite.R');biocLite('CODEX')"
 
 RUN Rscript -e "install.packages('CANOESCOV', repos = 'http://zsibio.ii.pw.edu.pl/nexus/repository/r-all')"

From 755562282918d2eb9c365bd6423ed3a390f22262 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Wiktor=20Ku=C5=9Bmirek?= <kusmirekwiktor@gmail.com>
Date: Tue, 27 Mar 2018 16:29:15 +0200
Subject: [PATCH 039/114] missing CANOES library

---
 R/CANOESCOV/R/functions_CANOESCOV.R | 1 +
 1 file changed, 1 insertion(+)

diff --git a/R/CANOESCOV/R/functions_CANOESCOV.R b/R/CANOESCOV/R/functions_CANOESCOV.R
index 6b7fac3..e909d2e 100644
--- a/R/CANOESCOV/R/functions_CANOESCOV.R
+++ b/R/CANOESCOV/R/functions_CANOESCOV.R
@@ -74,6 +74,7 @@ CallCNVs <- function(sample.name, reference.samples, counts, p=1e-08, Tnum=6, D=
   library(Rsamtools)
   library(GenomeInfoDb)
   library(S4Vectors)
+  library(CANOES)
   if (!sample.name %in% names(counts)){stop("No column for sample ", sample.name, " in counts matrix")}
   if (length(setdiff(names(counts)[1:5], c("target", "chromosome", "start", "end", "gc"))) > 0){
     stop("First five columns of counts matrix must be target, chromosome, start, end, gc")

From 381555c03049ac7111e6dd6c1f37388ae1dcf999 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Wiktor=20Ku=C5=9Bmirek?= <kusmirekwiktor@gmail.com>
Date: Tue, 27 Mar 2018 17:36:05 +0200
Subject: [PATCH 040/114] bugfix in CANOESCOV libraries

---
 Docker/cnv-opt-canoescov/Dockerfile | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/Docker/cnv-opt-canoescov/Dockerfile b/Docker/cnv-opt-canoescov/Dockerfile
index 27a19c2..fa7683b 100644
--- a/Docker/cnv-opt-canoescov/Dockerfile
+++ b/Docker/cnv-opt-canoescov/Dockerfile
@@ -1,6 +1,11 @@
 FROM biodatageeks/cnv-opt-canoes
 MAINTAINER biodatageeks <team@biodatageeks.ii.pw.edu.pl>
 
-RUN Rscript -e "source('https://bioconductor.org/biocLite.R');biocLite('CODEX')"
+RUN Rscript -e "source('https://bioconductor.org/biocLite.R');biocLite('IRanges')"
+RUN Rscript -e "source('https://bioconductor.org/biocLite.R');biocLite('BSgenome.Hsapiens.UCSC.hg19')"
+RUN Rscript -e "source('https://bioconductor.org/biocLite.R');biocLite('Biostrings')"
+RUN Rscript -e "source('https://bioconductor.org/biocLite.R');biocLite('Rsamtools')"
+RUN Rscript -e "source('https://bioconductor.org/biocLite.R');biocLite('GenomeInfoDb')"
+RUN Rscript -e "source('https://bioconductor.org/biocLite.R');biocLite('S4Vectors')"
 
 RUN Rscript -e "install.packages('CANOESCOV', repos = 'http://zsibio.ii.pw.edu.pl/nexus/repository/r-all')"

From 6e4c1cca7b9cf853c3a037bc85af0cedaadf991b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Wiktor=20Ku=C5=9Bmirek?= <kusmirekwiktor@gmail.com>
Date: Fri, 30 Mar 2018 12:48:33 +0200
Subject: [PATCH 041/114] first version of new interface to TARGET.QC package

---
 R/TARGET.QC/R/functions_TARGET.QC.R | 12 ------
 R/TARGET.QC/R/run_TARGET.QC.R       | 60 +++++++++--------------------
 2 files changed, 19 insertions(+), 53 deletions(-)

diff --git a/R/TARGET.QC/R/functions_TARGET.QC.R b/R/TARGET.QC/R/functions_TARGET.QC.R
index 63f2278..850803e 100644
--- a/R/TARGET.QC/R/functions_TARGET.QC.R
+++ b/R/TARGET.QC/R/functions_TARGET.QC.R
@@ -1,17 +1,5 @@
 library(CODEX)
 
-coverageObj1 <- function(cov_table, sampname, targets_for_chr, chr){
-  Y <- matrix(data=as.integer(0), nrow = nrow(targets_for_chr), ncol = 0)
-  for(sample in sampname) {
-    cov_targets_for_sample <- cov_table[cov_table[,"sample_name"] == sample,]
-    cov_targets_for_sample <- cov_targets_for_sample[with(cov_targets_for_sample, order(target_id)), ]
-    Y <- cbind(Y, cov_targets_for_sample[,"read_count"])
-  }
-  colnames(Y) <- sampname
-  rownames(Y) <- targets_for_chr[,"target_id"]
-  return(list(Y=Y))
-}
-
 gcmapp1 <- function(chr, ref){
   gc <- getgc(chr, ref)
   mapp <- getmapp(chr, ref)
diff --git a/R/TARGET.QC/R/run_TARGET.QC.R b/R/TARGET.QC/R/run_TARGET.QC.R
index 42ff0e6..f43787e 100644
--- a/R/TARGET.QC/R/run_TARGET.QC.R
+++ b/R/TARGET.QC/R/run_TARGET.QC.R
@@ -6,7 +6,9 @@ run_TARGET.QC <- function(mapp_thresh,
                           gc_thresh_from,
                           gc_thresh_to,
                           input_cov_table,
-                          output_cov_table){
+                          output_cov_table,
+                          input_bed,
+                          output_bed){
   #mapp_thresh <- 0.9
   #cov_thresh_from <- 20
   #cov_thresh_to <- 4000
@@ -14,45 +16,21 @@ run_TARGET.QC <- function(mapp_thresh,
   #length_thresh_to <- 2000
   #gc_thresh_from <- 20
   #gc_thresh_to <- 80
-  #lmax <- 200
-  cov_table <- read.csv(input_cov_table)
-  sampname <- unique(cov_table[,"sample_name"])
-  targets <- cov_table[,c("target_id", "chr", "pos_min", "pos_max")]
-  targets <- targets[!duplicated(targets[,"target_id"]),]
-  targets <- targets[with(targets, order(target_id)), ]
-  cov_table_qc <- matrix(nrow=0, ncol=6)
-  colnames(cov_table_qc) <- colnames(cov_table)
-
-  chrs <- c(1:22, "X", "Y", paste0("chr",c(1:22, "X", "Y")))
-  for(chr in chrs) {
-    targets_for_chr <- targets[targets[,"chr"] == chr,]
-    ref <- IRanges(start = targets_for_chr[,"pos_min"], end = targets_for_chr[,"pos_max"])
-    if (length(ref) == 0) {    # 0 elements for specified chromosome in bed
-      next()
-    }
-    Y <- coverageObj1(cov_table, sampname, targets_for_chr, chr)$Y
-    gcmapp1_result <- gcmapp1(chr, ref)
-    gc <- gcmapp1_result$gc
-    mapp <- gcmapp1_result$mapp
-
-    qcObj1_result <- qcObj1(Y, sampname, chr, ref, mapp, gc, cov_thresh = c(cov_thresh_from, cov_thresh_to), 
-                        length_thresh = c(length_thresh_from, length_thresh_to), mapp_thresh, 
-                        gc_thresh = c(gc_thresh_from, gc_thresh_to))
-    Y_qc <- qcObj1_result$Y_qc
-    sampname_qc <- qcObj1_result$sampname_qc
-    ref_qc <- qcObj1_result$ref_qc
-    colnames(Y_qc) <- sampname_qc
-    for(sample in colnames(Y_qc)) {
-      new_cov_table_qc_rows <- cbind(sample, rownames(Y_qc), chr, start(ref_qc), end(ref_qc), Y_qc[,sample])
-      cov_table_qc <- rbind(cov_table_qc, new_cov_table_qc_rows)
-    }
-  }
-  cov_table_qc <- as.data.frame(cov_table_qc)
-  cov_table_qc[,"pos_min"] <- strtoi(cov_table_qc[,"pos_min"])
-  cov_table_qc[,"pos_max"] <- strtoi(cov_table_qc[,"pos_max"])
-  cov_table_qc[,"target_id"] <- strtoi(cov_table_qc[,"target_id"])
-  cov_table_qc[,"read_count"] <- strtoi(cov_table_qc[,"read_count"])
-  colnames(cov_table_qc) <- c("sample_name", "target_id", "chr", "pos_min", "pos_max", "read_count")
-  write.csv(cov_table_qc, output_cov_table, row.names=F, quote=F)
+  Y <- read.csv(input_cov_table)
+  sampname <- colnames(Y)
+  targets <- read.delim(input_bed)
+  ref <- IRanges(start = targets[,"st_bp"], end = targets[,"ed_bp"])
+  gcmapp1_result <- gcmapp1(targets[1,"chr"], ref)
+  gc <- gcmapp1_result$gc
+  mapp <- gcmapp1_result$mapp
+  qcObj1_result <- qcObj1(Y, sampname, targets[1,"chr"], ref, mapp, gc, cov_thresh = c(cov_thresh_from, cov_thresh_to), 
+                          length_thresh = c(length_thresh_from, length_thresh_to), mapp_thresh, 
+                          gc_thresh = c(gc_thresh_from, gc_thresh_to))
+  Y_qc <- qcObj1_result$Y_qc
+  sampname_qc <- qcObj1_result$sampname_qc
+  ref_qc <- qcObj1_result$ref_qc
+  colnames(Y_qc) <- sampname_qc
+  write.csv(Y_qc, output_cov_table, row.names=F, quote=F)
+  write.csv(ref_qc, output_bed, row.names=F, quote=F)
 }
 

From 9a7c04394b64f825fe6044535c67a305e71d8b92 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Wiktor=20Ku=C5=9Bmirek?= <kusmirekwiktor@gmail.com>
Date: Fri, 30 Mar 2018 12:56:59 +0200
Subject: [PATCH 042/114] first version of new interface to
 REFERENCE.SAMPLE.SET.SELECTOR package

---
 .../R/functions_REFERENCE.SAMPLE.SET.SELECTOR.R      | 12 ------------
 .../R/run_REFERENCE.SAMPLE.SET.SELECTOR.R            | 12 +++++-------
 2 files changed, 5 insertions(+), 19 deletions(-)

diff --git a/R/REFERENCE.SAMPLE.SET.SELECTOR/R/functions_REFERENCE.SAMPLE.SET.SELECTOR.R b/R/REFERENCE.SAMPLE.SET.SELECTOR/R/functions_REFERENCE.SAMPLE.SET.SELECTOR.R
index 70c9fe4..d92bf29 100644
--- a/R/REFERENCE.SAMPLE.SET.SELECTOR/R/functions_REFERENCE.SAMPLE.SET.SELECTOR.R
+++ b/R/REFERENCE.SAMPLE.SET.SELECTOR/R/functions_REFERENCE.SAMPLE.SET.SELECTOR.R
@@ -1,16 +1,4 @@
 
-coverageObj1 <- function(cov_table, sampname, targets_for_chr){
-  Y <- matrix(data=as.integer(0), nrow = nrow(targets_for_chr), ncol = 0)
-  for(sample in sampname) {
-    cov_targets_for_sample <- cov_table[cov_table[,"sample_name"] == sample,]
-    cov_targets_for_sample <- cov_targets_for_sample[with(cov_targets_for_sample, order(target_id)), ]
-    Y <- cbind(Y, cov_targets_for_sample[,"read_count"])
-  }
-  colnames(Y) <- sampname
-  rownames(Y) <- targets_for_chr[,"target_id"]
-  return(list(Y=Y))
-}
-
 canoes_method <- function(investigated_sample, Y, num_refs){
   if (num_refs == 0) {
     num_refs <- 30  # in CANOES application num_refs is default set to 30
diff --git a/R/REFERENCE.SAMPLE.SET.SELECTOR/R/run_REFERENCE.SAMPLE.SET.SELECTOR.R b/R/REFERENCE.SAMPLE.SET.SELECTOR/R/run_REFERENCE.SAMPLE.SET.SELECTOR.R
index c4492df..4783c73 100644
--- a/R/REFERENCE.SAMPLE.SET.SELECTOR/R/run_REFERENCE.SAMPLE.SET.SELECTOR.R
+++ b/R/REFERENCE.SAMPLE.SET.SELECTOR/R/run_REFERENCE.SAMPLE.SET.SELECTOR.R
@@ -1,15 +1,13 @@
 run_REFERENCE.SAMPLE.SET.SELECTOR <- function(select_method,
                                               num_refs,
                                               input_cov_table,
+                                              input_bed,
                                               output_reference_file){
 
-  cov_table <- read.csv(input_cov_table)
-  sampname <- unique(cov_table[,"sample_name"])
-  targets <- cov_table[,c("target_id", "chr", "pos_min", "pos_max")]
-  targets <- targets[!duplicated(targets[,"target_id"]),]
-  targets <- targets[with(targets, order(target_id)), ]
-  target_length <- targets[,"pos_max"] - targets[,"pos_min"]
-  Y <- coverageObj1(cov_table, sampname, targets)$Y
+  Y <- read.csv(input_cov_table)
+  sampname <- colnames(Y)
+  targets <- read.delim(input_bed)
+  target_length <- targets[,"st_bp"] - targets[,"ed_bp"]
   reference_samples <- list()
 
   for(i in 1:length(sampname)) {

From b069d278afc5d53269e77b9788270793ebeac15c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Wiktor=20Ku=C5=9Bmirek?= <kusmirekwiktor@gmail.com>
Date: Fri, 30 Mar 2018 13:18:31 +0200
Subject: [PATCH 043/114] test of forcing rebuilding specified docker

---
 build.sh | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/build.sh b/build.sh
index f2dba85..c2c3df5 100755
--- a/build.sh
+++ b/build.sh
@@ -34,8 +34,9 @@ do
   diffTs=`echo "$(date +%s) - $(git log -n 1 --pretty=format:%at ${dir})" | bc`
   if [ $diffTs -lt $MAX_COMMIT_TS_DIFF ]; then
     cd $dir
-    #docker build -t $image:$version .
-    #docker build -t $image:latest .
+    if [[ ${image} == "biodatageeks/target-qc" ]]; then
+      docker rmi $image
+    fi
     docker build --no-cache -t $image:$version .
     docker build --no-cache -t $image:latest .
     if [[ ${BUILD_MODE} != "local" ]]; then

From 7999948f0aa8b98f77ef256e4bafd837bf208dcd Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Wiktor=20Ku=C5=9Bmirek?= <kusmirekwiktor@gmail.com>
Date: Fri, 30 Mar 2018 13:21:39 +0200
Subject: [PATCH 044/114] proper name of package

---
 build.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/build.sh b/build.sh
index c2c3df5..462c3d9 100755
--- a/build.sh
+++ b/build.sh
@@ -34,7 +34,7 @@ do
   diffTs=`echo "$(date +%s) - $(git log -n 1 --pretty=format:%at ${dir})" | bc`
   if [ $diffTs -lt $MAX_COMMIT_TS_DIFF ]; then
     cd $dir
-    if [[ ${image} == "biodatageeks/target-qc" ]]; then
+    if [[ ${image} == "biodatageeks/cnv-opt-target-qc" ]]; then
       docker rmi $image
     fi
     docker build --no-cache -t $image:$version .

From bc168e90f32b414c54918df1ff93ae9c6cf3d8fa Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Wiktor=20Ku=C5=9Bmirek?= <kusmirekwiktor@gmail.com>
Date: Fri, 30 Mar 2018 13:25:24 +0200
Subject: [PATCH 045/114] another test of forcing docker rebuilding

---
 build.sh | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/build.sh b/build.sh
index 462c3d9..8460c5f 100755
--- a/build.sh
+++ b/build.sh
@@ -34,9 +34,7 @@ do
   diffTs=`echo "$(date +%s) - $(git log -n 1 --pretty=format:%at ${dir})" | bc`
   if [ $diffTs -lt $MAX_COMMIT_TS_DIFF ]; then
     cd $dir
-    if [[ ${image} == "biodatageeks/cnv-opt-target-qc" ]]; then
-      docker rmi $image
-    fi
+    docker rmi biodatageeks/cnv-opt-target-qc
     docker build --no-cache -t $image:$version .
     docker build --no-cache -t $image:latest .
     if [[ ${BUILD_MODE} != "local" ]]; then

From 200a8f678f506df1302e3eadc3f47baa7a3a369f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Wiktor=20Ku=C5=9Bmirek?= <kusmirekwiktor@gmail.com>
Date: Fri, 30 Mar 2018 13:38:41 +0200
Subject: [PATCH 046/114] another test of rebuilding docker image

---
 Docker/cnv-opt-target-qc/Dockerfile | 2 ++
 build.sh                            | 6 +++++-
 2 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/Docker/cnv-opt-target-qc/Dockerfile b/Docker/cnv-opt-target-qc/Dockerfile
index 8391b91..e78cc1a 100644
--- a/Docker/cnv-opt-target-qc/Dockerfile
+++ b/Docker/cnv-opt-target-qc/Dockerfile
@@ -1,4 +1,6 @@
 FROM biodatageeks/cnv-opt-codex
 MAINTAINER biodatageeks <team@biodatageeks.ii.pw.edu.pl>
 
+ARG CACHE_DATE=not_a_date
+
 RUN Rscript -e "install.packages('TARGET.QC', repos = 'http://zsibio.ii.pw.edu.pl/nexus/repository/r-all')"
diff --git a/build.sh b/build.sh
index 8460c5f..e8b07ff 100755
--- a/build.sh
+++ b/build.sh
@@ -34,7 +34,11 @@ do
   diffTs=`echo "$(date +%s) - $(git log -n 1 --pretty=format:%at ${dir})" | bc`
   if [ $diffTs -lt $MAX_COMMIT_TS_DIFF ]; then
     cd $dir
-    docker rmi biodatageeks/cnv-opt-target-qc
+    if [[ ${image} == "biodatageeks/cnv-opt-target-qc" ]]; then
+      echo "Rebuilf of ${image} image forced..."
+      docker build --build-arg CACHE_DATE=$(date +%Y-%m-%d:%H:%M:%S) -t $image:$version .
+      docker build --build-arg CACHE_DATE=$(date +%Y-%m-%d:%H:%M:%S) -t $image:latest .
+    fi
     docker build --no-cache -t $image:$version .
     docker build --no-cache -t $image:latest .
     if [[ ${BUILD_MODE} != "local" ]]; then

From d3dd1407a227d969ab3364eb5e9046bd42ed6110 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Wiktor=20Ku=C5=9Bmirek?= <kusmirekwiktor@gmail.com>
Date: Fri, 30 Mar 2018 13:52:55 +0200
Subject: [PATCH 047/114] force to rebuild
 cnv-opt-reference-sample-set-selector docker

---
 Docker/cnv-opt-reference-sample-set-selector/Dockerfile | 2 ++
 build.sh                                                | 7 ++++---
 2 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/Docker/cnv-opt-reference-sample-set-selector/Dockerfile b/Docker/cnv-opt-reference-sample-set-selector/Dockerfile
index 7e82158..716a5f4 100644
--- a/Docker/cnv-opt-reference-sample-set-selector/Dockerfile
+++ b/Docker/cnv-opt-reference-sample-set-selector/Dockerfile
@@ -1,6 +1,8 @@
 FROM biodatageeks/cnv-opt-codex
 MAINTAINER biodatageeks <team@biodatageeks.ii.pw.edu.pl>
 
+ARG CACHE_DATE=not_a_date
+
 RUN Rscript -e "install.packages('ExomeDepth', repos = 'http://cran.us.r-project.org')"
 
 RUN Rscript -e "install.packages('REFERENCE.SAMPLE.SET.SELECTOR', repos = 'http://zsibio.ii.pw.edu.pl/nexus/repository/r-all')"
diff --git a/build.sh b/build.sh
index e8b07ff..7e5be95 100755
--- a/build.sh
+++ b/build.sh
@@ -34,13 +34,14 @@ do
   diffTs=`echo "$(date +%s) - $(git log -n 1 --pretty=format:%at ${dir})" | bc`
   if [ $diffTs -lt $MAX_COMMIT_TS_DIFF ]; then
     cd $dir
-    if [[ ${image} == "biodatageeks/cnv-opt-target-qc" ]]; then
+    if [[ ${image} == "biodatageeks/cnv-opt-target-qc" ]] || [[ ${image} == "biodatageeks/cnv-opt-reference-sample-set-selector" ]]; then
       echo "Rebuilf of ${image} image forced..."
       docker build --build-arg CACHE_DATE=$(date +%Y-%m-%d:%H:%M:%S) -t $image:$version .
       docker build --build-arg CACHE_DATE=$(date +%Y-%m-%d:%H:%M:%S) -t $image:latest .
+    else
+      docker build --no-cache -t $image:$version .
+      docker build --no-cache -t $image:latest .
     fi
-    docker build --no-cache -t $image:$version .
-    docker build --no-cache -t $image:latest .
     if [[ ${BUILD_MODE} != "local" ]]; then
       docker push docker.io/$image:latest
       docker push docker.io/$image:$version

From b3d1bc92bb2a6c72dfe21ff4ff8a1fed768102ef Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Wiktor=20Ku=C5=9Bmirek?= <kusmirekwiktor@gmail.com>
Date: Fri, 30 Mar 2018 13:54:07 +0200
Subject: [PATCH 048/114] bugfix

---
 R/TARGET.QC/R/run_TARGET.QC.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/R/TARGET.QC/R/run_TARGET.QC.R b/R/TARGET.QC/R/run_TARGET.QC.R
index f43787e..6d56276 100644
--- a/R/TARGET.QC/R/run_TARGET.QC.R
+++ b/R/TARGET.QC/R/run_TARGET.QC.R
@@ -31,6 +31,6 @@ run_TARGET.QC <- function(mapp_thresh,
   ref_qc <- qcObj1_result$ref_qc
   colnames(Y_qc) <- sampname_qc
   write.csv(Y_qc, output_cov_table, row.names=F, quote=F)
-  write.csv(ref_qc, output_bed, row.names=F, quote=F)
+  write.csv(ref[rownames(ref_qc),], output_bed, row.names=F, quote=F)
 }
 

From 2247ef34da81b31b46de0b33b396ac3b1e0589d1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Wiktor=20Ku=C5=9Bmirek?= <kusmirekwiktor@gmail.com>
Date: Fri, 30 Mar 2018 13:59:31 +0200
Subject: [PATCH 049/114] bugfix one more time

---
 R/TARGET.QC/R/run_TARGET.QC.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/R/TARGET.QC/R/run_TARGET.QC.R b/R/TARGET.QC/R/run_TARGET.QC.R
index 6d56276..b6cab3a 100644
--- a/R/TARGET.QC/R/run_TARGET.QC.R
+++ b/R/TARGET.QC/R/run_TARGET.QC.R
@@ -31,6 +31,6 @@ run_TARGET.QC <- function(mapp_thresh,
   ref_qc <- qcObj1_result$ref_qc
   colnames(Y_qc) <- sampname_qc
   write.csv(Y_qc, output_cov_table, row.names=F, quote=F)
-  write.csv(ref[rownames(ref_qc),], output_bed, row.names=F, quote=F)
+  write.csv(targets[rownames(ref_qc),], output_bed, row.names=F, quote=F)
 }
 

From 961e8b38e975c911873e9e71ba38c09d6e8447f6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Wiktor=20Ku=C5=9Bmirek?= <kusmirekwiktor@gmail.com>
Date: Fri, 30 Mar 2018 14:14:57 +0200
Subject: [PATCH 050/114] proper indices

---
 R/TARGET.QC/R/run_TARGET.QC.R | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/R/TARGET.QC/R/run_TARGET.QC.R b/R/TARGET.QC/R/run_TARGET.QC.R
index b6cab3a..2c3da28 100644
--- a/R/TARGET.QC/R/run_TARGET.QC.R
+++ b/R/TARGET.QC/R/run_TARGET.QC.R
@@ -19,6 +19,8 @@ run_TARGET.QC <- function(mapp_thresh,
   Y <- read.csv(input_cov_table)
   sampname <- colnames(Y)
   targets <- read.delim(input_bed)
+  rownames(Y) <- 1:nrow(Y)
+  rownames(targets) <- 1:nrow(targets)
   ref <- IRanges(start = targets[,"st_bp"], end = targets[,"ed_bp"])
   gcmapp1_result <- gcmapp1(targets[1,"chr"], ref)
   gc <- gcmapp1_result$gc
@@ -31,6 +33,6 @@ run_TARGET.QC <- function(mapp_thresh,
   ref_qc <- qcObj1_result$ref_qc
   colnames(Y_qc) <- sampname_qc
   write.csv(Y_qc, output_cov_table, row.names=F, quote=F)
-  write.csv(targets[rownames(ref_qc),], output_bed, row.names=F, quote=F)
+  write.csv(targets[rownames(Y_qc),], output_bed, row.names=F, quote=F)
 }
 

From 3da288a44fb00f8ea2308fa7c982644c7885a483 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Wiktor=20Ku=C5=9Bmirek?= <kusmirekwiktor@gmail.com>
Date: Fri, 30 Mar 2018 14:22:29 +0200
Subject: [PATCH 051/114] bugfix in saving qc_bed to file

---
 R/TARGET.QC/R/run_TARGET.QC.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/R/TARGET.QC/R/run_TARGET.QC.R b/R/TARGET.QC/R/run_TARGET.QC.R
index 2c3da28..f5560c6 100644
--- a/R/TARGET.QC/R/run_TARGET.QC.R
+++ b/R/TARGET.QC/R/run_TARGET.QC.R
@@ -33,6 +33,6 @@ run_TARGET.QC <- function(mapp_thresh,
   ref_qc <- qcObj1_result$ref_qc
   colnames(Y_qc) <- sampname_qc
   write.csv(Y_qc, output_cov_table, row.names=F, quote=F)
-  write.csv(targets[rownames(Y_qc),], output_bed, row.names=F, quote=F)
+  write.table(targets[rownames(Y_qc),], output_bed, row.names=F, quote=F, sep="\t")
 }
 

From 54415b6a20ce591ed04058a0d5c62acebc1c465d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Wiktor=20Ku=C5=9Bmirek?= <kusmirekwiktor@gmail.com>
Date: Fri, 30 Mar 2018 14:32:53 +0200
Subject: [PATCH 052/114] progress in selecting reference sample set

---
 .../R/run_REFERENCE.SAMPLE.SET.SELECTOR.R                        | 1 +
 1 file changed, 1 insertion(+)

diff --git a/R/REFERENCE.SAMPLE.SET.SELECTOR/R/run_REFERENCE.SAMPLE.SET.SELECTOR.R b/R/REFERENCE.SAMPLE.SET.SELECTOR/R/run_REFERENCE.SAMPLE.SET.SELECTOR.R
index 4783c73..37d63e5 100644
--- a/R/REFERENCE.SAMPLE.SET.SELECTOR/R/run_REFERENCE.SAMPLE.SET.SELECTOR.R
+++ b/R/REFERENCE.SAMPLE.SET.SELECTOR/R/run_REFERENCE.SAMPLE.SET.SELECTOR.R
@@ -12,6 +12,7 @@ run_REFERENCE.SAMPLE.SET.SELECTOR <- function(select_method,
 
   for(i in 1:length(sampname)) {
     investigated_sample <- as.character(sampname[i])
+    print(paste("Processing ", investigated_sample, " sample ...", sep=""))
     if(select_method == "canoes") {
       reference_samples_for_investigated_sample <- canoes_method(investigated_sample, Y, num_refs)$reference_samples
       reference_samples[[i]] <- c(investigated_sample, reference_samples_for_investigated_sample)

From ef805c789eef5cf3a525bf620ada4bdee3be5099 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Wiktor=20Ku=C5=9Bmirek?= <kusmirekwiktor@gmail.com>
Date: Fri, 30 Mar 2018 15:19:39 +0200
Subject: [PATCH 053/114] first version of new interface for exomedepthcov
 package

---
 Docker/cnv-opt-exomedepthcov/Dockerfile     |  2 +
 R/EXOMEDEPTHCOV/R/functions_EXOMEDEPTHCOV.R | 15 ----
 R/EXOMEDEPTHCOV/R/run_EXOMEDEPTHCOV.R       | 81 ++++++++++-----------
 build.sh                                    |  2 +-
 4 files changed, 40 insertions(+), 60 deletions(-)

diff --git a/Docker/cnv-opt-exomedepthcov/Dockerfile b/Docker/cnv-opt-exomedepthcov/Dockerfile
index 28448f8..9d2d135 100644
--- a/Docker/cnv-opt-exomedepthcov/Dockerfile
+++ b/Docker/cnv-opt-exomedepthcov/Dockerfile
@@ -1,5 +1,7 @@
 FROM biodatageeks/cnv-opt-exomedepth
 MAINTAINER biodatageeks <team@biodatageeks.ii.pw.edu.pl>
 
+ARG CACHE_DATE=not_a_date
+
 RUN Rscript -e "install.packages('EXOMEDEPTHCOV', repos = 'http://zsibio.ii.pw.edu.pl/nexus/repository/r-all')"
 
diff --git a/R/EXOMEDEPTHCOV/R/functions_EXOMEDEPTHCOV.R b/R/EXOMEDEPTHCOV/R/functions_EXOMEDEPTHCOV.R
index 550b4d0..e69de29 100644
--- a/R/EXOMEDEPTHCOV/R/functions_EXOMEDEPTHCOV.R
+++ b/R/EXOMEDEPTHCOV/R/functions_EXOMEDEPTHCOV.R
@@ -1,15 +0,0 @@
-library(ExomeDepth)
-
-coverageObj1 <- function(cov_table, sampname, targets_for_chr){
-  Y <- matrix(data=as.integer(0), nrow = nrow(targets_for_chr), ncol = 0)
-  for(sample in sampname) {
-    cov_targets_for_sample <- cov_table[cov_table[,"sample_name"] == sample,]
-    cov_targets_for_sample <- cov_targets_for_sample[with(cov_targets_for_sample, order(target_id)), ]
-    Y <- cbind(Y, cov_targets_for_sample[,"read_count"])
-  }
-  colnames(Y) <- sampname
-  rownames(Y) <- targets_for_chr[,"target_id"]
-  return(list(Y=Y))
-}
-
-
diff --git a/R/EXOMEDEPTHCOV/R/run_EXOMEDEPTHCOV.R b/R/EXOMEDEPTHCOV/R/run_EXOMEDEPTHCOV.R
index a8e8e3c..2933dc2 100644
--- a/R/EXOMEDEPTHCOV/R/run_EXOMEDEPTHCOV.R
+++ b/R/EXOMEDEPTHCOV/R/run_EXOMEDEPTHCOV.R
@@ -2,62 +2,55 @@ library(ExomeDepth)
 library(methods)
 
 run_EXOMEDEPTHCOV <- function(input_cov_table,
+                              input_bed,
                               reference_sample_set_file,
                               output_calls_file){
 
   con <- file(reference_sample_set_file, open='r')
   reference_sample_set <- readLines(con)
-  cov_table <- read.csv(input_cov_table)
-  sampname <- unique(cov_table[,"sample_name"])
-  targets <- cov_table[,c("target_id", "chr", "pos_min", "pos_max")]
-  targets <- targets[!duplicated(targets[,"target_id"]),]
-  targets <- targets[with(targets, order(target_id)), ]
+  Y <- read.csv(input_cov_table)
+  sampname <- colnames(Y)
+  targets <- read.delim(input_bed)
+  rownames(Y) <- 1:nrow(Y)
+  rownames(targets) <- 1:nrow(targets)
   calls <- data.frame(matrix(nrow=0, ncol=13))
-  chrs <- c(1:22, "X", "Y", paste0("chr",c(1:22, "X", "Y")))
   library(IRanges)
-  for(chr in chrs) {
-    targets_for_chr <- targets[targets[,"chr"] == chr,]
-    ref <- IRanges(start = targets_for_chr[,"pos_min"], end = targets_for_chr[,"pos_max"])
-    if (length(ref) == 0) {    # 0 elements for specified chromosome in bed
+  ref <- IRanges(start = targets[,"st_bp"], end = targets[,"ed_bp"])
+
+  for (i in 1:length(reference_sample_set)) {
+    if (reference_sample_set[[i]] == '') {
       next()
     }
-    Y <- coverageObj1(cov_table, sampname, targets_for_chr)$Y
-
-    for (i in 1:length(reference_sample_set)) {
-      if (reference_sample_set[[i]] == '') {
-        next()
-      }
-      samples <- unlist(strsplit(reference_sample_set[[i]], ','))
-      actual_sample <- samples[1]
-      reference_samples <- samples[-1]
+    samples <- unlist(strsplit(reference_sample_set[[i]], ','))
+    actual_sample <- samples[1]
+    reference_samples <- samples[-1]
 
-      ## ----construct.ref-------------------------------------------------------
-      my.matrix <- as.matrix(Y[,reference_samples])
-      my.reference.selected <- apply(X = my.matrix, 
-                                     MAR = 1, 
-                                     FUN = sum)
+    ## ----construct.ref-------------------------------------------------------
+    my.matrix <- as.matrix(Y[,reference_samples])
+    my.reference.selected <- apply(X = my.matrix, 
+                                   MAR = 1, 
+                                   FUN = sum)
 
-      ## ----build.complete------------------------------------------------------
-      all.exons <- new('ExomeDepth',
-                       test = Y[,actual_sample],
-                       reference = my.reference.selected,
-                       formula = 'cbind(test, reference) ~ 1')
+    ## ----build.complete------------------------------------------------------
+    all.exons <- new('ExomeDepth',
+                     test = Y[,actual_sample],
+                     reference = my.reference.selected,
+                     formula = 'cbind(test, reference) ~ 1')
 
-      ## ----call.CNVs-----------------------------------------------------------
-      all.exons <- ExomeDepth::CallCNVs(x = all.exons, 
-                                        transition.probability = 10^-4, 
-                                        chromosome = rep(chr, nrow(Y)), 
-                                        start = start(ref), 
-                                        end = end(ref), 
-                                        name = rep('name', nrow(Y)))
-      print(all.exons@CNV.calls)
-      if (nrow(all.exons@CNV.calls) > 0) {
-        actual_sample_column <- data.frame(matrix(rep(actual_sample, nrow(all.exons@CNV.calls)), nrow=nrow(all.exons@CNV.calls))) 
-        callsIt <- cbind(actual_sample_column, all.exons@CNV.calls)
-        colnames(callsIt) <- c(c, colnames(all.exons@CNV.calls))
-        if (nrow(calls)==0){calls <- data.frame(matrix(nrow=0, ncol=ncol(callsIt)))} 
-        calls <- rbind(calls, callsIt)
-      }
+    ## ----call.CNVs-----------------------------------------------------------
+    all.exons <- ExomeDepth::CallCNVs(x = all.exons, 
+                                      transition.probability = 10^-4, 
+                                      chromosome = rep(targets[1,'chr'], nrow(Y)), 
+                                      start = start(ref), 
+                                      end = end(ref), 
+                                      name = rep('name', nrow(Y)))
+    print(all.exons@CNV.calls)
+    if (nrow(all.exons@CNV.calls) > 0) {
+      actual_sample_column <- data.frame(matrix(rep(actual_sample, nrow(all.exons@CNV.calls)), nrow=nrow(all.exons@CNV.calls))) 
+      callsIt <- cbind(actual_sample_column, all.exons@CNV.calls)
+      colnames(callsIt) <- c(c, colnames(all.exons@CNV.calls))
+      if (nrow(calls)==0){calls <- data.frame(matrix(nrow=0, ncol=ncol(callsIt)))} 
+      calls <- rbind(calls, callsIt)
     }
   }
   # unify names of output columns
diff --git a/build.sh b/build.sh
index 7e5be95..a8668ac 100755
--- a/build.sh
+++ b/build.sh
@@ -34,7 +34,7 @@ do
   diffTs=`echo "$(date +%s) - $(git log -n 1 --pretty=format:%at ${dir})" | bc`
   if [ $diffTs -lt $MAX_COMMIT_TS_DIFF ]; then
     cd $dir
-    if [[ ${image} == "biodatageeks/cnv-opt-target-qc" ]] || [[ ${image} == "biodatageeks/cnv-opt-reference-sample-set-selector" ]]; then
+    if [[ ${image} == "biodatageeks/cnv-opt-target-qc" ]] || [[ ${image} == "biodatageeks/cnv-opt-reference-sample-set-selector" ]] || [[ ${image} == "biodatageeks/cnv-opt-exomedepthcov" ]]; then
       echo "Rebuilf of ${image} image forced..."
       docker build --build-arg CACHE_DATE=$(date +%Y-%m-%d:%H:%M:%S) -t $image:$version .
       docker build --build-arg CACHE_DATE=$(date +%Y-%m-%d:%H:%M:%S) -t $image:latest .

From bd4c87a5083909dea9ccafdb43a3a0760312d083 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Wiktor=20Ku=C5=9Bmirek?= <kusmirekwiktor@gmail.com>
Date: Fri, 30 Mar 2018 15:32:35 +0200
Subject: [PATCH 054/114] some clean up in EXOMEDEPTHCOV package

---
 R/EXOMEDEPTHCOV/R/run_EXOMEDEPTHCOV.R | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/R/EXOMEDEPTHCOV/R/run_EXOMEDEPTHCOV.R b/R/EXOMEDEPTHCOV/R/run_EXOMEDEPTHCOV.R
index 2933dc2..5694e4b 100644
--- a/R/EXOMEDEPTHCOV/R/run_EXOMEDEPTHCOV.R
+++ b/R/EXOMEDEPTHCOV/R/run_EXOMEDEPTHCOV.R
@@ -14,8 +14,6 @@ run_EXOMEDEPTHCOV <- function(input_cov_table,
   rownames(Y) <- 1:nrow(Y)
   rownames(targets) <- 1:nrow(targets)
   calls <- data.frame(matrix(nrow=0, ncol=13))
-  library(IRanges)
-  ref <- IRanges(start = targets[,"st_bp"], end = targets[,"ed_bp"])
 
   for (i in 1:length(reference_sample_set)) {
     if (reference_sample_set[[i]] == '') {
@@ -40,9 +38,9 @@ run_EXOMEDEPTHCOV <- function(input_cov_table,
     ## ----call.CNVs-----------------------------------------------------------
     all.exons <- ExomeDepth::CallCNVs(x = all.exons, 
                                       transition.probability = 10^-4, 
-                                      chromosome = rep(targets[1,'chr'], nrow(Y)), 
-                                      start = start(ref), 
-                                      end = end(ref), 
+                                      chromosome = targets[,"chr"], 
+                                      start = targets[,"st_bp"], 
+                                      end = targets[,"ed_bp"], 
                                       name = rep('name', nrow(Y)))
     print(all.exons@CNV.calls)
     if (nrow(all.exons@CNV.calls) > 0) {

From bb2b86d4a63160e4cb26ef2315fd4b6c70f7e5e9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Wiktor=20Ku=C5=9Bmirek?= <kusmirekwiktor@gmail.com>
Date: Fri, 30 Mar 2018 15:43:04 +0200
Subject: [PATCH 055/114] first version of new interface to CODEXCOV package

---
 Docker/cnv-opt-codexcov/Dockerfile |  2 +
 R/CODEXCOV/R/functions_CODEXCOV.R  | 21 --------
 R/CODEXCOV/R/run_CODEXCOV.R        | 83 +++++++++++++-----------------
 build.sh                           |  2 +-
 4 files changed, 38 insertions(+), 70 deletions(-)

diff --git a/Docker/cnv-opt-codexcov/Dockerfile b/Docker/cnv-opt-codexcov/Dockerfile
index 258bbf8..f06904c 100644
--- a/Docker/cnv-opt-codexcov/Dockerfile
+++ b/Docker/cnv-opt-codexcov/Dockerfile
@@ -1,4 +1,6 @@
 FROM biodatageeks/cnv-opt-codex
 MAINTAINER biodatageeks <team@biodatageeks.ii.pw.edu.pl>
 
+ARG CACHE_DATE=not_a_date
+
 RUN Rscript -e "install.packages('CODEXCOV', repos = 'http://zsibio.ii.pw.edu.pl/nexus/repository/r-all')"
diff --git a/R/CODEXCOV/R/functions_CODEXCOV.R b/R/CODEXCOV/R/functions_CODEXCOV.R
index 3f7522c..25ad2de 100644
--- a/R/CODEXCOV/R/functions_CODEXCOV.R
+++ b/R/CODEXCOV/R/functions_CODEXCOV.R
@@ -1,26 +1,5 @@
 library(CODEX)
 
-#' Function Dexcription
-#'
-#' Function description.
-#' @param cov_file
-#' @param sampname
-#' @keywords 
-#' @export
-#' @examples
-#' coverageObj1
-coverageObj1 <- function(cov_table, sampname, targets_for_chr, chr){
-  Y <- matrix(data=as.integer(0), nrow = nrow(targets_for_chr), ncol = 0)
-  for(sample in sampname) {
-    cov_targets_for_sample <- cov_table[cov_table[,"sample_name"] == sample,]
-    cov_targets_for_sample <- cov_targets_for_sample[with(cov_targets_for_sample, order(target_id)), ]
-    Y <- cbind(Y, cov_targets_for_sample[,"read_count"])
-  }
-  colnames(Y) <- sampname
-  rownames(Y) <- targets_for_chr[,"target_id"]
-  return(list(Y=Y))
-}
-
 #' Function Dexcription
 #'
 #' Function description.
diff --git a/R/CODEXCOV/R/run_CODEXCOV.R b/R/CODEXCOV/R/run_CODEXCOV.R
index a1ce428..cbb5d0a 100644
--- a/R/CODEXCOV/R/run_CODEXCOV.R
+++ b/R/CODEXCOV/R/run_CODEXCOV.R
@@ -10,69 +10,56 @@ run_CODEXCOV <- function(K_from,
                          K_to,
                          lmax,
                          input_cov_table,
+                         input_bed,
                          reference_sample_set_file,
                          output_calls_file){
 
   con <- file(reference_sample_set_file, open='r')
   reference_sample_set <- readLines(con)
-  cov_table <- read.csv(input_cov_table)
-  sampname <- unique(cov_table[,"sample_name"])
-  sampname <- as.character(sampname)
-  targets <- cov_table[,c("target_id", "chr", "pos_min", "pos_max")]
-  targets <- targets[!duplicated(targets[,"target_id"]),]
-  targets <- targets[with(targets, order(target_id)), ]
+  Y <- read.csv(input_cov_table)
+  sampname <- colnames(Y)
+  targets <- read.delim(input_bed)
+  rownames(Y) <- 1:nrow(Y)
+  rownames(targets) <- 1:nrow(targets)
   
   finalcall <- matrix(nrow=0, ncol=13)
-  chrs <- c(1:22, "X", "Y", paste0("chr",c(1:22, "X", "Y")))
-  
-  for(chr in chrs) {
-    targets_for_chr <- targets[targets[,"chr"] == chr,]
-    ref <- IRanges(start = targets_for_chr[,"pos_min"], end = targets_for_chr[,"pos_max"])
-    if (length(ref) == 0) {    # 0 elements for specified chromosome in bed
+  ref <- IRanges(start = targets[,"st_bp"], end = targets[,"ed_bp"])
+
+  ###################################################
+  ### code chunk number 5: gcmapp1
+  ###################################################
+  gcmapp1_result <- gcmapp1(targets[1,'chr'], ref)
+  gc <- gcmapp1_result$gc
+
+  for (i in 1:length(reference_sample_set)) {
+    if (reference_sample_set[[i]] == '') {
       next()
     }
+    samples <- unlist(strsplit(reference_sample_set[[i]], ','))
+    actual_sample <- samples[1]
+    reference_samples <- samples[-1]
+    samples <- sort(samples)
+    Y_subset <- Y[,samples]
 
     ###################################################
-    ### code chunk number 4: coverageObj1
+    ### code chunk number 7: normObj1
     ###################################################
-    Y <- coverageObj1(cov_table, sampname, targets_for_chr, chr)$Y
+    normObj_result <- normObj1(Y_subset, gc, K = K_from:K_to)
+    Yhat <- normObj_result$Yhat
+    AIC <- normObj_result$AIC
+    BIC <- normObj_result$BIC
+    RSS <- normObj_result$RSS
+    K <- normObj_result$K
 
     ###################################################
-    ### code chunk number 5: gcmapp1
+    ### code chunk number 11: segment1
     ###################################################
-    gcmapp1_result <- gcmapp1(chr, ref)
-    gc <- gcmapp1_result$gc
-
-    for (i in 1:length(reference_sample_set)) {
-      if (reference_sample_set[[i]] == '') {
-        next()
-      }
-      samples <- unlist(strsplit(reference_sample_set[[i]], ','))
-      actual_sample <- samples[1]
-      reference_samples <- samples[-1]
-      samples <- sort(samples)
-      Y_subset <- Y[,samples]
-
-      ###################################################
-      ### code chunk number 7: normObj1
-      ###################################################
-      normObj_result <- normObj1(Y_subset, gc, K = K_from:K_to)
-      Yhat <- normObj_result$Yhat
-      AIC <- normObj_result$AIC
-      BIC <- normObj_result$BIC
-      RSS <- normObj_result$RSS
-      K <- normObj_result$K
-
-      ###################################################
-      ### code chunk number 11: segment1
-      ###################################################
-      finalcallIt <- segment1(Y_subset, Yhat, K[which.max(BIC)], K, samples,
-                              ref, chr, lmax, mode = "integer")$finalcall
-      finalcallIt <- finalcallIt[finalcallIt[,"sample_name"] == actual_sample,]
-      if (nrow(finalcall)==0){finalcall <- matrix(nrow=0, ncol=ncol(finalcallIt))}
-      finalcall <- rbind(finalcall, finalcallIt)
-      print(finalcall)
-    }
+    finalcallIt <- segment1(Y_subset, Yhat, K[which.max(BIC)], K, samples,
+                            ref, targets[1,'chr'], lmax, mode = "integer")$finalcall
+    finalcallIt <- finalcallIt[finalcallIt[,"sample_name"] == actual_sample,]
+    if (nrow(finalcall)==0){finalcall <- matrix(nrow=0, ncol=ncol(finalcallIt))}
+    finalcall <- rbind(finalcall, finalcallIt)
+    print(finalcall)
   }
   finalcall <- unify_calls_format(finalcall)$finalcall
   write.csv(finalcall, output_calls_file, row.names=F)
diff --git a/build.sh b/build.sh
index a8668ac..6a3bee1 100755
--- a/build.sh
+++ b/build.sh
@@ -34,7 +34,7 @@ do
   diffTs=`echo "$(date +%s) - $(git log -n 1 --pretty=format:%at ${dir})" | bc`
   if [ $diffTs -lt $MAX_COMMIT_TS_DIFF ]; then
     cd $dir
-    if [[ ${image} == "biodatageeks/cnv-opt-target-qc" ]] || [[ ${image} == "biodatageeks/cnv-opt-reference-sample-set-selector" ]] || [[ ${image} == "biodatageeks/cnv-opt-exomedepthcov" ]]; then
+    if [[ ${image} == "biodatageeks/cnv-opt-reference-sample-set-selector" ]] || [[ ${image} == "biodatageeks/cnv-opt-codexcov" ]]; then
       echo "Rebuilf of ${image} image forced..."
       docker build --build-arg CACHE_DATE=$(date +%Y-%m-%d:%H:%M:%S) -t $image:$version .
       docker build --build-arg CACHE_DATE=$(date +%Y-%m-%d:%H:%M:%S) -t $image:latest .

From 100b6f19c6e92fe5c0302e07a559fcf9880fe9b3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Wiktor=20Ku=C5=9Bmirek?= <kusmirekwiktor@gmail.com>
Date: Fri, 30 Mar 2018 16:04:02 +0200
Subject: [PATCH 056/114] first version of new interface to CANOESCOV package

---
 R/CANOESCOV/R/functions_CANOESCOV.R | 13 -----
 R/CANOESCOV/R/run_CANOESCOV.R       | 80 ++++++++++++++---------------
 R/CODEXCOV/R/run_CODEXCOV.R         |  1 -
 3 files changed, 38 insertions(+), 56 deletions(-)

diff --git a/R/CANOESCOV/R/functions_CANOESCOV.R b/R/CANOESCOV/R/functions_CANOESCOV.R
index e909d2e..f3195a7 100644
--- a/R/CANOESCOV/R/functions_CANOESCOV.R
+++ b/R/CANOESCOV/R/functions_CANOESCOV.R
@@ -1,17 +1,4 @@
 
-coverageObj1 <- function(cov_table, sampname, targets_for_chr, chr){
-  Y <- matrix(data=as.integer(0), nrow = nrow(targets_for_chr), ncol = 0)
-  for(sample in sampname) {
-    cov_targets_for_sample <- cov_table[cov_table[,"sample_name"] == sample,]
-    cov_targets_for_sample <- cov_targets_for_sample[with(cov_targets_for_sample, order(target_id)), ]
-    Y <- cbind(Y, cov_targets_for_sample[,"read_count"])
-  }
-  colnames(Y) <- sampname
-  rownames(Y) <- targets_for_chr[,"target_id"]
-  return(list(Y=Y))
-}
-
-
 # from CODEX package
 getgc <- function(chr, ref) {
   library(GenomeInfoDb)
diff --git a/R/CANOESCOV/R/run_CANOESCOV.R b/R/CANOESCOV/R/run_CANOESCOV.R
index f85e619..403ab35 100644
--- a/R/CANOESCOV/R/run_CANOESCOV.R
+++ b/R/CANOESCOV/R/run_CANOESCOV.R
@@ -1,59 +1,55 @@
 library(methods)
 
 run_CANOESCOV <- function(input_cov_table,
+                          input_bed,
                           reference_sample_set_file,
                           output_calls_file){
 
   con <- file(reference_sample_set_file, open='r')
   reference_sample_set <- readLines(con)
-  cov_table <- read.csv(input_cov_table)
-  sampname <- unique(cov_table[,"sample_name"])
-  targets <- cov_table[,c("target_id", "chr", "pos_min", "pos_max")]
-  targets <- targets[!duplicated(targets[,"target_id"]),]
-  targets <- targets[with(targets, order(target_id)), ]
+  Y <- read.csv(input_cov_table)
+  targets <- read.delim(input_bed)
+  rownames(Y) <- 1:nrow(Y)
+  rownames(targets) <- 1:nrow(targets)
   
   calls <- data.frame(matrix(nrow=0, ncol=13))
-  chrs <- c(1:22, "X", "Y", paste0("chr",c(1:22, "X", "Y")))
-  for(chr in chrs) {
-    targets_for_chr <- targets[targets[,"chr"] == chr,]
-    ref <- IRanges(start = targets_for_chr[,"pos_min"], end = targets_for_chr[,"pos_max"])
-    if (length(ref) == 0) {    # 0 elements for specified chromosome in bed
-      next()
-    }
-    Y <- coverageObj1(cov_table, sampname, targets_for_chr, chr)$Y
-    Y <- cbind(rep(chr, nrow(Y)), start(ref), end(ref), Y)
-    target_length <- c()
-    for (i in 1:nrow(Y)) {
-      target_length <- c(target_length, width(ref[i]))
-    }
+  chr <- targets[1,'chr']
+  ref <- IRanges(start = targets[,"st_bp"], end = targets[,"ed_bp"])
+  if (length(ref) == 0) {    # 0 elements for specified chromosome in bed
+    next()
+  }
+  Y <- cbind(rep(chr, nrow(Y)), start(ref), end(ref), Y)
+  target_length <- c()
+  for (i in 1:nrow(Y)) {
+    target_length <- c(target_length, width(ref[i]))
+  }
 
-    # TODO better transformation
-    write.table(Y, file=paste('cov_', chr, '.tsv', sep=""), quote=FALSE, sep="\t", col.names = F, row.names = F)
-    canoes.reads <- read.table(paste('cov_', chr, '.tsv', sep=""))
+  # TODO better transformation
+  write.table(Y, file=paste('cov_', chr, '.tsv', sep=""), quote=FALSE, sep="\t", col.names = F, row.names = F)
+  canoes.reads <- read.table(paste('cov_', chr, '.tsv', sep=""))
 
-    gc <- getgc(chr, ref)
-    target <- seq(1, nrow(Y))
-    canoes.reads <- cbind(target, gc, canoes.reads)
-    sampname <- as.vector(sampname)
-    names(canoes.reads) <- c("target", "gc", "chromosome", "start", "end", sampname)
-    colnames(canoes.reads) <- c("target", "gc", "chromosome", "start", "end", sampname)
-    write.table(as.data.frame(canoes.reads),file="canoes.reads.csv", quote=F, sep=",",row.names=T,col.names=T)
-    xcnv.list <- vector('list', length(sampname))
-    for (i in 1:length(reference_sample_set)) {
-      if (reference_sample_set[[i]] == '') {
-        next()
-      }
-      samples <- unlist(strsplit(reference_sample_set[[i]], ','))
-      actual_sample <- samples[1]
-      reference_samples <- samples[-1]
-      xcnv.list[[i]] <- CANOESCOV::CallCNVs(sample.name=actual_sample,
-                                            reference.samples=reference_samples,
-                                            counts=canoes.reads)
+  gc <- getgc(chr, ref)
+  target <- seq(1, nrow(Y))
+  canoes.reads <- cbind(target, gc, canoes.reads)
+  sampname <- as.vector(sampname)
+  names(canoes.reads) <- c("target", "gc", "chromosome", "start", "end", sampname)
+  colnames(canoes.reads) <- c("target", "gc", "chromosome", "start", "end", sampname)
+  write.table(as.data.frame(canoes.reads),file="canoes.reads.csv", quote=F, sep=",",row.names=T,col.names=T)
+  xcnv.list <- vector('list', length(sampname))
+  for (i in 1:length(reference_sample_set)) {
+    if (reference_sample_set[[i]] == '') {
+      next()
     }
-    xcnvs <- do.call('rbind', xcnv.list)
-    if (nrow(calls)==0){calls <- matrix(nrow=0, ncol=ncol(xcnvs))} 
-    calls <- rbind(calls, xcnvs)
+    samples <- unlist(strsplit(reference_sample_set[[i]], ','))
+    actual_sample <- samples[1]
+    reference_samples <- samples[-1]
+    xcnv.list[[i]] <- CANOESCOV::CallCNVs(sample.name=actual_sample,
+                                          reference.samples=reference_samples,
+                                          counts=canoes.reads)
   }
+  xcnvs <- do.call('rbind', xcnv.list)
+  if (nrow(calls)==0){calls <- matrix(nrow=0, ncol=ncol(xcnvs))} 
+  calls <- rbind(calls, xcnvs)
 
   # unify results format
   if (nrow(calls) != 0) {
diff --git a/R/CODEXCOV/R/run_CODEXCOV.R b/R/CODEXCOV/R/run_CODEXCOV.R
index cbb5d0a..0d8fa58 100644
--- a/R/CODEXCOV/R/run_CODEXCOV.R
+++ b/R/CODEXCOV/R/run_CODEXCOV.R
@@ -17,7 +17,6 @@ run_CODEXCOV <- function(K_from,
   con <- file(reference_sample_set_file, open='r')
   reference_sample_set <- readLines(con)
   Y <- read.csv(input_cov_table)
-  sampname <- colnames(Y)
   targets <- read.delim(input_bed)
   rownames(Y) <- 1:nrow(Y)
   rownames(targets) <- 1:nrow(targets)

From b7ff8056ceabf1af0af9bb25ef164adcb66f8deb Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Wiktor=20Ku=C5=9Bmirek?= <kusmirekwiktor@gmail.com>
Date: Fri, 30 Mar 2018 16:18:31 +0200
Subject: [PATCH 057/114] force to rebuild CANOESCOV package

---
 Docker/cnv-opt-canoescov/Dockerfile     | 2 ++
 Docker/cnv-opt-exomedepthcov/Dockerfile | 2 --
 Docker/cnv-opt-target-qc/Dockerfile     | 2 --
 build.sh                                | 2 +-
 4 files changed, 3 insertions(+), 5 deletions(-)

diff --git a/Docker/cnv-opt-canoescov/Dockerfile b/Docker/cnv-opt-canoescov/Dockerfile
index fa7683b..06d4dea 100644
--- a/Docker/cnv-opt-canoescov/Dockerfile
+++ b/Docker/cnv-opt-canoescov/Dockerfile
@@ -8,4 +8,6 @@ RUN Rscript -e "source('https://bioconductor.org/biocLite.R');biocLite('Rsamtool
 RUN Rscript -e "source('https://bioconductor.org/biocLite.R');biocLite('GenomeInfoDb')"
 RUN Rscript -e "source('https://bioconductor.org/biocLite.R');biocLite('S4Vectors')"
 
+RUN pwd
+
 RUN Rscript -e "install.packages('CANOESCOV', repos = 'http://zsibio.ii.pw.edu.pl/nexus/repository/r-all')"
diff --git a/Docker/cnv-opt-exomedepthcov/Dockerfile b/Docker/cnv-opt-exomedepthcov/Dockerfile
index 9d2d135..28448f8 100644
--- a/Docker/cnv-opt-exomedepthcov/Dockerfile
+++ b/Docker/cnv-opt-exomedepthcov/Dockerfile
@@ -1,7 +1,5 @@
 FROM biodatageeks/cnv-opt-exomedepth
 MAINTAINER biodatageeks <team@biodatageeks.ii.pw.edu.pl>
 
-ARG CACHE_DATE=not_a_date
-
 RUN Rscript -e "install.packages('EXOMEDEPTHCOV', repos = 'http://zsibio.ii.pw.edu.pl/nexus/repository/r-all')"
 
diff --git a/Docker/cnv-opt-target-qc/Dockerfile b/Docker/cnv-opt-target-qc/Dockerfile
index e78cc1a..8391b91 100644
--- a/Docker/cnv-opt-target-qc/Dockerfile
+++ b/Docker/cnv-opt-target-qc/Dockerfile
@@ -1,6 +1,4 @@
 FROM biodatageeks/cnv-opt-codex
 MAINTAINER biodatageeks <team@biodatageeks.ii.pw.edu.pl>
 
-ARG CACHE_DATE=not_a_date
-
 RUN Rscript -e "install.packages('TARGET.QC', repos = 'http://zsibio.ii.pw.edu.pl/nexus/repository/r-all')"
diff --git a/build.sh b/build.sh
index 6a3bee1..797e83c 100755
--- a/build.sh
+++ b/build.sh
@@ -35,7 +35,7 @@ do
   if [ $diffTs -lt $MAX_COMMIT_TS_DIFF ]; then
     cd $dir
     if [[ ${image} == "biodatageeks/cnv-opt-reference-sample-set-selector" ]] || [[ ${image} == "biodatageeks/cnv-opt-codexcov" ]]; then
-      echo "Rebuilf of ${image} image forced..."
+      echo "Rebuild of ${image} image forced..."
       docker build --build-arg CACHE_DATE=$(date +%Y-%m-%d:%H:%M:%S) -t $image:$version .
       docker build --build-arg CACHE_DATE=$(date +%Y-%m-%d:%H:%M:%S) -t $image:latest .
     else

From f6c966f3c47d61703e2785405687fc94e83d022e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Wiktor=20Ku=C5=9Bmirek?= <kusmirekwiktor@gmail.com>
Date: Fri, 30 Mar 2018 17:51:08 +0200
Subject: [PATCH 058/114] bugfix

---
 R/CANOESCOV/R/run_CANOESCOV.R | 1 +
 1 file changed, 1 insertion(+)

diff --git a/R/CANOESCOV/R/run_CANOESCOV.R b/R/CANOESCOV/R/run_CANOESCOV.R
index 403ab35..62e2cfd 100644
--- a/R/CANOESCOV/R/run_CANOESCOV.R
+++ b/R/CANOESCOV/R/run_CANOESCOV.R
@@ -8,6 +8,7 @@ run_CANOESCOV <- function(input_cov_table,
   con <- file(reference_sample_set_file, open='r')
   reference_sample_set <- readLines(con)
   Y <- read.csv(input_cov_table)
+  sampname <- colnames(Y)
   targets <- read.delim(input_bed)
   rownames(Y) <- 1:nrow(Y)
   rownames(targets) <- 1:nrow(targets)

From e4b6f80b1a248cdc424103e23175c73771f11911 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Wiktor=20Ku=C5=9Bmirek?= <kusmirekwiktor@gmail.com>
Date: Fri, 30 Mar 2018 17:53:49 +0200
Subject: [PATCH 059/114] forcing rebuilding only specified dockers - to speed
 up dev process

---
 build.sh | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/build.sh b/build.sh
index 797e83c..bcf6c28 100755
--- a/build.sh
+++ b/build.sh
@@ -36,11 +36,11 @@ do
     cd $dir
     if [[ ${image} == "biodatageeks/cnv-opt-reference-sample-set-selector" ]] || [[ ${image} == "biodatageeks/cnv-opt-codexcov" ]]; then
       echo "Rebuild of ${image} image forced..."
-      docker build --build-arg CACHE_DATE=$(date +%Y-%m-%d:%H:%M:%S) -t $image:$version .
-      docker build --build-arg CACHE_DATE=$(date +%Y-%m-%d:%H:%M:%S) -t $image:latest .
+      docker build --no-cache --build-arg CACHE_DATE=$(date +%Y-%m-%d:%H:%M:%S) -t $image:$version .
+      docker build --no-cache --build-arg CACHE_DATE=$(date +%Y-%m-%d:%H:%M:%S) -t $image:latest .
     else
-      docker build --no-cache -t $image:$version .
-      docker build --no-cache -t $image:latest .
+      docker build -t $image:$version .
+      docker build -t $image:latest .
     fi
     if [[ ${BUILD_MODE} != "local" ]]; then
       docker push docker.io/$image:latest

From 7c0447ff46984d71af2e8d2a8b7284a5b6edc828 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Wiktor=20Ku=C5=9Bmirek?= <kusmirekwiktor@gmail.com>
Date: Fri, 30 Mar 2018 22:56:44 +0200
Subject: [PATCH 060/114] coverage table as matrix

---
 .../R/run_REFERENCE.SAMPLE.SET.SELECTOR.R                       | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/R/REFERENCE.SAMPLE.SET.SELECTOR/R/run_REFERENCE.SAMPLE.SET.SELECTOR.R b/R/REFERENCE.SAMPLE.SET.SELECTOR/R/run_REFERENCE.SAMPLE.SET.SELECTOR.R
index 37d63e5..bed1ec8 100644
--- a/R/REFERENCE.SAMPLE.SET.SELECTOR/R/run_REFERENCE.SAMPLE.SET.SELECTOR.R
+++ b/R/REFERENCE.SAMPLE.SET.SELECTOR/R/run_REFERENCE.SAMPLE.SET.SELECTOR.R
@@ -4,7 +4,7 @@ run_REFERENCE.SAMPLE.SET.SELECTOR <- function(select_method,
                                               input_bed,
                                               output_reference_file){
 
-  Y <- read.csv(input_cov_table)
+  Y <- data.matrix(read.csv(input_cov_table))
   sampname <- colnames(Y)
   targets <- read.delim(input_bed)
   target_length <- targets[,"st_bp"] - targets[,"ed_bp"]

From 820a7512736e7d4b76a8e90f516ad85d7a023a1b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Wiktor=20Ku=C5=9Bmirek?= <kusmirekwiktor@gmail.com>
Date: Fri, 30 Mar 2018 23:11:17 +0200
Subject: [PATCH 061/114] code clean up

---
 Docker/cnv-opt-canoescov/Dockerfile | 2 --
 build.sh                            | 2 +-
 2 files changed, 1 insertion(+), 3 deletions(-)

diff --git a/Docker/cnv-opt-canoescov/Dockerfile b/Docker/cnv-opt-canoescov/Dockerfile
index 06d4dea..fa7683b 100644
--- a/Docker/cnv-opt-canoescov/Dockerfile
+++ b/Docker/cnv-opt-canoescov/Dockerfile
@@ -8,6 +8,4 @@ RUN Rscript -e "source('https://bioconductor.org/biocLite.R');biocLite('Rsamtool
 RUN Rscript -e "source('https://bioconductor.org/biocLite.R');biocLite('GenomeInfoDb')"
 RUN Rscript -e "source('https://bioconductor.org/biocLite.R');biocLite('S4Vectors')"
 
-RUN pwd
-
 RUN Rscript -e "install.packages('CANOESCOV', repos = 'http://zsibio.ii.pw.edu.pl/nexus/repository/r-all')"
diff --git a/build.sh b/build.sh
index bcf6c28..fc80400 100755
--- a/build.sh
+++ b/build.sh
@@ -34,7 +34,7 @@ do
   diffTs=`echo "$(date +%s) - $(git log -n 1 --pretty=format:%at ${dir})" | bc`
   if [ $diffTs -lt $MAX_COMMIT_TS_DIFF ]; then
     cd $dir
-    if [[ ${image} == "biodatageeks/cnv-opt-reference-sample-set-selector" ]] || [[ ${image} == "biodatageeks/cnv-opt-codexcov" ]]; then
+    if [[ ${image} == "biodatageeks/cnv-opt-codexcov" ]]; then
       echo "Rebuild of ${image} image forced..."
       docker build --no-cache --build-arg CACHE_DATE=$(date +%Y-%m-%d:%H:%M:%S) -t $image:$version .
       docker build --no-cache --build-arg CACHE_DATE=$(date +%Y-%m-%d:%H:%M:%S) -t $image:latest .

From f2646eaa1e7ce3d090759f638cdb9811b115983b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Wiktor=20Ku=C5=9Bmirek?= <kusmirekwiktor@gmail.com>
Date: Sun, 1 Apr 2018 14:46:47 +0200
Subject: [PATCH 062/114] bugfix

---
 R/CODEXCOV/R/run_CODEXCOV.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/R/CODEXCOV/R/run_CODEXCOV.R b/R/CODEXCOV/R/run_CODEXCOV.R
index 0d8fa58..e5565ae 100644
--- a/R/CODEXCOV/R/run_CODEXCOV.R
+++ b/R/CODEXCOV/R/run_CODEXCOV.R
@@ -38,7 +38,7 @@ run_CODEXCOV <- function(K_from,
     actual_sample <- samples[1]
     reference_samples <- samples[-1]
     samples <- sort(samples)
-    Y_subset <- Y[,samples]
+    Y_subset <- as.matrix(Y[,samples])
 
     ###################################################
     ### code chunk number 7: normObj1

From f89d041567459035a3f07739348916251fe46326 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Wiktor=20Ku=C5=9Bmirek?= <kusmirekwiktor@gmail.com>
Date: Sun, 1 Apr 2018 14:51:01 +0200
Subject: [PATCH 063/114] force to reload CODEXCOV package

---
 Docker/cnv-opt-codexcov/Dockerfile |  2 ++
 build.sh                           | 12 ++++++------
 2 files changed, 8 insertions(+), 6 deletions(-)

diff --git a/Docker/cnv-opt-codexcov/Dockerfile b/Docker/cnv-opt-codexcov/Dockerfile
index f06904c..93989f6 100644
--- a/Docker/cnv-opt-codexcov/Dockerfile
+++ b/Docker/cnv-opt-codexcov/Dockerfile
@@ -3,4 +3,6 @@ MAINTAINER biodatageeks <team@biodatageeks.ii.pw.edu.pl>
 
 ARG CACHE_DATE=not_a_date
 
+RUN pwd
+
 RUN Rscript -e "install.packages('CODEXCOV', repos = 'http://zsibio.ii.pw.edu.pl/nexus/repository/r-all')"
diff --git a/build.sh b/build.sh
index fc80400..d9b6ccf 100755
--- a/build.sh
+++ b/build.sh
@@ -34,14 +34,14 @@ do
   diffTs=`echo "$(date +%s) - $(git log -n 1 --pretty=format:%at ${dir})" | bc`
   if [ $diffTs -lt $MAX_COMMIT_TS_DIFF ]; then
     cd $dir
-    if [[ ${image} == "biodatageeks/cnv-opt-codexcov" ]]; then
-      echo "Rebuild of ${image} image forced..."
-      docker build --no-cache --build-arg CACHE_DATE=$(date +%Y-%m-%d:%H:%M:%S) -t $image:$version .
-      docker build --no-cache --build-arg CACHE_DATE=$(date +%Y-%m-%d:%H:%M:%S) -t $image:latest .
-    else
+    #if [[ ${image} == "biodatageeks/cnv-opt-codexcov" ]]; then
+    #  echo "Rebuild of ${image} image forced..."
+    #  docker build --no-cache --build-arg CACHE_DATE=$(date +%Y-%m-%d:%H:%M:%S) -t $image:$version .
+    #  docker build --no-cache --build-arg CACHE_DATE=$(date +%Y-%m-%d:%H:%M:%S) -t $image:latest .
+    #else
       docker build -t $image:$version .
       docker build -t $image:latest .
-    fi
+    #fi
     if [[ ${BUILD_MODE} != "local" ]]; then
       docker push docker.io/$image:latest
       docker push docker.io/$image:$version

From 7b9c82dd177cb63fc451feaa5557cc0613e75365 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Wiktor=20Ku=C5=9Bmirek?= <kusmirekwiktor@gmail.com>
Date: Sun, 1 Apr 2018 18:36:56 +0200
Subject: [PATCH 064/114] dags for codex, canoes and exomedepth callers

---
 Docker/cnv-opt-codexcov/Dockerfile |  4 ----
 airflow/dags/canoes.py             | 15 +++++++++++----
 airflow/dags/codex.py              | 12 +++++++-----
 airflow/dags/exomedepth.py         | 12 +++++++-----
 4 files changed, 25 insertions(+), 18 deletions(-)

diff --git a/Docker/cnv-opt-codexcov/Dockerfile b/Docker/cnv-opt-codexcov/Dockerfile
index 93989f6..258bbf8 100644
--- a/Docker/cnv-opt-codexcov/Dockerfile
+++ b/Docker/cnv-opt-codexcov/Dockerfile
@@ -1,8 +1,4 @@
 FROM biodatageeks/cnv-opt-codex
 MAINTAINER biodatageeks <team@biodatageeks.ii.pw.edu.pl>
 
-ARG CACHE_DATE=not_a_date
-
-RUN pwd
-
 RUN Rscript -e "install.packages('CODEXCOV', repos = 'http://zsibio.ii.pw.edu.pl/nexus/repository/r-all')"
diff --git a/airflow/dags/canoes.py b/airflow/dags/canoes.py
index 7c74842..fe08612 100755
--- a/airflow/dags/canoes.py
+++ b/airflow/dags/canoes.py
@@ -28,19 +28,26 @@
 length_thresh_to = '2000'
 gc_thresh_from = '20'
 gc_thresh_to = '80'
-raw_cov_table = 'input_cov_table.csv'
-qc_cov_table = 'output_cov_table.csv'
+raw_cov_table = 'raw_cov_table.csv'
+qc_cov_table = 'qc_cov_table.csv'
+raw_bed = 'raw_bed.bed'
+qc_bed = 'qc_bed.bed'
 
 ### select reference sample set parameters
 select_method = 'exomedepth' # "canoes", "codex" or "exomedepth"
 num_refs = '30'
 reference_sample_set_file = 'reference_sample_set.csv'
 
+### canoes parameters
+output_calls_file = 'calls.csv'
+
 run_canoes_caller_cmd= " \
 docker pull biodatageeks/cnv-opt-target-qc; \
-docker run --rm -v /tmp:/tmp -w=\"/tmp\" biodatageeks/cnv-opt-target-qc Rscript -e \"library(\'TARGET.QC\');run_TARGET.QC(" + mapp_thresh + "," + cov_thresh_from + "," + cov_thresh_to + "," + length_thresh_from + "," + length_thresh_to + "," + gc_thresh_from + "," + gc_thresh_to + ",'" + raw_cov_table + "','" + qc_cov_table + "')\"; \
+docker run --rm -v /tmp:/tmp -w=\"/tmp\" biodatageeks/cnv-opt-target-qc Rscript -e \"library(\'TARGET.QC\');run_TARGET.QC(" + mapp_thresh + "," + cov_thresh_from + "," + cov_thresh_to + "," + length_thresh_from + "," + length_thresh_to + "," + gc_thresh_from + "," + gc_thresh_to + ",'" + raw_cov_table + "','" + qc_cov_table + "','" + raw_bed + "','" + qc_bed + "')\"; \
 docker pull biodatageeks/cnv-opt-reference-sample-set-selector; \
-docker run --rm -v /tmp:/tmp -w=\"/tmp\" biodatageeks/cnv-opt-reference-sample-set-selector Rscript -e \"library(\'REFERENCE.SAMPLE.SET.SELECTOR\');run_REFERENCE.SAMPLE.SET.SELECTOR('" + select_method + "'," + num_refs + ",'" + qc_cov_table + "','" + reference_sample_set_file + "')\"; \
+docker run --rm -v /tmp:/tmp -w=\"/tmp\" biodatageeks/cnv-opt-reference-sample-set-selector Rscript -e \"library(\'REFERENCE.SAMPLE.SET.SELECTOR\');run_REFERENCE.SAMPLE.SET.SELECTOR('" + select_method + "'," + num_refs + ",'" + qc_cov_table + "','" + qc_bed + "','" + reference_sample_set_file + "')\"; \
+docker pull biodatageeks/cnv-opt-canoescov; \
+docker run --rm -v /tmp:/tmp -w=\"/tmp\" biodatageeks/cnv-opt-canoescov Rscript -e \"library(\'CANOESCOV\');run_CANOESCOV('" + qc_cov_table + "','" + qc_bed + "','" + reference_sample_set_file + "','" + output_calls_file + "')\"; \
 "
 
 run_canoes_caller_task= BashOperator (
diff --git a/airflow/dags/codex.py b/airflow/dags/codex.py
index 1da827f..210b3bb 100755
--- a/airflow/dags/codex.py
+++ b/airflow/dags/codex.py
@@ -28,8 +28,10 @@
 length_thresh_to = '2000'
 gc_thresh_from = '20'
 gc_thresh_to = '80'
-raw_cov_table = 'input_cov_table.csv'
-qc_cov_table = 'output_cov_table.csv'
+raw_cov_table = 'raw_cov_table.csv'
+qc_cov_table = 'qc_cov_table.csv'
+raw_bed = 'raw_bed.bed'
+qc_bed = 'qc_bed.bed'
 
 ### select reference sample set parameters
 select_method = 'exomedepth' # "canoes", "codex" or "exomedepth"
@@ -44,11 +46,11 @@
 
 run_codex_caller_cmd= " \
 docker pull biodatageeks/cnv-opt-target-qc; \
-docker run --rm -v /tmp:/tmp -w=\"/tmp\" biodatageeks/cnv-opt-target-qc Rscript -e \"library(\'TARGET.QC\');run_TARGET.QC(" + mapp_thresh + "," + cov_thresh_from + "," + cov_thresh_to + "," + length_thresh_from + "," + length_thresh_to + "," + gc_thresh_from + "," + gc_thresh_to + ",'" + raw_cov_table + "','" + qc_cov_table + "')\"; \
+docker run --rm -v /tmp:/tmp -w=\"/tmp\" biodatageeks/cnv-opt-target-qc Rscript -e \"library(\'TARGET.QC\');run_TARGET.QC(" + mapp_thresh + "," + cov_thresh_from + "," + cov_thresh_to + "," + length_thresh_from + "," + length_thresh_to + "," + gc_thresh_from + "," + gc_thresh_to + ",'" + raw_cov_table + "','" + qc_cov_table + "','" + raw_bed + "','" + qc_bed + "')\"; \
 docker pull biodatageeks/cnv-opt-reference-sample-set-selector; \
-docker run --rm -v /tmp:/tmp -w=\"/tmp\" biodatageeks/cnv-opt-reference-sample-set-selector Rscript -e \"library(\'REFERENCE.SAMPLE.SET.SELECTOR\');run_REFERENCE.SAMPLE.SET.SELECTOR('" + select_method + "'," + num_refs + ",'" + qc_cov_table + "','" + reference_sample_set_file + "')\"; \
+docker run --rm -v /tmp:/tmp -w=\"/tmp\" biodatageeks/cnv-opt-reference-sample-set-selector Rscript -e \"library(\'REFERENCE.SAMPLE.SET.SELECTOR\');run_REFERENCE.SAMPLE.SET.SELECTOR('" + select_method + "'," + num_refs + ",'" + qc_cov_table + "','" + qc_bed + "','" + reference_sample_set_file + "')\"; \
 docker pull biodatageeks/cnv-opt-codexcov; \
-docker run --rm -v /tmp:/tmp -w=\"/tmp\" biodatageeks/cnv-opt-codexcov Rscript -e \"library(\'CODEXCOV\');run_CODEXCOV(" + k_from + "," + k_to + "," + lmax + ",'" + qc_cov_table + "','" + reference_sample_set_file + "','" + output_calls_file + "')\"; \
+docker run --rm -v /tmp:/tmp -w=\"/tmp\" biodatageeks/cnv-opt-codexcov Rscript -e \"library(\'CODEXCOV\');run_CODEXCOV(" + k_from + "," + k_to + "," + lmax + ",'" + qc_cov_table + "','" + qc_bed  + "','" + reference_sample_set_file + "','" + output_calls_file + "')\"; \
 "
 
 run_codex_caller_task= BashOperator (
diff --git a/airflow/dags/exomedepth.py b/airflow/dags/exomedepth.py
index e8629be..7952c2b 100755
--- a/airflow/dags/exomedepth.py
+++ b/airflow/dags/exomedepth.py
@@ -28,8 +28,10 @@
 length_thresh_to = '2000'
 gc_thresh_from = '20'
 gc_thresh_to = '80'
-raw_cov_table = 'input_cov_table.csv'
-qc_cov_table = 'output_cov_table.csv'
+raw_cov_table = 'raw_cov_table.csv'
+qc_cov_table = 'qc_cov_table.csv'
+raw_bed = 'raw_bed.bed'
+qc_bed = 'qc_bed.bed'
 
 ### select reference sample set parameters
 select_method = 'exomedepth' # "canoes", "codex" or "exomedepth"
@@ -41,11 +43,11 @@
 
 run_exomedepth_caller_cmd= " \
 docker pull biodatageeks/cnv-opt-target-qc; \
-docker run --rm -v /tmp:/tmp -w=\"/tmp\" biodatageeks/cnv-opt-target-qc Rscript -e \"library(\'TARGET.QC\');run_TARGET.QC(" + mapp_thresh + "," + cov_thresh_from + "," + cov_thresh_to + "," + length_thresh_from + "," + length_thresh_to + "," + gc_thresh_from + "," + gc_thresh_to + ",'" + raw_cov_table + "','" + qc_cov_table + "')\"; \
+docker run --rm -v /tmp:/tmp -w=\"/tmp\" biodatageeks/cnv-opt-target-qc Rscript -e \"library(\'TARGET.QC\');run_TARGET.QC(" + mapp_thresh + "," + cov_thresh_from + "," + cov_thresh_to + "," + length_thresh_from + "," + length_thresh_to + "," + gc_thresh_from + "," + gc_thresh_to + ",'" + raw_cov_table + "','" + qc_cov_table + "','" + raw_bed + "','" + qc_bed + "')\"; \
 docker pull biodatageeks/cnv-opt-reference-sample-set-selector; \
-docker run --rm -v /tmp:/tmp -w=\"/tmp\" biodatageeks/cnv-opt-reference-sample-set-selector Rscript -e \"library(\'REFERENCE.SAMPLE.SET.SELECTOR\');run_REFERENCE.SAMPLE.SET.SELECTOR('" + select_method + "'," + num_refs + ",'" + qc_cov_table + "','" + reference_sample_set_file + "')\"; \
+docker run --rm -v /tmp:/tmp -w=\"/tmp\" biodatageeks/cnv-opt-reference-sample-set-selector Rscript -e \"library(\'REFERENCE.SAMPLE.SET.SELECTOR\');run_REFERENCE.SAMPLE.SET.SELECTOR('" + select_method + "'," + num_refs + ",'" + qc_cov_table + "','" + qc_bed + "','" + reference_sample_set_file + "')\"; \
 docker pull biodatageeks/cnv-opt-exomedepthcov; \
-docker run --rm -v /tmp:/tmp -w=\"/tmp\" biodatageeks/cnv-opt-exomedepthcov Rscript -e \"library(\'EXOMEDEPTHCOV\');run_EXOMEDEPTHCOV('" + qc_cov_table + "','" + reference_sample_set_file + "','" + output_calls_file + "')\"; \
+docker run --rm -v /tmp:/tmp -w=\"/tmp\" biodatageeks/cnv-opt-exomedepthcov Rscript -e \"library(\'EXOMEDEPTHCOV\');run_EXOMEDEPTHCOV('" + qc_cov_table + "','" + qc_bed + "','" + reference_sample_set_file + "','" + output_calls_file + "')\"; \
 "
 
 run_exomedepth_caller_task= BashOperator (

From ab5bd6d7fbeb5f1586d47e52e5637af9d0822707 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Wiktor=20Ku=C5=9Bmirek?= <kusmirekwiktor@gmail.com>
Date: Fri, 6 Apr 2018 19:45:36 +0200
Subject: [PATCH 065/114] init version of EXOMECOPYCOV package

---
 Docker/cnv-opt-exomecopy/Dockerfile       |  14 +++
 Docker/cnv-opt-exomecopycov/Dockerfile    |   5 +
 Jenkinsfile                               |   1 +
 R/EXOMECOPYCOV/DESCRIPTION                |  22 ++++
 R/EXOMECOPYCOV/NAMESPACE                  |   2 +
 R/EXOMECOPYCOV/R/functions_EXOMECOPYCOV.R |  20 ++++
 R/EXOMECOPYCOV/R/run_EXOMECOPYCOV.R       | 134 ++++++++++++++++++++++
 7 files changed, 198 insertions(+)
 create mode 100644 Docker/cnv-opt-exomecopy/Dockerfile
 create mode 100644 Docker/cnv-opt-exomecopycov/Dockerfile
 create mode 100644 R/EXOMECOPYCOV/DESCRIPTION
 create mode 100644 R/EXOMECOPYCOV/NAMESPACE
 create mode 100644 R/EXOMECOPYCOV/R/functions_EXOMECOPYCOV.R
 create mode 100644 R/EXOMECOPYCOV/R/run_EXOMECOPYCOV.R

diff --git a/Docker/cnv-opt-exomecopy/Dockerfile b/Docker/cnv-opt-exomecopy/Dockerfile
new file mode 100644
index 0000000..777cc0d
--- /dev/null
+++ b/Docker/cnv-opt-exomecopy/Dockerfile
@@ -0,0 +1,14 @@
+FROM ubuntu:xenial
+MAINTAINER biodatageeks <team@biodatageeks.ii.pw.edu.pl>
+
+# refresh the index and install in the same layer so a cached, stale index is never reused
+RUN apt-get update && \
+    apt-get install -y software-properties-common apt-transport-https
+RUN apt-key adv --keyserver keyserver.ubuntu.com --recv-keys E298A3A825C0D65DFD57CBB651716619E084DAB9
+RUN add-apt-repository 'deb [arch=amd64,i386] https://cran.rstudio.com/bin/linux/ubuntu xenial/'
+
+RUN apt-get update && \
+    apt-get upgrade -y && \
+    apt-get install -y r-base libssl-dev libssh2-1-dev libxml2-dev libcurl4-openssl-dev libpq-dev
+
+RUN Rscript -e "source('https://bioconductor.org/biocLite.R');biocLite('IRanges');biocLite('exomeCopy')"
diff --git a/Docker/cnv-opt-exomecopycov/Dockerfile b/Docker/cnv-opt-exomecopycov/Dockerfile
new file mode 100644
index 0000000..11b58f5
--- /dev/null
+++ b/Docker/cnv-opt-exomecopycov/Dockerfile
@@ -0,0 +1,5 @@
+FROM biodatageeks/cnv-opt-exomecopycov
+MAINTAINER biodatageeks <team@biodatageeks.ii.pw.edu.pl>
+
+RUN Rscript -e "install.packages('EXOMECOPYCOV', repos = 'http://zsibio.ii.pw.edu.pl/nexus/repository/r-all')"
+
diff --git a/Jenkinsfile b/Jenkinsfile
index fc11b6f..70682e1 100644
--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -21,6 +21,7 @@ pipeline {
                                  sh "cd R && R CMD build REFERENCE.SAMPLE.SET.SELECTOR/ && curl -v --user ${NEXUS_USER}:${NEXUS_PASS} --upload-file REFERENCE.SAMPLE.SET.SELECTOR_0.0.1.tar.gz http://zsibio.ii.pw.edu.pl/nexus/repository/r-zsibio/src/contrib/REFERENCE.SAMPLE.SET.SELECTOR_0.0.1.tar.gz"
                                  sh "cd R && R CMD build CODEXCOV/ && curl -v --user ${NEXUS_USER}:${NEXUS_PASS} --upload-file CODEXCOV_0.0.1.tar.gz http://zsibio.ii.pw.edu.pl/nexus/repository/r-zsibio/src/contrib/CODEXCOV_0.0.1.tar.gz"
                                  sh "cd R && R CMD build EXOMEDEPTHCOV/ && curl -v --user ${NEXUS_USER}:${NEXUS_PASS} --upload-file EXOMEDEPTHCOV_0.0.1.tar.gz http://zsibio.ii.pw.edu.pl/nexus/repository/r-zsibio/src/contrib/EXOMEDEPTHCOV_0.0.1.tar.gz"
+                                 sh "cd R && R CMD build EXOMECOPYCOV/ && curl -v --user ${NEXUS_USER}:${NEXUS_PASS} --upload-file EXOMECOPYCOV_0.0.1.tar.gz http://zsibio.ii.pw.edu.pl/nexus/repository/r-zsibio/src/contrib/EXOMECOPYCOV_0.0.1.tar.gz"
                                  sh "cd R && R CMD build CANOESCOV/ && curl -v --user ${NEXUS_USER}:${NEXUS_PASS} --upload-file CANOESCOV_0.0.1.tar.gz http://zsibio.ii.pw.edu.pl/nexus/repository/r-zsibio/src/contrib/CANOESCOV_0.0.1.tar.gz"
                                  sh "cd R && R CMD build CANOES/ && curl -v --user ${NEXUS_USER}:${NEXUS_PASS} --upload-file CANOES_0.0.1.tar.gz http://zsibio.ii.pw.edu.pl/nexus/repository/r-zsibio/src/contrib/CANOES_0.0.1.tar.gz"
                                  sh "cd R && R CMD build CNVCALLER.RUNNER/ && curl -v --user ${NEXUS_USER}:${NEXUS_PASS} --upload-file CNVCALLER.RUNNER_0.0.1.tar.gz http://zsibio.ii.pw.edu.pl/nexus/repository/r-zsibio/src/contrib/CNVCALLER.RUNNER_0.0.1.tar.gz"
diff --git a/R/EXOMECOPYCOV/DESCRIPTION b/R/EXOMECOPYCOV/DESCRIPTION
new file mode 100644
index 0000000..c33386d
--- /dev/null
+++ b/R/EXOMECOPYCOV/DESCRIPTION
@@ -0,0 +1,22 @@
+Package: EXOMECOPYCOV
+Title: EXOMECOPY Package With Interface To External Coverage File
+Version: 0.0.1
+Authors@R: c(
+    person("Tomasz", "Gambin", email = "tgambin@gmail.com", role = c("aut", "cre")),
+    person("Marek", "Wiewiórka", email = "marek.wiewiorka@gmail.com", role = c("aut")),
+    person("Wiktor", "Kuśmirek", email = "kusmirekwiktor@gmail.com", role = c("aut")))
+Description: An extended implementation of the exomeCopy package in R. It extends 
+    original implementation by using external coverage file, which should
+    speed up calculations for running application with multiple sets of input
+    parameters.
+Depends:
+    R (>= 3.2.3),
+    devtools (>= 1.13.2),
+    DBI (== 0.8),
+    optparse (== 1.4.4),
+    IRanges (>= 2.0.0),
+    exomeCopy (== 1.22)
+License: GPL-3
+Encoding: UTF-8
+LazyData: true
+RoxygenNote: 6.0.1.9000
diff --git a/R/EXOMECOPYCOV/NAMESPACE b/R/EXOMECOPYCOV/NAMESPACE
new file mode 100644
index 0000000..884a631
--- /dev/null
+++ b/R/EXOMECOPYCOV/NAMESPACE
@@ -0,0 +1,2 @@
+# Generated by roxygen2: fake comment so roxygen2 overwrites silently.
+exportPattern("^[^\\.]")
diff --git a/R/EXOMECOPYCOV/R/functions_EXOMECOPYCOV.R b/R/EXOMECOPYCOV/R/functions_EXOMECOPYCOV.R
new file mode 100644
index 0000000..e1799d8
--- /dev/null
+++ b/R/EXOMECOPYCOV/R/functions_EXOMECOPYCOV.R
@@ -0,0 +1,20 @@
+
+# from CODEX package: per-range GC content (percent, 2 decimals) of `ref` on hg19 chromosome `chr`
+getgc <- function(chr, ref) {
+  library(GenomeInfoDb)
+  library(BSgenome.Hsapiens.UCSC.hg19)
+  if (chr %in% c("X", "x", "chrX", "chrx")) {
+    chrtemp <- 23
+  } else if (chr %in% c("Y", "y", "chrY", "chry")) {
+    chrtemp <- 24
+  } else {
+    chrtemp <- as.numeric(mapSeqlevels(as.character(chr), "NCBI")[1])
+  }
+  # as.numeric(x[1]) always has length 1; an unmapped name shows up as NA, so fail loudly here
+  if (is.na(chrtemp)) {
+    stop("Chromosome cannot be found in NCBI Homo sapiens database!")
+  }
+  seqs <- Views(unmasked(Hsapiens[[chrtemp]]), ref)
+  af <- alphabetFrequency(seqs, baseOnly = TRUE, as.prob = TRUE)
+  round((af[, "G"] + af[, "C"]) * 100, 2)
+}
diff --git a/R/EXOMECOPYCOV/R/run_EXOMECOPYCOV.R b/R/EXOMECOPYCOV/R/run_EXOMECOPYCOV.R
new file mode 100644
index 0000000..a361fc3
--- /dev/null
+++ b/R/EXOMECOPYCOV/R/run_EXOMECOPYCOV.R
@@ -0,0 +1,134 @@
+library(methods)
+library(exomeCopy)
+
+# Run exomeCopy on a precomputed coverage table (initial, unfinished version).
+#
+# input_cov_table: CSV read with read.csv(); its column names are taken as sample names.
+# input_bed: tab-delimited file read with read.delim(); columns 'chr', 'st_bp' and
+#   'ed_bp' are accessed. Only the chromosome of the first row is used.
+# reference_sample_set_file: read line by line; only the commented-out code uses it.
+# output_calls_file: currently unused, the write.csv() call is commented out.
+#
+# NOTE(review): `next()` below is not inside any loop, and `con` is never closed -
+# both look like defects to address in a follow-up.
+run_EXOMECOPYCOV <- function(input_cov_table,
+                             input_bed,
+                             reference_sample_set_file,
+                             output_calls_file){
+
+  con <- file(reference_sample_set_file, open='r')
+  reference_sample_set <- readLines(con)
+  Y <- read.csv(input_cov_table)
+  sampname <- colnames(Y)
+  targets <- read.delim(input_bed)
+  rownames(Y) <- 1:nrow(Y)
+  rownames(targets) <- 1:nrow(targets)
+  chr <- targets[1,'chr']
+  ref <- IRanges(start = targets[,"st_bp"], end = targets[,"ed_bp"])
+  if (length(ref) == 0) {    # 0 elements for specified chromosome in bed
+    next()
+  }
+  sample.names <- sampname[,1]
+  target <- GRanges(seqname = chr, IRanges(start = start(ref) + 1, end = end(ref)))
+  gc <- getgc(chr, ref)
+
+  rdata <- RangedData(IRanges(start=start(ref), end=end(ref)), space=rep(chr,nrow(Y)), universe="hg19", gc=gc, gc.sq=gc^2)  
+
+  for(sample.name in sample.names) {
+    rdata[[sample.name]] <- Y[,sample.name]
+  }
+
+  rdata[["bg"]] <- generateBackground(sample.names, rdata, median)
+  rdata[["log.bg"]] <- log(rdata$bg + .1) 
+  rdata[["bg.sd"]] <- generateBackground(sample.names, rdata, sd)
+
+  fit.list <- lapply(sample.names, function(sample.name) {
+    lapply(seqlevels(target), function(seq.name) {
+      exomeCopy(rdata, sample.name, X.names = c("log.bg", "gc", "gc.sq"), S = 0:4, d = 2)
+    })
+  })
+  compiled.segments <- compileCopyCountSegments(fit.list)
+  print(compiled.segments)
+
+  # NOTE(review): the compiled segments are only printed; nothing is written to
+  # output_calls_file yet. The commented-out code below refers to CANOESCOV and
+  # canoes.reads - presumably carried over from that package as a template.
+
+  #calls <- data.frame(matrix(nrow=0, ncol=13))
+  #chr <- targets[1,'chr']
+  #ref <- IRanges(start = targets[,"st_bp"], end = targets[,"ed_bp"])
+  #if (length(ref) == 0) {    # 0 elements for specified chromosome in bed
+  #  next()
+  #}
+  #Y <- cbind(rep(chr, nrow(Y)), start(ref), end(ref), Y)
+  #target_length <- c()
+  #for (i in 1:nrow(Y)) {
+  #  target_length <- c(target_length, width(ref[i]))
+  #}
+
+  # TODO better transformation
+  #write.table(Y, file=paste('cov_', chr, '.tsv', sep=""), quote=FALSE, sep="\t", col.names = F, row.names = F)
+  #canoes.reads <- read.table(paste('cov_', chr, '.tsv', sep=""))
+
+  #gc <- getgc(chr, ref)
+  #target <- seq(1, nrow(Y))
+  #canoes.reads <- cbind(target, gc, canoes.reads)
+  #sampname <- as.vector(sampname)
+  #names(canoes.reads) <- c("target", "gc", "chromosome", "start", "end", sampname)
+  #colnames(canoes.reads) <- c("target", "gc", "chromosome", "start", "end", sampname)
+  #write.table(as.data.frame(canoes.reads),file="canoes.reads.csv", quote=F, sep=",",row.names=T,col.names=T)
+  #xcnv.list <- vector('list', length(sampname))
+  #for (i in 1:length(reference_sample_set)) {
+  #  if (reference_sample_set[[i]] == '') {
+  #    next()
+  #  }
+  #  samples <- unlist(strsplit(reference_sample_set[[i]], ','))
+  #  actual_sample <- samples[1]
+  #  reference_samples <- samples[-1]
+  #  xcnv.list[[i]] <- CANOESCOV::CallCNVs(sample.name=actual_sample,
+  #                                        reference.samples=reference_samples,
+  #                                        counts=canoes.reads)
+  #}
+  #xcnvs <- do.call('rbind', xcnv.list)
+  #if (nrow(calls)==0){calls <- matrix(nrow=0, ncol=ncol(xcnvs))} 
+  #calls <- rbind(calls, xcnvs)
+
+  # unify results format
+  #if (nrow(calls) != 0) {
+  #  calls[colnames(calls) == 'CNV'] <- as.character(unlist(calls[colnames(calls) == 'CNV']))
+  #  calls[calls == 'DEL'] <- 'del'
+  #  calls[calls == 'DUP'] <- 'dup'
+  #}
+  #colnames(calls)[colnames(calls) == 'SAMPLE'] <- 'sample_name'
+  #targets <- as.vector(calls[colnames(calls) == 'TARGETS'])
+  #targets <- as.character(unlist(targets))
+  #splitted_targets <- do.call(rbind, strsplit(targets, '..', fixed = TRUE))
+  #calls <- cbind(calls, splitted_targets)
+  #colnames(calls)[colnames(calls) == '1'] <- 'st_exon'
+  #colnames(calls)[colnames(calls) == '2'] <- 'ed_exon'
+  #intervals <- as.vector(calls[colnames(calls) == 'INTERVAL'])
+  #intervals <- as.character(unlist(intervals))
+  #splitted_intervals <- do.call(rbind, strsplit(intervals, c(':'), fixed = TRUE))
+  #intervals <- as.vector(splitted_intervals[,2])
+  #intervals <- as.character(unlist(intervals))
+  #splitted_intervals <- do.call(rbind, strsplit(intervals, c('-'), fixed = TRUE))
+  #calls <- cbind(calls, splitted_intervals)
+  #colnames(calls)[colnames(calls) == '1'] <- 'st_bp'
+  #colnames(calls)[colnames(calls) == '2'] <- 'ed_bp'
+  #colnames(calls)[colnames(calls) == 'CNV'] <- 'cnv'
+  #calls <- calls[,-which(names(calls) %in% c('KB', 'MID_BP', 'NUM_TARG', 'Q_SOME', 'TARGETS', 'INTERVAL'))]
+  #colnames(calls)[colnames(calls) == 'CHR'] <- 'chr'
+  #colnames(calls)[colnames(calls) == 'MLCN'] <- 'copy_no'
+  #calls[colnames(calls) == 'sample_name'] <- as.character(unlist(calls[colnames(calls) == 'sample_name']))
+  #calls[colnames(calls) == 'st_bp'] <- as.character(unlist(calls[colnames(calls) == 'st_bp']))
+  #calls[colnames(calls) == 'ed_bp'] <- as.character(unlist(calls[colnames(calls) == 'ed_bp']))
+  #calls[colnames(calls) == 'st_exon'] <- as.character(unlist(calls[colnames(calls) == 'st_exon']))
+  #calls[colnames(calls) == 'ed_exon'] <- as.character(unlist(calls[colnames(calls) == 'ed_exon']))
+  #write.csv(calls, output_calls_file, row.names=F)
+}
+
+#   SAMPLE CNV             INTERVAL     KB CHR   MID_BP    TARGETS NUM_TARG MLCN Q_SOME
+#1      S2 DEL 22:25713988-25756059 42.071  22 25735024 1132..1137        6    1 99
+#2      S3 DEL 22:24373138-24384231 11.093  22 24378684   936..942        7    0 77
+
+

From d7c1a6a9b07b2945b4075dfa85c6b28b747f83f5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Wiktor=20Ku=C5=9Bmirek?= <kusmirekwiktor@gmail.com>
Date: Fri, 6 Apr 2018 19:57:37 +0200
Subject: [PATCH 066/114] bugfix in Dockerfile

---
 Docker/cnv-opt-exomecopycov/Dockerfile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Docker/cnv-opt-exomecopycov/Dockerfile b/Docker/cnv-opt-exomecopycov/Dockerfile
index 11b58f5..d810948 100644
--- a/Docker/cnv-opt-exomecopycov/Dockerfile
+++ b/Docker/cnv-opt-exomecopycov/Dockerfile
@@ -1,4 +1,4 @@
-FROM biodatageeks/cnv-opt-exomecopycov
+FROM biodatageeks/cnv-opt-exomecopy
 MAINTAINER biodatageeks <team@biodatageeks.ii.pw.edu.pl>
 
 RUN Rscript -e "install.packages('EXOMECOPYCOV', repos = 'http://zsibio.ii.pw.edu.pl/nexus/repository/r-all')"

From 63f76e7d4fdd48cbb58b1a2fa3ac40d92ecc7532 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Wiktor=20Ku=C5=9Bmirek?= <kusmirekwiktor@gmail.com>
Date: Sat, 7 Apr 2018 12:02:51 +0200
Subject: [PATCH 067/114] bugfix in exomeCopy version

---
 R/EXOMECOPYCOV/DESCRIPTION | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/R/EXOMECOPYCOV/DESCRIPTION b/R/EXOMECOPYCOV/DESCRIPTION
index c33386d..54ec59c 100644
--- a/R/EXOMECOPYCOV/DESCRIPTION
+++ b/R/EXOMECOPYCOV/DESCRIPTION
@@ -15,7 +15,7 @@ Depends:
     DBI (== 0.8),
     optparse (== 1.4.4),
     IRanges (>= 2.0.0),
-    exomeCopy (== 1.22)
+    exomeCopy (== 1.24)
 License: GPL-3
 Encoding: UTF-8
 LazyData: true

From ae2061ea2ada9062f44656a7a64398eb43be64d5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Wiktor=20Ku=C5=9Bmirek?= <kusmirekwiktor@gmail.com>
Date: Sat, 7 Apr 2018 12:07:16 +0200
Subject: [PATCH 068/114] force to reload EXOMECOPYCOV package

---
 build.sh | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/build.sh b/build.sh
index d9b6ccf..907919c 100755
--- a/build.sh
+++ b/build.sh
@@ -34,14 +34,14 @@ do
   diffTs=`echo "$(date +%s) - $(git log -n 1 --pretty=format:%at ${dir})" | bc`
   if [ $diffTs -lt $MAX_COMMIT_TS_DIFF ]; then
     cd $dir
-    #if [[ ${image} == "biodatageeks/cnv-opt-codexcov" ]]; then
-    #  echo "Rebuild of ${image} image forced..."
-    #  docker build --no-cache --build-arg CACHE_DATE=$(date +%Y-%m-%d:%H:%M:%S) -t $image:$version .
-    #  docker build --no-cache --build-arg CACHE_DATE=$(date +%Y-%m-%d:%H:%M:%S) -t $image:latest .
-    #else
+    if [[ ${image} == "biodatageeks/cnv-opt-exomecopycov" ]]; then
+      echo "Rebuild of ${image} image forced..."
+      docker build --no-cache --build-arg CACHE_DATE=$(date +%Y-%m-%d:%H:%M:%S) -t $image:$version .
+      docker build --no-cache --build-arg CACHE_DATE=$(date +%Y-%m-%d:%H:%M:%S) -t $image:latest .
+    else
       docker build -t $image:$version .
       docker build -t $image:latest .
-    #fi
+    fi
     if [[ ${BUILD_MODE} != "local" ]]; then
       docker push docker.io/$image:latest
       docker push docker.io/$image:$version

From f21852fe89653bda19e0d2965885543d50ef3c29 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Wiktor=20Ku=C5=9Bmirek?= <kusmirekwiktor@gmail.com>
Date: Sat, 7 Apr 2018 12:11:16 +0200
Subject: [PATCH 069/114] bugfix in forcing reloading package

---
 Docker/cnv-opt-exomecopycov/Dockerfile | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/Docker/cnv-opt-exomecopycov/Dockerfile b/Docker/cnv-opt-exomecopycov/Dockerfile
index d810948..c157714 100644
--- a/Docker/cnv-opt-exomecopycov/Dockerfile
+++ b/Docker/cnv-opt-exomecopycov/Dockerfile
@@ -1,5 +1,7 @@
 FROM biodatageeks/cnv-opt-exomecopy
 MAINTAINER biodatageeks <team@biodatageeks.ii.pw.edu.pl>
 
+ARG CACHE_DATE=unknown
+
 RUN Rscript -e "install.packages('EXOMECOPYCOV', repos = 'http://zsibio.ii.pw.edu.pl/nexus/repository/r-all')"
 

From 5933e4e796d313041656ae08b130325c772d237c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Wiktor=20Ku=C5=9Bmirek?= <kusmirekwiktor@gmail.com>
Date: Sat, 7 Apr 2018 12:18:52 +0200
Subject: [PATCH 070/114] bugfix in sampname type

---
 R/EXOMECOPYCOV/R/run_EXOMECOPYCOV.R | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/R/EXOMECOPYCOV/R/run_EXOMECOPYCOV.R b/R/EXOMECOPYCOV/R/run_EXOMECOPYCOV.R
index a361fc3..79ce779 100644
--- a/R/EXOMECOPYCOV/R/run_EXOMECOPYCOV.R
+++ b/R/EXOMECOPYCOV/R/run_EXOMECOPYCOV.R
@@ -9,7 +9,7 @@ run_EXOMECOPYCOV <- function(input_cov_table,
   con <- file(reference_sample_set_file, open='r')
   reference_sample_set <- readLines(con)
   Y <- read.csv(input_cov_table)
-  sampname <- colnames(Y)
+  sample.names <- colnames(Y)
   targets <- read.delim(input_bed)
   rownames(Y) <- 1:nrow(Y)
   rownames(targets) <- 1:nrow(targets)
@@ -18,7 +18,6 @@ run_EXOMECOPYCOV <- function(input_cov_table,
   if (length(ref) == 0) {    # 0 elements for specified chromosome in bed
     next()
   }
-  sample.names <- sampname[,1]
   target <- GRanges(seqname = chr, IRanges(start = start(ref) + 1, end = end(ref)))
   gc <- getgc(chr, ref)
 

From 9acf2004b2125d52d2785d5833156700e1e10077 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Wiktor=20Ku=C5=9Bmirek?= <kusmirekwiktor@gmail.com>
Date: Sat, 7 Apr 2018 12:28:09 +0200
Subject: [PATCH 071/114] missing libraries

---
 Docker/cnv-opt-exomecopy/Dockerfile    | 3 ++-
 Docker/cnv-opt-exomecopycov/Dockerfile | 3 +++
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/Docker/cnv-opt-exomecopy/Dockerfile b/Docker/cnv-opt-exomecopy/Dockerfile
index 777cc0d..4c14ac5 100644
--- a/Docker/cnv-opt-exomecopy/Dockerfile
+++ b/Docker/cnv-opt-exomecopy/Dockerfile
@@ -11,4 +11,5 @@ RUN apt-get update && \
     apt-get upgrade -y && \
     apt-get install -y r-base libssl-dev libssh2-1-dev libxml2-dev libcurl4-openssl-dev libpq-dev
 
-RUN Rscript -e "source('https://bioconductor.org/biocLite.R');biocLite('IRanges');biocLite('exomeCopy')"
+RUN Rscript -e "source('https://bioconductor.org/biocLite.R');biocLite('IRanges')"
+RUN Rscript -e "source('https://bioconductor.org/biocLite.R');biocLite('exomeCopy')"
diff --git a/Docker/cnv-opt-exomecopycov/Dockerfile b/Docker/cnv-opt-exomecopycov/Dockerfile
index c157714..94bba90 100644
--- a/Docker/cnv-opt-exomecopycov/Dockerfile
+++ b/Docker/cnv-opt-exomecopycov/Dockerfile
@@ -3,5 +3,8 @@ MAINTAINER biodatageeks <team@biodatageeks.ii.pw.edu.pl>
 
 ARG CACHE_DATE=unknown
 
+RUN Rscript -e "source('https://bioconductor.org/biocLite.R');biocLite('GenomeInfoDb')"
+RUN Rscript -e "source('https://bioconductor.org/biocLite.R');biocLite('BSgenome.Hsapiens.UCSC.hg19')"
+
 RUN Rscript -e "install.packages('EXOMECOPYCOV', repos = 'http://zsibio.ii.pw.edu.pl/nexus/repository/r-all')"
 

From e2f5d9a52888f8b0faa543bbd0c12683e147a63c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Wiktor=20Ku=C5=9Bmirek?= <kusmirekwiktor@gmail.com>
Date: Sat, 7 Apr 2018 15:26:24 +0200
Subject: [PATCH 072/114] exomeCopy in 1.22 version

---
 Docker/cnv-opt-exomecopy/Dockerfile | 6 +++++-
 R/EXOMECOPYCOV/DESCRIPTION          | 2 +-
 R/EXOMECOPYCOV/R/run_EXOMECOPYCOV.R | 7 ++++---
 3 files changed, 10 insertions(+), 5 deletions(-)

diff --git a/Docker/cnv-opt-exomecopy/Dockerfile b/Docker/cnv-opt-exomecopy/Dockerfile
index 4c14ac5..2b10782 100644
--- a/Docker/cnv-opt-exomecopy/Dockerfile
+++ b/Docker/cnv-opt-exomecopy/Dockerfile
@@ -9,7 +9,11 @@ RUN apt-get install -y apt-transport-https
 
 RUN apt-get update && \
     apt-get upgrade -y && \
-    apt-get install -y r-base libssl-dev libssh2-1-dev libxml2-dev libcurl4-openssl-dev libpq-dev
+    apt-get install -y r-base libssl-dev libssh2-1-dev libxml2-dev libcurl4-openssl-dev libpq-dev wget
 
 RUN Rscript -e "source('https://bioconductor.org/biocLite.R');biocLite('IRanges')"
+
+RUN wget http://bioconductor.org/packages/3.5/bioc/src/contrib/exomeCopy_1.22.0.tar.gz
+RUN Rscript -e "install.packages('exomeCopy_1.22.0.tar.gz', repos = NULL, type='source')"
+
 RUN Rscript -e "source('https://bioconductor.org/biocLite.R');biocLite('exomeCopy')"
diff --git a/R/EXOMECOPYCOV/DESCRIPTION b/R/EXOMECOPYCOV/DESCRIPTION
index 54ec59c..c33386d 100644
--- a/R/EXOMECOPYCOV/DESCRIPTION
+++ b/R/EXOMECOPYCOV/DESCRIPTION
@@ -15,7 +15,7 @@ Depends:
     DBI (== 0.8),
     optparse (== 1.4.4),
     IRanges (>= 2.0.0),
-    exomeCopy (== 1.24)
+    exomeCopy (== 1.22)
 License: GPL-3
 Encoding: UTF-8
 LazyData: true
diff --git a/R/EXOMECOPYCOV/R/run_EXOMECOPYCOV.R b/R/EXOMECOPYCOV/R/run_EXOMECOPYCOV.R
index 79ce779..a7e9178 100644
--- a/R/EXOMECOPYCOV/R/run_EXOMECOPYCOV.R
+++ b/R/EXOMECOPYCOV/R/run_EXOMECOPYCOV.R
@@ -27,13 +27,14 @@ run_EXOMECOPYCOV <- function(input_cov_table,
     rdata[[sample.name]] <- Y[,sample.name]
   }
 
-  rdata[["bg"]] <- generateBackground(sample.names, rdata, median)
+  rdata[["bg"]] <- width(ref) # generateBackground(sample.names, rdata, median)
   rdata[["log.bg"]] <- log(rdata$bg + .1) 
-  rdata[["bg.sd"]] <- generateBackground(sample.names, rdata, sd)
+  rdata[["width"]] <- width(ref)
 
   fit.list <- lapply(sample.names, function(sample.name) {
     lapply(seqlevels(target), function(seq.name) {
-      exomeCopy(rdata, sample.name, X.names = c("log.bg", "gc", "gc.sq"), S = 0:4, d = 2)
+      print(paste("Processing sample: ", sample.name, sep=""))
+      exomeCopy(rdata, sample.name, X.names = c("log.bg", "gc", "gc.sq", "width"), S = 0:4, d = 2)
     })
   })
   compiled.segments <- compileCopyCountSegments(fit.list)

From 1412d072aa5a3a68fde9b1057ddc1807a76b70d7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Wiktor=20Ku=C5=9Bmirek?= <kusmirekwiktor@gmail.com>
Date: Sat, 7 Apr 2018 15:52:29 +0200
Subject: [PATCH 073/114] missing libraries

---
 Docker/cnv-opt-exomecopy/Dockerfile | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/Docker/cnv-opt-exomecopy/Dockerfile b/Docker/cnv-opt-exomecopy/Dockerfile
index 2b10782..0333434 100644
--- a/Docker/cnv-opt-exomecopy/Dockerfile
+++ b/Docker/cnv-opt-exomecopy/Dockerfile
@@ -12,8 +12,9 @@ RUN apt-get update && \
     apt-get install -y r-base libssl-dev libssh2-1-dev libxml2-dev libcurl4-openssl-dev libpq-dev wget
 
 RUN Rscript -e "source('https://bioconductor.org/biocLite.R');biocLite('IRanges')"
+RUN Rscript -e "source('https://bioconductor.org/biocLite.R');biocLite('GenomicRanges')"
+RUN Rscript -e "source('https://bioconductor.org/biocLite.R');biocLite('Rsamtools')"
+RUN Rscript -e "source('https://bioconductor.org/biocLite.R');biocLite('GenomeInfoDb')"
 
 RUN wget http://bioconductor.org/packages/3.5/bioc/src/contrib/exomeCopy_1.22.0.tar.gz
 RUN Rscript -e "install.packages('exomeCopy_1.22.0.tar.gz', repos = NULL, type='source')"
-
-RUN Rscript -e "source('https://bioconductor.org/biocLite.R');biocLite('exomeCopy')"

From 0e8fa237cac6c19ea46313f37096e4d96b192341 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Wiktor=20Ku=C5=9Bmirek?= <kusmirekwiktor@gmail.com>
Date: Sun, 8 Apr 2018 18:34:31 +0200
Subject: [PATCH 074/114] writing detected CNVs to output file

---
 R/EXOMECOPYCOV/R/functions_EXOMECOPYCOV.R | 14 ++++
 R/EXOMECOPYCOV/R/run_EXOMECOPYCOV.R       | 97 +----------------------
 2 files changed, 17 insertions(+), 94 deletions(-)

diff --git a/R/EXOMECOPYCOV/R/functions_EXOMECOPYCOV.R b/R/EXOMECOPYCOV/R/functions_EXOMECOPYCOV.R
index e1799d8..bcfdd0d 100644
--- a/R/EXOMECOPYCOV/R/functions_EXOMECOPYCOV.R
+++ b/R/EXOMECOPYCOV/R/functions_EXOMECOPYCOV.R
@@ -18,3 +18,17 @@ getgc <- function(chr, ref) {
   gc <- round((af[, "G"] + af[, "C"]) * 100,2)
   gc
 }
+
+unify_calls_format <- function(compiled.segments, chr){
+  calls <- matrix(nrow=length(compiled.segments$sample.name), ncol=7)
+  colnames(calls) <- c('sample_name', 'chr', 'st_bp', 'ed_bp', 'cnv', 'copy_no', 'log_odds')
+  calls[,'sample_name'] <- compiled.segments$sample.name
+  calls[,'chr'] <- rep(chr, nrow(calls))
+  calls[,'st_bp'] <- unlist(start(ranges(compiled.segments)))
+  calls[,'ed_bp'] <- unlist(end(ranges(compiled.segments)))
+  calls[,'copy_no'] <- compiled.segments$copy.count
+  calls[,'cnv'] <- ifelse(calls[,'copy_no'] > 2, 'dup', 'del')
+  calls[,'log_odds'] <- compiled.segments$log.odds
+  calls <- subset(calls, calls[,'copy_no'] != "2")
+  return(list(calls=calls))
+}
diff --git a/R/EXOMECOPYCOV/R/run_EXOMECOPYCOV.R b/R/EXOMECOPYCOV/R/run_EXOMECOPYCOV.R
index a7e9178..1578feb 100644
--- a/R/EXOMECOPYCOV/R/run_EXOMECOPYCOV.R
+++ b/R/EXOMECOPYCOV/R/run_EXOMECOPYCOV.R
@@ -27,7 +27,7 @@ run_EXOMECOPYCOV <- function(input_cov_table,
     rdata[[sample.name]] <- Y[,sample.name]
   }
 
-  rdata[["bg"]] <- width(ref) # generateBackground(sample.names, rdata, median)
+  rdata[["bg"]] <- generateBackground(sample.names, rdata, median)
   rdata[["log.bg"]] <- log(rdata$bg + .1) 
   rdata[["width"]] <- width(ref)
 
@@ -38,97 +38,6 @@ run_EXOMECOPYCOV <- function(input_cov_table,
     })
   })
   compiled.segments <- compileCopyCountSegments(fit.list)
-  print(compiled.segments)
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-  #calls <- data.frame(matrix(nrow=0, ncol=13))
-  #chr <- targets[1,'chr']
-  #ref <- IRanges(start = targets[,"st_bp"], end = targets[,"ed_bp"])
-  #if (length(ref) == 0) {    # 0 elements for specified chromosome in bed
-  #  next()
-  #}
-  #Y <- cbind(rep(chr, nrow(Y)), start(ref), end(ref), Y)
-  #target_length <- c()
-  #for (i in 1:nrow(Y)) {
-  #  target_length <- c(target_length, width(ref[i]))
-  #}
-
-  # TODO better transformation
-  #write.table(Y, file=paste('cov_', chr, '.tsv', sep=""), quote=FALSE, sep="\t", col.names = F, row.names = F)
-  #canoes.reads <- read.table(paste('cov_', chr, '.tsv', sep=""))
-
-  #gc <- getgc(chr, ref)
-  #target <- seq(1, nrow(Y))
-  #canoes.reads <- cbind(target, gc, canoes.reads)
-  #sampname <- as.vector(sampname)
-  #names(canoes.reads) <- c("target", "gc", "chromosome", "start", "end", sampname)
-  #colnames(canoes.reads) <- c("target", "gc", "chromosome", "start", "end", sampname)
-  #write.table(as.data.frame(canoes.reads),file="canoes.reads.csv", quote=F, sep=",",row.names=T,col.names=T)
-  #xcnv.list <- vector('list', length(sampname))
-  #for (i in 1:length(reference_sample_set)) {
-  #  if (reference_sample_set[[i]] == '') {
-  #    next()
-  #  }
-  #  samples <- unlist(strsplit(reference_sample_set[[i]], ','))
-  #  actual_sample <- samples[1]
-  #  reference_samples <- samples[-1]
-  #  xcnv.list[[i]] <- CANOESCOV::CallCNVs(sample.name=actual_sample,
-  #                                        reference.samples=reference_samples,
-  #                                        counts=canoes.reads)
-  #}
-  #xcnvs <- do.call('rbind', xcnv.list)
-  #if (nrow(calls)==0){calls <- matrix(nrow=0, ncol=ncol(xcnvs))} 
-  #calls <- rbind(calls, xcnvs)
-
-  # unify results format
-  #if (nrow(calls) != 0) {
-  #  calls[colnames(calls) == 'CNV'] <- as.character(unlist(calls[colnames(calls) == 'CNV']))
-  #  calls[calls == 'DEL'] <- 'del'
-  #  calls[calls == 'DUP'] <- 'dup'
-  #}
-  #colnames(calls)[colnames(calls) == 'SAMPLE'] <- 'sample_name'
-  #targets <- as.vector(calls[colnames(calls) == 'TARGETS'])
-  #targets <- as.character(unlist(targets))
-  #splitted_targets <- do.call(rbind, strsplit(targets, '..', fixed = TRUE))
-  #calls <- cbind(calls, splitted_targets)
-  #colnames(calls)[colnames(calls) == '1'] <- 'st_exon'
-  #colnames(calls)[colnames(calls) == '2'] <- 'ed_exon'
-  #intervals <- as.vector(calls[colnames(calls) == 'INTERVAL'])
-  #intervals <- as.character(unlist(intervals))
-  #splitted_intervals <- do.call(rbind, strsplit(intervals, c(':'), fixed = TRUE))
-  #intervals <- as.vector(splitted_intervals[,2])
-  #intervals <- as.character(unlist(intervals))
-  #splitted_intervals <- do.call(rbind, strsplit(intervals, c('-'), fixed = TRUE))
-  #calls <- cbind(calls, splitted_intervals)
-  #colnames(calls)[colnames(calls) == '1'] <- 'st_bp'
-  #colnames(calls)[colnames(calls) == '2'] <- 'ed_bp'
-  #colnames(calls)[colnames(calls) == 'CNV'] <- 'cnv'
-  #calls <- calls[,-which(names(calls) %in% c('KB', 'MID_BP', 'NUM_TARG', 'Q_SOME', 'TARGETS', 'INTERVAL'))]
-  #colnames(calls)[colnames(calls) == 'CHR'] <- 'chr'
-  #colnames(calls)[colnames(calls) == 'MLCN'] <- 'copy_no'
-  #calls[colnames(calls) == 'sample_name'] <- as.character(unlist(calls[colnames(calls) == 'sample_name']))
-  #calls[colnames(calls) == 'st_bp'] <- as.character(unlist(calls[colnames(calls) == 'st_bp']))
-  #calls[colnames(calls) == 'ed_bp'] <- as.character(unlist(calls[colnames(calls) == 'ed_bp']))
-  #calls[colnames(calls) == 'st_exon'] <- as.character(unlist(calls[colnames(calls) == 'st_exon']))
-  #calls[colnames(calls) == 'ed_exon'] <- as.character(unlist(calls[colnames(calls) == 'ed_exon']))
-  #write.csv(calls, output_calls_file, row.names=F)
+  calls <- unify_calls_format(compiled.segments, chr)$calls
+  write.csv(calls, output_calls_file, row.names=F)
 }
-
-#   SAMPLE CNV             INTERVAL     KB CHR   MID_BP    TARGETS NUM_TARG MLCN Q_SOME
-#1      S2 DEL 22:25713988-25756059 42.071  22 25735024 1132..1137        6    1 99
-#2      S3 DEL 22:24373138-24384231 11.093  22 24378684   936..942        7    0 77
-
-

From c446c1f0804d7a3797fa6a064ea0271274937aa5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Wiktor=20Ku=C5=9Bmirek?= <kusmirekwiktor@gmail.com>
Date: Mon, 30 Apr 2018 12:30:28 +0200
Subject: [PATCH 075/114] random method of selecting reference sample set

---
 .../R/functions_REFERENCE.SAMPLE.SET.SELECTOR.R            | 7 +++++++
 .../R/run_REFERENCE.SAMPLE.SET.SELECTOR.R                  | 3 +++
 2 files changed, 10 insertions(+)

diff --git a/R/REFERENCE.SAMPLE.SET.SELECTOR/R/functions_REFERENCE.SAMPLE.SET.SELECTOR.R b/R/REFERENCE.SAMPLE.SET.SELECTOR/R/functions_REFERENCE.SAMPLE.SET.SELECTOR.R
index d92bf29..7bc6826 100644
--- a/R/REFERENCE.SAMPLE.SET.SELECTOR/R/functions_REFERENCE.SAMPLE.SET.SELECTOR.R
+++ b/R/REFERENCE.SAMPLE.SET.SELECTOR/R/functions_REFERENCE.SAMPLE.SET.SELECTOR.R
@@ -30,3 +30,10 @@ exomedepth_method <- function(investigated_sample, Y, num_refs, target_length){
   } 
   return(list(reference_samples=reference_samples))
 }
+
+random_method <- function(investigated_sample, Y, num_refs){
+  samples <- colnames(Y)
+  reference_samples <- setdiff(samples, investigated_sample)
+  reference_samples <- reference_samples[sample(1:length(reference_samples), num_refs, replace=F)]
+  return(list(reference_samples=reference_samples))
+}
diff --git a/R/REFERENCE.SAMPLE.SET.SELECTOR/R/run_REFERENCE.SAMPLE.SET.SELECTOR.R b/R/REFERENCE.SAMPLE.SET.SELECTOR/R/run_REFERENCE.SAMPLE.SET.SELECTOR.R
index bed1ec8..08bec95 100644
--- a/R/REFERENCE.SAMPLE.SET.SELECTOR/R/run_REFERENCE.SAMPLE.SET.SELECTOR.R
+++ b/R/REFERENCE.SAMPLE.SET.SELECTOR/R/run_REFERENCE.SAMPLE.SET.SELECTOR.R
@@ -25,6 +25,9 @@ run_REFERENCE.SAMPLE.SET.SELECTOR <- function(select_method,
     } else if(select_method == "clamms") {
       #reference_samples_for_investigated_sample <- clamms_method(investigated_sample, Y, num_refs)$reference_samples
       #reference_samples[[i]] <- c(investigated_sample, reference_samples_for_investigated_sample)
+    } else if(select_method == "random") {
+      reference_samples_for_investigated_sample <- random_method(investigated_sample, Y, num_refs, target_length)$reference_samples
+      reference_samples[[i]] <- c(investigated_sample, reference_samples_for_investigated_sample)
     }
   }
   resultant_string <- ''

From 45e10ba018796ba0599f8cbd5f03be83818d9c8c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Wiktor=20Ku=C5=9Bmirek?= <kusmirekwiktor@gmail.com>
Date: Mon, 30 Apr 2018 12:36:24 +0200
Subject: [PATCH 076/114] reload package for selecting reference sample set

---
 build.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/build.sh b/build.sh
index 907919c..b7313d0 100755
--- a/build.sh
+++ b/build.sh
@@ -34,7 +34,7 @@ do
   diffTs=`echo "$(date +%s) - $(git log -n 1 --pretty=format:%at ${dir})" | bc`
   if [ $diffTs -lt $MAX_COMMIT_TS_DIFF ]; then
     cd $dir
-    if [[ ${image} == "biodatageeks/cnv-opt-exomecopycov" ]]; then
+    if [[ ${image} == "biodatageeks/cnv-opt-reference-sample-set-selector" ]]; then
       echo "Rebuild of ${image} image forced..."
       docker build --no-cache --build-arg CACHE_DATE=$(date +%Y-%m-%d:%H:%M:%S) -t $image:$version .
       docker build --no-cache --build-arg CACHE_DATE=$(date +%Y-%m-%d:%H:%M:%S) -t $image:latest .

From 2856a8db1f2db49927be3bc94fff362f9fbe81d2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Wiktor=20Ku=C5=9Bmirek?= <kusmirekwiktor@gmail.com>
Date: Mon, 30 Apr 2018 12:39:30 +0200
Subject: [PATCH 077/114] bugfix

---
 Docker/cnv-opt-reference-sample-set-selector/Dockerfile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Docker/cnv-opt-reference-sample-set-selector/Dockerfile b/Docker/cnv-opt-reference-sample-set-selector/Dockerfile
index 716a5f4..d854276 100644
--- a/Docker/cnv-opt-reference-sample-set-selector/Dockerfile
+++ b/Docker/cnv-opt-reference-sample-set-selector/Dockerfile
@@ -1,7 +1,7 @@
 FROM biodatageeks/cnv-opt-codex
 MAINTAINER biodatageeks <team@biodatageeks.ii.pw.edu.pl>
 
-ARG CACHE_DATE=not_a_date
+ARG CACHE_DATE=not_a_specified_date
 
 RUN Rscript -e "install.packages('ExomeDepth', repos = 'http://cran.us.r-project.org')"
 

From 2989118dc87cd584a5672e99e47db53f6f8fcd06 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Wiktor=20Ku=C5=9Bmirek?= <kusmirekwiktor@gmail.com>
Date: Mon, 30 Apr 2018 12:45:19 +0200
Subject: [PATCH 078/114] bugfix

---
 .../R/run_REFERENCE.SAMPLE.SET.SELECTOR.R                       | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/R/REFERENCE.SAMPLE.SET.SELECTOR/R/run_REFERENCE.SAMPLE.SET.SELECTOR.R b/R/REFERENCE.SAMPLE.SET.SELECTOR/R/run_REFERENCE.SAMPLE.SET.SELECTOR.R
index 08bec95..12e94c2 100644
--- a/R/REFERENCE.SAMPLE.SET.SELECTOR/R/run_REFERENCE.SAMPLE.SET.SELECTOR.R
+++ b/R/REFERENCE.SAMPLE.SET.SELECTOR/R/run_REFERENCE.SAMPLE.SET.SELECTOR.R
@@ -26,7 +26,7 @@ run_REFERENCE.SAMPLE.SET.SELECTOR <- function(select_method,
       #reference_samples_for_investigated_sample <- clamms_method(investigated_sample, Y, num_refs)$reference_samples
       #reference_samples[[i]] <- c(investigated_sample, reference_samples_for_investigated_sample)
     } else if(select_method == "random") {
-      reference_samples_for_investigated_sample <- random_method(investigated_sample, Y, num_refs, target_length)$reference_samples
+      reference_samples_for_investigated_sample <- random_method(investigated_sample, Y, num_refs)$reference_samples
       reference_samples[[i]] <- c(investigated_sample, reference_samples_for_investigated_sample)
     }
   }

From 7bb4e4b9c2885f3adf7f5800e2bda6427cb2f4a0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Wiktor=20Ku=C5=9Bmirek?= <kusmirekwiktor@gmail.com>
Date: Mon, 30 Apr 2018 13:48:29 +0200
Subject: [PATCH 079/114] select reference set based on CANOES method with cov
 threshold

---
 .../R/functions_REFERENCE.SAMPLE.SET.SELECTOR.R       | 11 +++++++++++
 .../R/run_REFERENCE.SAMPLE.SET.SELECTOR.R             |  5 +++++
 2 files changed, 16 insertions(+)

diff --git a/R/REFERENCE.SAMPLE.SET.SELECTOR/R/functions_REFERENCE.SAMPLE.SET.SELECTOR.R b/R/REFERENCE.SAMPLE.SET.SELECTOR/R/functions_REFERENCE.SAMPLE.SET.SELECTOR.R
index 7bc6826..417758b 100644
--- a/R/REFERENCE.SAMPLE.SET.SELECTOR/R/functions_REFERENCE.SAMPLE.SET.SELECTOR.R
+++ b/R/REFERENCE.SAMPLE.SET.SELECTOR/R/functions_REFERENCE.SAMPLE.SET.SELECTOR.R
@@ -12,6 +12,17 @@ canoes_method <- function(investigated_sample, Y, num_refs){
   return(list(reference_samples=reference_samples))
 }
 
+canoes_cov_thresh_method <- function(investigated_sample, Y, cov_thresh){
+  samples <- colnames(Y)
+  cov <- cor(Y[, samples], Y[, samples])
+  reference_samples <- setdiff(samples, investigated_sample)
+  covariances <- cov[investigated_sample, reference_samples]
+  num_refs <- sum(covariances > cov_thresh)
+  reference_samples <- names(sort(covariances, 
+          decreasing=T)[1:num_refs])
+  return(list(reference_samples=reference_samples))
+}
+
 exomedepth_method <- function(investigated_sample, Y, num_refs, target_length){
   library(ExomeDepth)
   samples <- colnames(Y)
diff --git a/R/REFERENCE.SAMPLE.SET.SELECTOR/R/run_REFERENCE.SAMPLE.SET.SELECTOR.R b/R/REFERENCE.SAMPLE.SET.SELECTOR/R/run_REFERENCE.SAMPLE.SET.SELECTOR.R
index 12e94c2..02254b3 100644
--- a/R/REFERENCE.SAMPLE.SET.SELECTOR/R/run_REFERENCE.SAMPLE.SET.SELECTOR.R
+++ b/R/REFERENCE.SAMPLE.SET.SELECTOR/R/run_REFERENCE.SAMPLE.SET.SELECTOR.R
@@ -1,5 +1,6 @@
 run_REFERENCE.SAMPLE.SET.SELECTOR <- function(select_method,
                                               num_refs,
+                                              cov_thresh,
                                               input_cov_table,
                                               input_bed,
                                               output_reference_file){
@@ -29,6 +30,10 @@ run_REFERENCE.SAMPLE.SET.SELECTOR <- function(select_method,
       reference_samples_for_investigated_sample <- random_method(investigated_sample, Y, num_refs)$reference_samples
       reference_samples[[i]] <- c(investigated_sample, reference_samples_for_investigated_sample)
     }
+    } else if(select_method == "canoes_cov_thresh") {
+      reference_samples_for_investigated_sample <- canoes_cov_thresh_method(investigated_sample, Y, cov_thresh)$reference_samples
+      reference_samples[[i]] <- c(investigated_sample, reference_samples_for_investigated_sample)
+    }
   }
   resultant_string <- ''
   for(i in 1:length(reference_samples)) {

From c54d4361ae0fc296ad31ee2ee2f5cdc09e0ab66e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Wiktor=20Ku=C5=9Bmirek?= <kusmirekwiktor@gmail.com>
Date: Mon, 30 Apr 2018 13:54:01 +0200
Subject: [PATCH 080/114] bugfix

---
 .../R/run_REFERENCE.SAMPLE.SET.SELECTOR.R                        | 1 -
 1 file changed, 1 deletion(-)

diff --git a/R/REFERENCE.SAMPLE.SET.SELECTOR/R/run_REFERENCE.SAMPLE.SET.SELECTOR.R b/R/REFERENCE.SAMPLE.SET.SELECTOR/R/run_REFERENCE.SAMPLE.SET.SELECTOR.R
index 02254b3..44ff820 100644
--- a/R/REFERENCE.SAMPLE.SET.SELECTOR/R/run_REFERENCE.SAMPLE.SET.SELECTOR.R
+++ b/R/REFERENCE.SAMPLE.SET.SELECTOR/R/run_REFERENCE.SAMPLE.SET.SELECTOR.R
@@ -29,7 +29,6 @@ run_REFERENCE.SAMPLE.SET.SELECTOR <- function(select_method,
     } else if(select_method == "random") {
       reference_samples_for_investigated_sample <- random_method(investigated_sample, Y, num_refs)$reference_samples
       reference_samples[[i]] <- c(investigated_sample, reference_samples_for_investigated_sample)
-    }
     } else if(select_method == "canoes_cov_thresh") {
       reference_samples_for_investigated_sample <- canoes_cov_thresh_method(investigated_sample, Y, cov_thresh)$reference_samples
       reference_samples[[i]] <- c(investigated_sample, reference_samples_for_investigated_sample)

From 7ed767f0a8a0c6d76256478e4506ad1011fa1476 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Wiktor=20Ku=C5=9Bmirek?= <kusmirekwiktor@gmail.com>
Date: Wed, 16 May 2018 12:05:59 +0200
Subject: [PATCH 081/114] investigated reference sample set in EXOMECOPYCOV
 package (first version)

---
 R/EXOMECOPYCOV/R/run_EXOMECOPYCOV.R | 36 ++++++++++++++++++-----------
 1 file changed, 23 insertions(+), 13 deletions(-)

diff --git a/R/EXOMECOPYCOV/R/run_EXOMECOPYCOV.R b/R/EXOMECOPYCOV/R/run_EXOMECOPYCOV.R
index 1578feb..735e972 100644
--- a/R/EXOMECOPYCOV/R/run_EXOMECOPYCOV.R
+++ b/R/EXOMECOPYCOV/R/run_EXOMECOPYCOV.R
@@ -9,7 +9,6 @@ run_EXOMECOPYCOV <- function(input_cov_table,
   con <- file(reference_sample_set_file, open='r')
   reference_sample_set <- readLines(con)
   Y <- read.csv(input_cov_table)
-  sample.names <- colnames(Y)
   targets <- read.delim(input_bed)
   rownames(Y) <- 1:nrow(Y)
   rownames(targets) <- 1:nrow(targets)
@@ -21,23 +20,34 @@ run_EXOMECOPYCOV <- function(input_cov_table,
   target <- GRanges(seqname = chr, IRanges(start = start(ref) + 1, end = end(ref)))
   gc <- getgc(chr, ref)
 
-  rdata <- RangedData(IRanges(start=start(ref), end=end(ref)), space=rep(chr,nrow(Y)), universe="hg19", gc=gc, gc.sq=gc^2)  
+  rdata_org <- RangedData(IRanges(start=start(ref), end=end(ref)), space=rep(chr,nrow(Y)), universe="hg19", gc=gc, gc.sq=gc^2)  
 
-  for(sample.name in sample.names) {
-    rdata[[sample.name]] <- Y[,sample.name]
-  }
+  for (i in 1:length(reference_sample_set)) {
+    if (reference_sample_set[[i]] == '') {
+      next()
+    }
+    samples <- unlist(strsplit(reference_sample_set[[i]], ','))
+    actual_sample <- samples[1]
+    reference_samples <- samples[-1]
+    samples <- sort(samples)
+    rdata <- rdata_org
+
+    for(sample.name in samples) {
+      rdata[[sample.name]] <- Y[,sample.name]
+    }
 
-  rdata[["bg"]] <- generateBackground(sample.names, rdata, median)
-  rdata[["log.bg"]] <- log(rdata$bg + .1) 
-  rdata[["width"]] <- width(ref)
+    rdata[["bg"]] <- generateBackground(samples, rdata, median)
+    rdata[["log.bg"]] <- log(rdata$bg + .1) 
+    rdata[["width"]] <- width(ref)
 
-  fit.list <- lapply(sample.names, function(sample.name) {
     lapply(seqlevels(target), function(seq.name) {
-      print(paste("Processing sample: ", sample.name, sep=""))
-      exomeCopy(rdata, sample.name, X.names = c("log.bg", "gc", "gc.sq", "width"), S = 0:4, d = 2)
+      print(paste("Processing sample: ", actual_sample, sep=""))
+      exomeCopy(rdata, actual_sample, X.names = c("log.bg", "gc", "gc.sq", "width"), S = 0:4, d = 2)
     })
-  })
-  compiled.segments <- compileCopyCountSegments(fit.list)
+    compiled.segments <- compileCopyCountSegments(fit.list)
+    print(compiled.segments)
+
+  }
   calls <- unify_calls_format(compiled.segments, chr)$calls
   write.csv(calls, output_calls_file, row.names=F)
 }

From 43ead0d6ac68c39051605022c7c2fc2bbc7410ba Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Wiktor=20Ku=C5=9Bmirek?= <kusmirekwiktor@gmail.com>
Date: Wed, 16 May 2018 12:14:28 +0200
Subject: [PATCH 082/114] reload package

---
 build.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/build.sh b/build.sh
index b7313d0..907919c 100755
--- a/build.sh
+++ b/build.sh
@@ -34,7 +34,7 @@ do
   diffTs=`echo "$(date +%s) - $(git log -n 1 --pretty=format:%at ${dir})" | bc`
   if [ $diffTs -lt $MAX_COMMIT_TS_DIFF ]; then
     cd $dir
-    if [[ ${image} == "biodatageeks/cnv-opt-reference-sample-set-selector" ]]; then
+    if [[ ${image} == "biodatageeks/cnv-opt-exomecopycov" ]]; then
       echo "Rebuild of ${image} image forced..."
       docker build --no-cache --build-arg CACHE_DATE=$(date +%Y-%m-%d:%H:%M:%S) -t $image:$version .
       docker build --no-cache --build-arg CACHE_DATE=$(date +%Y-%m-%d:%H:%M:%S) -t $image:latest .

From ca70a8135c2db398f26917095a61b48c02d72f1b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Wiktor=20Ku=C5=9Bmirek?= <kusmirekwiktor@gmail.com>
Date: Wed, 16 May 2018 12:16:38 +0200
Subject: [PATCH 083/114] manual forcing package reload

---
 Docker/cnv-opt-exomecopycov/Dockerfile | 2 --
 build.sh                               | 2 +-
 2 files changed, 1 insertion(+), 3 deletions(-)

diff --git a/Docker/cnv-opt-exomecopycov/Dockerfile b/Docker/cnv-opt-exomecopycov/Dockerfile
index 94bba90..c12fa2f 100644
--- a/Docker/cnv-opt-exomecopycov/Dockerfile
+++ b/Docker/cnv-opt-exomecopycov/Dockerfile
@@ -1,8 +1,6 @@
 FROM biodatageeks/cnv-opt-exomecopy
 MAINTAINER biodatageeks <team@biodatageeks.ii.pw.edu.pl>
 
-ARG CACHE_DATE=unknown
-
 RUN Rscript -e "source('https://bioconductor.org/biocLite.R');biocLite('GenomeInfoDb')"
 RUN Rscript -e "source('https://bioconductor.org/biocLite.R');biocLite('BSgenome.Hsapiens.UCSC.hg19')"
 
diff --git a/build.sh b/build.sh
index 907919c..b7313d0 100755
--- a/build.sh
+++ b/build.sh
@@ -34,7 +34,7 @@ do
   diffTs=`echo "$(date +%s) - $(git log -n 1 --pretty=format:%at ${dir})" | bc`
   if [ $diffTs -lt $MAX_COMMIT_TS_DIFF ]; then
     cd $dir
-    if [[ ${image} == "biodatageeks/cnv-opt-exomecopycov" ]]; then
+    if [[ ${image} == "biodatageeks/cnv-opt-reference-sample-set-selector" ]]; then
       echo "Rebuild of ${image} image forced..."
       docker build --no-cache --build-arg CACHE_DATE=$(date +%Y-%m-%d:%H:%M:%S) -t $image:$version .
       docker build --no-cache --build-arg CACHE_DATE=$(date +%Y-%m-%d:%H:%M:%S) -t $image:latest .

From 407109435f45a749e1d2ffb9796160d10adfe8a1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Wiktor=20Ku=C5=9Bmirek?= <kusmirekwiktor@gmail.com>
Date: Wed, 16 May 2018 12:32:00 +0200
Subject: [PATCH 084/114] DBI package greater or equal to 0.8

---
 R/CANOESCOV/DESCRIPTION           | 2 +-
 R/CNVCALLER.EVALUATOR/DESCRIPTION | 2 +-
 R/CNVCALLER.RUNNER/DESCRIPTION    | 2 +-
 R/CODEXCOV/DESCRIPTION            | 2 +-
 R/EXOMECOPYCOV/DESCRIPTION        | 2 +-
 R/EXOMEDEPTHCOV/DESCRIPTION       | 2 +-
 6 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/R/CANOESCOV/DESCRIPTION b/R/CANOESCOV/DESCRIPTION
index 0fa3115..27c73ca 100644
--- a/R/CANOESCOV/DESCRIPTION
+++ b/R/CANOESCOV/DESCRIPTION
@@ -12,7 +12,7 @@ Description: An extended implementation of the CANOES package in R. It extends
 Depends:
     R (>= 3.2.3),
     devtools (>= 1.13.2),
-    DBI (== 0.8),
+    DBI (>= 0.8),
     optparse (== 1.4.4),
     IRanges (>= 2.0.0),
     plyr (>= 1.8.4),
diff --git a/R/CNVCALLER.EVALUATOR/DESCRIPTION b/R/CNVCALLER.EVALUATOR/DESCRIPTION
index f16487e..5dda4e9 100644
--- a/R/CNVCALLER.EVALUATOR/DESCRIPTION
+++ b/R/CNVCALLER.EVALUATOR/DESCRIPTION
@@ -9,7 +9,7 @@ Description: A package to evaluate CNV callers results.
 Depends:
     R (>= 3.2.3),
     devtools (>= 1.13.2),
-    DBI (== 0.8),
+    DBI (>= 0.8),
     optparse (== 1.4.4)
 License: GPL-3
 Encoding: UTF-8
diff --git a/R/CNVCALLER.RUNNER/DESCRIPTION b/R/CNVCALLER.RUNNER/DESCRIPTION
index aa62f1f..1f68fdc 100644
--- a/R/CNVCALLER.RUNNER/DESCRIPTION
+++ b/R/CNVCALLER.RUNNER/DESCRIPTION
@@ -13,7 +13,7 @@ Depends:
     EXOMEDEPTHCOV (>= 0.0.1),
     CANOESCOV (>= 0.0.1),
     devtools (>= 1.13.2),
-    DBI (== 0.8),
+    DBI (>= 0.8),
     optparse (== 1.4.4)
 License: GPL-3
 Encoding: UTF-8
diff --git a/R/CODEXCOV/DESCRIPTION b/R/CODEXCOV/DESCRIPTION
index f6516a3..aacb351 100755
--- a/R/CODEXCOV/DESCRIPTION
+++ b/R/CODEXCOV/DESCRIPTION
@@ -12,7 +12,7 @@ Description: An extended implementation of the CODEX package in R. It extends
 Depends:
     R (>= 3.2.3),
     devtools (>= 1.13.2),
-    DBI (== 0.8),
+    DBI (>= 0.8),
     optparse (== 1.4.4),
     CODEX (>= 1.8.0)
 License: GPL-3
diff --git a/R/EXOMECOPYCOV/DESCRIPTION b/R/EXOMECOPYCOV/DESCRIPTION
index c33386d..1c27c1d 100644
--- a/R/EXOMECOPYCOV/DESCRIPTION
+++ b/R/EXOMECOPYCOV/DESCRIPTION
@@ -12,7 +12,7 @@ Description: An extended implementation of the exomeCopy package in R. It extend
 Depends:
     R (>= 3.2.3),
     devtools (>= 1.13.2),
-    DBI (== 0.8),
+    DBI (>= 0.8),
     optparse (== 1.4.4),
     IRanges (>= 2.0.0),
     exomeCopy (== 1.22)
diff --git a/R/EXOMEDEPTHCOV/DESCRIPTION b/R/EXOMEDEPTHCOV/DESCRIPTION
index 8305596..4627623 100644
--- a/R/EXOMEDEPTHCOV/DESCRIPTION
+++ b/R/EXOMEDEPTHCOV/DESCRIPTION
@@ -12,7 +12,7 @@ Description: An extended implementation of the ExomeDepth package in R. It exten
 Depends:
     R (>= 3.2.3),
     devtools (>= 1.13.2),
-    DBI (== 0.8),
+    DBI (>= 0.8),
     optparse (== 1.4.4),
     IRanges (>= 2.0.0),
     ExomeDepth (>= 1.1.10),

From 6bcb7f7c39a175603419b5c1ca1e9e6e570d4031 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Wiktor=20Ku=C5=9Bmirek?= <kusmirekwiktor@gmail.com>
Date: Wed, 16 May 2018 12:36:24 +0200
Subject: [PATCH 085/114] force to reload package

---
 Docker/cnv-opt-exomecopycov/Dockerfile | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/Docker/cnv-opt-exomecopycov/Dockerfile b/Docker/cnv-opt-exomecopycov/Dockerfile
index c12fa2f..8e189ed 100644
--- a/Docker/cnv-opt-exomecopycov/Dockerfile
+++ b/Docker/cnv-opt-exomecopycov/Dockerfile
@@ -4,5 +4,7 @@ MAINTAINER biodatageeks <team@biodatageeks.ii.pw.edu.pl>
 RUN Rscript -e "source('https://bioconductor.org/biocLite.R');biocLite('GenomeInfoDb')"
 RUN Rscript -e "source('https://bioconductor.org/biocLite.R');biocLite('BSgenome.Hsapiens.UCSC.hg19')"
 
+RUN pwd
+
 RUN Rscript -e "install.packages('EXOMECOPYCOV', repos = 'http://zsibio.ii.pw.edu.pl/nexus/repository/r-all')"
 

From fc5fbd2f6c6e1782ff6973a3864e1477ca584dcd Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Wiktor=20Ku=C5=9Bmirek?= <kusmirekwiktor@gmail.com>
Date: Wed, 16 May 2018 12:41:44 +0200
Subject: [PATCH 086/114] bugfix

---
 Docker/cnv-opt-exomecopycov/Dockerfile | 2 --
 R/EXOMECOPYCOV/R/run_EXOMECOPYCOV.R    | 2 +-
 2 files changed, 1 insertion(+), 3 deletions(-)

diff --git a/Docker/cnv-opt-exomecopycov/Dockerfile b/Docker/cnv-opt-exomecopycov/Dockerfile
index 8e189ed..c12fa2f 100644
--- a/Docker/cnv-opt-exomecopycov/Dockerfile
+++ b/Docker/cnv-opt-exomecopycov/Dockerfile
@@ -4,7 +4,5 @@ MAINTAINER biodatageeks <team@biodatageeks.ii.pw.edu.pl>
 RUN Rscript -e "source('https://bioconductor.org/biocLite.R');biocLite('GenomeInfoDb')"
 RUN Rscript -e "source('https://bioconductor.org/biocLite.R');biocLite('BSgenome.Hsapiens.UCSC.hg19')"
 
-RUN pwd
-
 RUN Rscript -e "install.packages('EXOMECOPYCOV', repos = 'http://zsibio.ii.pw.edu.pl/nexus/repository/r-all')"
 
diff --git a/R/EXOMECOPYCOV/R/run_EXOMECOPYCOV.R b/R/EXOMECOPYCOV/R/run_EXOMECOPYCOV.R
index 735e972..0ff48d0 100644
--- a/R/EXOMECOPYCOV/R/run_EXOMECOPYCOV.R
+++ b/R/EXOMECOPYCOV/R/run_EXOMECOPYCOV.R
@@ -40,7 +40,7 @@ run_EXOMECOPYCOV <- function(input_cov_table,
     rdata[["log.bg"]] <- log(rdata$bg + .1) 
     rdata[["width"]] <- width(ref)
 
-    lapply(seqlevels(target), function(seq.name) {
+    fit.list <- lapply(seqlevels(target), function(seq.name) {
       print(paste("Processing sample: ", actual_sample, sep=""))
       exomeCopy(rdata, actual_sample, X.names = c("log.bg", "gc", "gc.sq", "width"), S = 0:4, d = 2)
     })

From 0662e415f3da46a84e69c6151849f46c1fd53208 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Wiktor=20Ku=C5=9Bmirek?= <kusmirekwiktor@gmail.com>
Date: Wed, 16 May 2018 12:45:22 +0200
Subject: [PATCH 087/114] reload package in docker container

---
 Docker/cnv-opt-exomecopycov/Dockerfile | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/Docker/cnv-opt-exomecopycov/Dockerfile b/Docker/cnv-opt-exomecopycov/Dockerfile
index c12fa2f..8aa0833 100644
--- a/Docker/cnv-opt-exomecopycov/Dockerfile
+++ b/Docker/cnv-opt-exomecopycov/Dockerfile
@@ -4,5 +4,8 @@ MAINTAINER biodatageeks <team@biodatageeks.ii.pw.edu.pl>
 RUN Rscript -e "source('https://bioconductor.org/biocLite.R');biocLite('GenomeInfoDb')"
 RUN Rscript -e "source('https://bioconductor.org/biocLite.R');biocLite('BSgenome.Hsapiens.UCSC.hg19')"
 
+RUN pwd
+RUN pwd
+
 RUN Rscript -e "install.packages('EXOMECOPYCOV', repos = 'http://zsibio.ii.pw.edu.pl/nexus/repository/r-all')"
 

From bf7ac7c5927eee27c72426c33b24569f26f42d0a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Wiktor=20Ku=C5=9Bmirek?= <kusmirekwiktor@gmail.com>
Date: Wed, 16 May 2018 12:51:29 +0200
Subject: [PATCH 088/114] bugfix

---
 R/EXOMECOPYCOV/R/run_EXOMECOPYCOV.R | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/R/EXOMECOPYCOV/R/run_EXOMECOPYCOV.R b/R/EXOMECOPYCOV/R/run_EXOMECOPYCOV.R
index 0ff48d0..dd20bbb 100644
--- a/R/EXOMECOPYCOV/R/run_EXOMECOPYCOV.R
+++ b/R/EXOMECOPYCOV/R/run_EXOMECOPYCOV.R
@@ -40,9 +40,12 @@ run_EXOMECOPYCOV <- function(input_cov_table,
     rdata[["log.bg"]] <- log(rdata$bg + .1) 
     rdata[["width"]] <- width(ref)
 
-    fit.list <- lapply(seqlevels(target), function(seq.name) {
-      print(paste("Processing sample: ", actual_sample, sep=""))
-      exomeCopy(rdata, actual_sample, X.names = c("log.bg", "gc", "gc.sq", "width"), S = 0:4, d = 2)
+    sample.name <- c(actual_sample)
+    fit.list <- lapply(samples, function(sample.name) {
+      lapply(seqlevels(target), function(seq.name) {
+        print(paste("Processing sample: ", sample.name, sep=""))
+        exomeCopy(rdata, sample.name, X.names = c("log.bg", "gc", "gc.sq", "width"), S = 0:4, d = 2)
+      })
     })
     compiled.segments <- compileCopyCountSegments(fit.list)
     print(compiled.segments)

From 6c771b4264a3de7bd474b0723364558fa4f8e3f2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Wiktor=20Ku=C5=9Bmirek?= <kusmirekwiktor@gmail.com>
Date: Wed, 16 May 2018 12:53:58 +0200
Subject: [PATCH 089/114] reload package

---
 Docker/cnv-opt-exomecopycov/Dockerfile | 1 +
 1 file changed, 1 insertion(+)

diff --git a/Docker/cnv-opt-exomecopycov/Dockerfile b/Docker/cnv-opt-exomecopycov/Dockerfile
index 8aa0833..4391d8f 100644
--- a/Docker/cnv-opt-exomecopycov/Dockerfile
+++ b/Docker/cnv-opt-exomecopycov/Dockerfile
@@ -4,6 +4,7 @@ MAINTAINER biodatageeks <team@biodatageeks.ii.pw.edu.pl>
 RUN Rscript -e "source('https://bioconductor.org/biocLite.R');biocLite('GenomeInfoDb')"
 RUN Rscript -e "source('https://bioconductor.org/biocLite.R');biocLite('BSgenome.Hsapiens.UCSC.hg19')"
 
+RUN pwd
 RUN pwd
 RUN pwd
 

From 0b0d37a916828a91bfed2c39b10651c42badad4a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Wiktor=20Ku=C5=9Bmirek?= <kusmirekwiktor@gmail.com>
Date: Wed, 16 May 2018 12:59:41 +0200
Subject: [PATCH 090/114] bugfix one more time

---
 Docker/cnv-opt-exomecopycov/Dockerfile | 1 +
 R/EXOMECOPYCOV/R/run_EXOMECOPYCOV.R    | 2 +-
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/Docker/cnv-opt-exomecopycov/Dockerfile b/Docker/cnv-opt-exomecopycov/Dockerfile
index 4391d8f..611a814 100644
--- a/Docker/cnv-opt-exomecopycov/Dockerfile
+++ b/Docker/cnv-opt-exomecopycov/Dockerfile
@@ -7,6 +7,7 @@ RUN Rscript -e "source('https://bioconductor.org/biocLite.R');biocLite('BSgenome
 RUN pwd
 RUN pwd
 RUN pwd
+RUN pwd
 
 RUN Rscript -e "install.packages('EXOMECOPYCOV', repos = 'http://zsibio.ii.pw.edu.pl/nexus/repository/r-all')"
 
diff --git a/R/EXOMECOPYCOV/R/run_EXOMECOPYCOV.R b/R/EXOMECOPYCOV/R/run_EXOMECOPYCOV.R
index dd20bbb..0fea56d 100644
--- a/R/EXOMECOPYCOV/R/run_EXOMECOPYCOV.R
+++ b/R/EXOMECOPYCOV/R/run_EXOMECOPYCOV.R
@@ -40,7 +40,7 @@ run_EXOMECOPYCOV <- function(input_cov_table,
     rdata[["log.bg"]] <- log(rdata$bg + .1) 
     rdata[["width"]] <- width(ref)
 
-    sample.name <- c(actual_sample)
+    samples <- c(actual_sample)
     fit.list <- lapply(samples, function(sample.name) {
       lapply(seqlevels(target), function(seq.name) {
         print(paste("Processing sample: ", sample.name, sep=""))

From a88e5dbfbb2481a22d365b9d00024a7c3f573032 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Wiktor=20Ku=C5=9Bmirek?= <kusmirekwiktor@gmail.com>
Date: Wed, 16 May 2018 13:12:08 +0200
Subject: [PATCH 091/114] merging results

---
 Docker/cnv-opt-exomecopycov/Dockerfile |  1 +
 R/EXOMECOPYCOV/R/run_EXOMECOPYCOV.R    | 11 +++++++----
 2 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/Docker/cnv-opt-exomecopycov/Dockerfile b/Docker/cnv-opt-exomecopycov/Dockerfile
index 611a814..a89564d 100644
--- a/Docker/cnv-opt-exomecopycov/Dockerfile
+++ b/Docker/cnv-opt-exomecopycov/Dockerfile
@@ -8,6 +8,7 @@ RUN pwd
 RUN pwd
 RUN pwd
 RUN pwd
+RUN pwd
 
 RUN Rscript -e "install.packages('EXOMECOPYCOV', repos = 'http://zsibio.ii.pw.edu.pl/nexus/repository/r-all')"
 
diff --git a/R/EXOMECOPYCOV/R/run_EXOMECOPYCOV.R b/R/EXOMECOPYCOV/R/run_EXOMECOPYCOV.R
index 0fea56d..af45307 100644
--- a/R/EXOMECOPYCOV/R/run_EXOMECOPYCOV.R
+++ b/R/EXOMECOPYCOV/R/run_EXOMECOPYCOV.R
@@ -20,7 +20,8 @@ run_EXOMECOPYCOV <- function(input_cov_table,
   target <- GRanges(seqname = chr, IRanges(start = start(ref) + 1, end = end(ref)))
   gc <- getgc(chr, ref)
 
-  rdata_org <- RangedData(IRanges(start=start(ref), end=end(ref)), space=rep(chr,nrow(Y)), universe="hg19", gc=gc, gc.sq=gc^2)  
+  rdata_org <- RangedData(IRanges(start=start(ref), end=end(ref)), space=rep(chr,nrow(Y)), universe="hg19", gc=gc, gc.sq=gc^2) 
+  finalcall <- matrix(nrow=0, ncol=13)
 
   for (i in 1:length(reference_sample_set)) {
     if (reference_sample_set[[i]] == '') {
@@ -48,9 +49,11 @@ run_EXOMECOPYCOV <- function(input_cov_table,
       })
     })
     compiled.segments <- compileCopyCountSegments(fit.list)
-    print(compiled.segments)
+    finalcallIt <- unify_calls_format(compiled.segments, chr)$calls
+    if (nrow(finalcall)==0){finalcall <- matrix(nrow=0, ncol=ncol(finalcallIt))}
+    finalcall <- rbind(finalcall, finalcallIt)
+    print(finalcallIt)
 
   }
-  calls <- unify_calls_format(compiled.segments, chr)$calls
-  write.csv(calls, output_calls_file, row.names=F)
+  write.csv(finalcall, output_calls_file, row.names=F)
 }

From 5d62b84b6c29d3f67880c24fb9dfdc98d338f013 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Wiktor=20Ku=C5=9Bmirek?= <kusmirekwiktor@gmail.com>
Date: Wed, 16 May 2018 13:19:49 +0200
Subject: [PATCH 092/114] clean up Dockerfile

---
 Docker/cnv-opt-exomecopycov/Dockerfile | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/Docker/cnv-opt-exomecopycov/Dockerfile b/Docker/cnv-opt-exomecopycov/Dockerfile
index a89564d..c12fa2f 100644
--- a/Docker/cnv-opt-exomecopycov/Dockerfile
+++ b/Docker/cnv-opt-exomecopycov/Dockerfile
@@ -4,11 +4,5 @@ MAINTAINER biodatageeks <team@biodatageeks.ii.pw.edu.pl>
 RUN Rscript -e "source('https://bioconductor.org/biocLite.R');biocLite('GenomeInfoDb')"
 RUN Rscript -e "source('https://bioconductor.org/biocLite.R');biocLite('BSgenome.Hsapiens.UCSC.hg19')"
 
-RUN pwd
-RUN pwd
-RUN pwd
-RUN pwd
-RUN pwd
-
 RUN Rscript -e "install.packages('EXOMECOPYCOV', repos = 'http://zsibio.ii.pw.edu.pl/nexus/repository/r-all')"
 

From 53f94dc7d59292dcfc9ea713309d03a3ddc23202 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Wiktor=20Ku=C5=9Bmirek?= <kusmirekwiktor@gmail.com>
Date: Mon, 21 May 2018 16:49:15 +0200
Subject: [PATCH 093/114] kmeans in reference sample set selector - first
 version

---
 .../functions_REFERENCE.SAMPLE.SET.SELECTOR.R | 30 +++++++++++++++++++
 .../R/run_REFERENCE.SAMPLE.SET.SELECTOR.R     |  3 ++
 2 files changed, 33 insertions(+)

diff --git a/R/REFERENCE.SAMPLE.SET.SELECTOR/R/functions_REFERENCE.SAMPLE.SET.SELECTOR.R b/R/REFERENCE.SAMPLE.SET.SELECTOR/R/functions_REFERENCE.SAMPLE.SET.SELECTOR.R
index 417758b..ef5d0d6 100644
--- a/R/REFERENCE.SAMPLE.SET.SELECTOR/R/functions_REFERENCE.SAMPLE.SET.SELECTOR.R
+++ b/R/REFERENCE.SAMPLE.SET.SELECTOR/R/functions_REFERENCE.SAMPLE.SET.SELECTOR.R
@@ -48,3 +48,33 @@ random_method <- function(investigated_sample, Y, num_refs){
   reference_samples <- reference_samples[sample(1:length(reference_samples), num_refs, replace=F)]
   return(list(reference_samples=reference_samples))
 }
+
+kmeans_method <- function(investigated_sample, Y, number_of_clusters){
+  samples <- colnames(Y)
+  cov <- cor(Y[, samples], Y[, samples])
+  d <- cov
+  for(i in 1:nrow(d)) {
+    d[i,] <- cov[samples[i], samples]
+  }
+  d <- 1-d
+  c <- c()
+  for(i in 1:ncol(d)-1) {
+    c <- c(c, d[(i+1):nrow(d),i])
+  }
+  d <- dist(d)
+  for(i in 1:length(d)) {
+    d[i] <- c[i]
+  }
+  km1 <- kmeans(d, number_of_clusters, nstart=100)
+  cluster_id <- km1$cluster[investigated_sample]
+  reference_samples <- c()
+  list_index <- 1
+  for(i in km1$cluster) {
+    if(i == cluster_id) {
+      reference_samples <- c(reference_samples, sampname_qc[list_index])
+    }
+    list_index <- list_index + 1
+  }
+  reference_samples <- setdiff(reference_samples, investigated_sample)
+  return(list(reference_samples=reference_samples))
+}
diff --git a/R/REFERENCE.SAMPLE.SET.SELECTOR/R/run_REFERENCE.SAMPLE.SET.SELECTOR.R b/R/REFERENCE.SAMPLE.SET.SELECTOR/R/run_REFERENCE.SAMPLE.SET.SELECTOR.R
index 44ff820..4677b37 100644
--- a/R/REFERENCE.SAMPLE.SET.SELECTOR/R/run_REFERENCE.SAMPLE.SET.SELECTOR.R
+++ b/R/REFERENCE.SAMPLE.SET.SELECTOR/R/run_REFERENCE.SAMPLE.SET.SELECTOR.R
@@ -32,6 +32,9 @@ run_REFERENCE.SAMPLE.SET.SELECTOR <- function(select_method,
     } else if(select_method == "canoes_cov_thresh") {
       reference_samples_for_investigated_sample <- canoes_cov_thresh_method(investigated_sample, Y, cov_thresh)$reference_samples
       reference_samples[[i]] <- c(investigated_sample, reference_samples_for_investigated_sample)
+    } else if(select_method == "kmeans") {
+      reference_samples_for_investigated_sample <- kmeans_method(investigated_sample, Y, num_refs)$reference_samples
+      reference_samples[[i]] <- c(investigated_sample, reference_samples_for_investigated_sample)
     }
   }
   resultant_string <- ''

From 5878140830bb2fbd43f323a86070de82335330be Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Wiktor=20Ku=C5=9Bmirek?= <kusmirekwiktor@gmail.com>
Date: Mon, 21 May 2018 16:51:30 +0200
Subject: [PATCH 094/114] force to reload Docker image

---
 Docker/cnv-opt-reference-sample-set-selector/Dockerfile | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/Docker/cnv-opt-reference-sample-set-selector/Dockerfile b/Docker/cnv-opt-reference-sample-set-selector/Dockerfile
index d854276..7775656 100644
--- a/Docker/cnv-opt-reference-sample-set-selector/Dockerfile
+++ b/Docker/cnv-opt-reference-sample-set-selector/Dockerfile
@@ -5,4 +5,6 @@ ARG CACHE_DATE=not_a_specified_date
 
 RUN Rscript -e "install.packages('ExomeDepth', repos = 'http://cran.us.r-project.org')"
 
+RUN pwd
+
 RUN Rscript -e "install.packages('REFERENCE.SAMPLE.SET.SELECTOR', repos = 'http://zsibio.ii.pw.edu.pl/nexus/repository/r-all')"

From ef8dace968b01f61d60f11cffefbd279c78246af Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Wiktor=20Ku=C5=9Bmirek?= <kusmirekwiktor@gmail.com>
Date: Mon, 21 May 2018 17:11:03 +0200
Subject: [PATCH 095/114] bugfix

---
 Docker/cnv-opt-reference-sample-set-selector/Dockerfile         | 1 +
 .../R/functions_REFERENCE.SAMPLE.SET.SELECTOR.R                 | 2 +-
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/Docker/cnv-opt-reference-sample-set-selector/Dockerfile b/Docker/cnv-opt-reference-sample-set-selector/Dockerfile
index 7775656..8b6f712 100644
--- a/Docker/cnv-opt-reference-sample-set-selector/Dockerfile
+++ b/Docker/cnv-opt-reference-sample-set-selector/Dockerfile
@@ -5,6 +5,7 @@ ARG CACHE_DATE=not_a_specified_date
 
 RUN Rscript -e "install.packages('ExomeDepth', repos = 'http://cran.us.r-project.org')"
 
+RUN pwd
 RUN pwd
 
 RUN Rscript -e "install.packages('REFERENCE.SAMPLE.SET.SELECTOR', repos = 'http://zsibio.ii.pw.edu.pl/nexus/repository/r-all')"
diff --git a/R/REFERENCE.SAMPLE.SET.SELECTOR/R/functions_REFERENCE.SAMPLE.SET.SELECTOR.R b/R/REFERENCE.SAMPLE.SET.SELECTOR/R/functions_REFERENCE.SAMPLE.SET.SELECTOR.R
index ef5d0d6..d5150ed 100644
--- a/R/REFERENCE.SAMPLE.SET.SELECTOR/R/functions_REFERENCE.SAMPLE.SET.SELECTOR.R
+++ b/R/REFERENCE.SAMPLE.SET.SELECTOR/R/functions_REFERENCE.SAMPLE.SET.SELECTOR.R
@@ -71,7 +71,7 @@ kmeans_method <- function(investigated_sample, Y, number_of_clusters){
   list_index <- 1
   for(i in km1$cluster) {
     if(i == cluster_id) {
-      reference_samples <- c(reference_samples, sampname_qc[list_index])
+      reference_samples <- c(reference_samples, samples[list_index])
     }
     list_index <- list_index + 1
   }

From ab4c0e11457ab0e280f75a75e3e464c8389f8e7a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Wiktor=20Ku=C5=9Bmirek?= <kusmirekwiktor@gmail.com>
Date: Mon, 21 May 2018 17:22:41 +0200
Subject: [PATCH 096/114] speed up kmeans

---
 .../cnv-opt-reference-sample-set-selector/Dockerfile  |  1 +
 .../R/functions_REFERENCE.SAMPLE.SET.SELECTOR.R       | 11 ++++++++---
 .../R/run_REFERENCE.SAMPLE.SET.SELECTOR.R             |  5 ++++-
 3 files changed, 13 insertions(+), 4 deletions(-)

diff --git a/Docker/cnv-opt-reference-sample-set-selector/Dockerfile b/Docker/cnv-opt-reference-sample-set-selector/Dockerfile
index 8b6f712..2945c1d 100644
--- a/Docker/cnv-opt-reference-sample-set-selector/Dockerfile
+++ b/Docker/cnv-opt-reference-sample-set-selector/Dockerfile
@@ -5,6 +5,7 @@ ARG CACHE_DATE=not_a_specified_date
 
 RUN Rscript -e "install.packages('ExomeDepth', repos = 'http://cran.us.r-project.org')"
 
+RUN pwd
 RUN pwd
 RUN pwd
 
diff --git a/R/REFERENCE.SAMPLE.SET.SELECTOR/R/functions_REFERENCE.SAMPLE.SET.SELECTOR.R b/R/REFERENCE.SAMPLE.SET.SELECTOR/R/functions_REFERENCE.SAMPLE.SET.SELECTOR.R
index d5150ed..6ef310b 100644
--- a/R/REFERENCE.SAMPLE.SET.SELECTOR/R/functions_REFERENCE.SAMPLE.SET.SELECTOR.R
+++ b/R/REFERENCE.SAMPLE.SET.SELECTOR/R/functions_REFERENCE.SAMPLE.SET.SELECTOR.R
@@ -49,7 +49,7 @@ random_method <- function(investigated_sample, Y, num_refs){
   return(list(reference_samples=reference_samples))
 }
 
-kmeans_method <- function(investigated_sample, Y, number_of_clusters){
+kmeans_select_groups <- function(Y, number_of_clusters){
   samples <- colnames(Y)
   cov <- cor(Y[, samples], Y[, samples])
   d <- cov
@@ -66,10 +66,15 @@ kmeans_method <- function(investigated_sample, Y, number_of_clusters){
     d[i] <- c[i]
   }
   km1 <- kmeans(d, number_of_clusters, nstart=100)
-  cluster_id <- km1$cluster[investigated_sample]
+  return(list(clusters=km1$cluster))
+}
+
+kmeans_method <- function(investigated_sample, Y, kmeans_clusters){
+  samples <- colnames(Y)
+  cluster_id <- kmeans_clusters[investigated_sample]
   reference_samples <- c()
   list_index <- 1
-  for(i in km1$cluster) {
+  for(i in kmeans_clusters) {
     if(i == cluster_id) {
       reference_samples <- c(reference_samples, samples[list_index])
     }
diff --git a/R/REFERENCE.SAMPLE.SET.SELECTOR/R/run_REFERENCE.SAMPLE.SET.SELECTOR.R b/R/REFERENCE.SAMPLE.SET.SELECTOR/R/run_REFERENCE.SAMPLE.SET.SELECTOR.R
index 4677b37..9cc24f2 100644
--- a/R/REFERENCE.SAMPLE.SET.SELECTOR/R/run_REFERENCE.SAMPLE.SET.SELECTOR.R
+++ b/R/REFERENCE.SAMPLE.SET.SELECTOR/R/run_REFERENCE.SAMPLE.SET.SELECTOR.R
@@ -10,6 +10,9 @@ run_REFERENCE.SAMPLE.SET.SELECTOR <- function(select_method,
   targets <- read.delim(input_bed)
   target_length <- targets[,"st_bp"] - targets[,"ed_bp"]
   reference_samples <- list()
+  if() {
+    kmeans_clusters <- kmeans_select_groups(Y, num_refs)
+  }
 
   for(i in 1:length(sampname)) {
     investigated_sample <- as.character(sampname[i])
@@ -33,7 +36,7 @@ run_REFERENCE.SAMPLE.SET.SELECTOR <- function(select_method,
       reference_samples_for_investigated_sample <- canoes_cov_thresh_method(investigated_sample, Y, cov_thresh)$reference_samples
       reference_samples[[i]] <- c(investigated_sample, reference_samples_for_investigated_sample)
     } else if(select_method == "kmeans") {
-      reference_samples_for_investigated_sample <- kmeans_method(investigated_sample, Y, num_refs)$reference_samples
+      reference_samples_for_investigated_sample <- kmeans_method(investigated_sample, Y, kmeans_clusters)$reference_samples
       reference_samples[[i]] <- c(investigated_sample, reference_samples_for_investigated_sample)
     }
   }

From 769805ac2ee48497624fd9c7d358559a98560742 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Wiktor=20Ku=C5=9Bmirek?= <kusmirekwiktor@gmail.com>
Date: Mon, 21 May 2018 17:29:51 +0200
Subject: [PATCH 097/114] bugfix

---
 Docker/cnv-opt-reference-sample-set-selector/Dockerfile         | 1 +
 .../R/run_REFERENCE.SAMPLE.SET.SELECTOR.R                       | 2 +-
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/Docker/cnv-opt-reference-sample-set-selector/Dockerfile b/Docker/cnv-opt-reference-sample-set-selector/Dockerfile
index 2945c1d..92b7bcc 100644
--- a/Docker/cnv-opt-reference-sample-set-selector/Dockerfile
+++ b/Docker/cnv-opt-reference-sample-set-selector/Dockerfile
@@ -8,5 +8,6 @@ RUN Rscript -e "install.packages('ExomeDepth', repos = 'http://cran.us.r-project
 RUN pwd
 RUN pwd
 RUN pwd
+RUN pwd
 
 RUN Rscript -e "install.packages('REFERENCE.SAMPLE.SET.SELECTOR', repos = 'http://zsibio.ii.pw.edu.pl/nexus/repository/r-all')"
diff --git a/R/REFERENCE.SAMPLE.SET.SELECTOR/R/run_REFERENCE.SAMPLE.SET.SELECTOR.R b/R/REFERENCE.SAMPLE.SET.SELECTOR/R/run_REFERENCE.SAMPLE.SET.SELECTOR.R
index 9cc24f2..12053e5 100644
--- a/R/REFERENCE.SAMPLE.SET.SELECTOR/R/run_REFERENCE.SAMPLE.SET.SELECTOR.R
+++ b/R/REFERENCE.SAMPLE.SET.SELECTOR/R/run_REFERENCE.SAMPLE.SET.SELECTOR.R
@@ -10,7 +10,7 @@ run_REFERENCE.SAMPLE.SET.SELECTOR <- function(select_method,
   targets <- read.delim(input_bed)
   target_length <- targets[,"st_bp"] - targets[,"ed_bp"]
   reference_samples <- list()
-  if() {
+  if(select_method == "kmeans") {
     kmeans_clusters <- kmeans_select_groups(Y, num_refs)
   }
 

From 57b9604c6f495d73971832933763d59dbf03df3f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Wiktor=20Ku=C5=9Bmirek?= <kusmirekwiktor@gmail.com>
Date: Mon, 21 May 2018 17:37:37 +0200
Subject: [PATCH 098/114] changes in order to find a bug

---
 .../R/functions_REFERENCE.SAMPLE.SET.SELECTOR.R                | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/R/REFERENCE.SAMPLE.SET.SELECTOR/R/functions_REFERENCE.SAMPLE.SET.SELECTOR.R b/R/REFERENCE.SAMPLE.SET.SELECTOR/R/functions_REFERENCE.SAMPLE.SET.SELECTOR.R
index 6ef310b..995acbf 100644
--- a/R/REFERENCE.SAMPLE.SET.SELECTOR/R/functions_REFERENCE.SAMPLE.SET.SELECTOR.R
+++ b/R/REFERENCE.SAMPLE.SET.SELECTOR/R/functions_REFERENCE.SAMPLE.SET.SELECTOR.R
@@ -70,8 +70,11 @@ kmeans_select_groups <- function(Y, number_of_clusters){
 }
 
 kmeans_method <- function(investigated_sample, Y, kmeans_clusters){
+  print(kmeans_clusters)
   samples <- colnames(Y)
+  print(samples)
   cluster_id <- kmeans_clusters[investigated_sample]
+  print(cluster_id)
   reference_samples <- c()
   list_index <- 1
   for(i in kmeans_clusters) {

From 85c550ef713f9058721c80969b7b9684030b031c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Wiktor=20Ku=C5=9Bmirek?= <kusmirekwiktor@gmail.com>
Date: Mon, 21 May 2018 17:52:40 +0200
Subject: [PATCH 099/114] bugfix

---
 .../R/functions_REFERENCE.SAMPLE.SET.SELECTOR.R             | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/R/REFERENCE.SAMPLE.SET.SELECTOR/R/functions_REFERENCE.SAMPLE.SET.SELECTOR.R b/R/REFERENCE.SAMPLE.SET.SELECTOR/R/functions_REFERENCE.SAMPLE.SET.SELECTOR.R
index 995acbf..e9b7379 100644
--- a/R/REFERENCE.SAMPLE.SET.SELECTOR/R/functions_REFERENCE.SAMPLE.SET.SELECTOR.R
+++ b/R/REFERENCE.SAMPLE.SET.SELECTOR/R/functions_REFERENCE.SAMPLE.SET.SELECTOR.R
@@ -66,18 +66,18 @@ kmeans_select_groups <- function(Y, number_of_clusters){
     d[i] <- c[i]
   }
   km1 <- kmeans(d, number_of_clusters, nstart=100)
-  return(list(clusters=km1$cluster))
+  return(list(clusters=km1))
 }
 
 kmeans_method <- function(investigated_sample, Y, kmeans_clusters){
   print(kmeans_clusters)
   samples <- colnames(Y)
   print(samples)
-  cluster_id <- kmeans_clusters[investigated_sample]
+  cluster_id <- kmeans_clusters$cluster[investigated_sample]
   print(cluster_id)
   reference_samples <- c()
   list_index <- 1
-  for(i in kmeans_clusters) {
+  for(i in kmeans_clusters$cluster) {
     if(i == cluster_id) {
       reference_samples <- c(reference_samples, samples[list_index])
     }

From bc2a00201b039be19f4966eedf7d99c165cb5fda Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Wiktor=20Ku=C5=9Bmirek?= <kusmirekwiktor@gmail.com>
Date: Mon, 21 May 2018 18:15:09 +0200
Subject: [PATCH 100/114] changes in order to detect bug

---
 .../R/functions_REFERENCE.SAMPLE.SET.SELECTOR.R                  | 1 +
 1 file changed, 1 insertion(+)

diff --git a/R/REFERENCE.SAMPLE.SET.SELECTOR/R/functions_REFERENCE.SAMPLE.SET.SELECTOR.R b/R/REFERENCE.SAMPLE.SET.SELECTOR/R/functions_REFERENCE.SAMPLE.SET.SELECTOR.R
index e9b7379..f0d64f5 100644
--- a/R/REFERENCE.SAMPLE.SET.SELECTOR/R/functions_REFERENCE.SAMPLE.SET.SELECTOR.R
+++ b/R/REFERENCE.SAMPLE.SET.SELECTOR/R/functions_REFERENCE.SAMPLE.SET.SELECTOR.R
@@ -73,6 +73,7 @@ kmeans_method <- function(investigated_sample, Y, kmeans_clusters){
   print(kmeans_clusters)
   samples <- colnames(Y)
   print(samples)
+  print(kmeans_clusters$cluster)
   cluster_id <- kmeans_clusters$cluster[investigated_sample]
   print(cluster_id)
   reference_samples <- c()

From e408aeaf99d006bebe268be6e080f0d01c484e52 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Wiktor=20Ku=C5=9Bmirek?= <kusmirekwiktor@gmail.com>
Date: Mon, 21 May 2018 18:48:23 +0200
Subject: [PATCH 101/114] code clean up, bugfix

---
 Docker/cnv-opt-reference-sample-set-selector/Dockerfile      | 5 -----
 .../R/functions_REFERENCE.SAMPLE.SET.SELECTOR.R              | 4 ----
 .../R/run_REFERENCE.SAMPLE.SET.SELECTOR.R                    | 2 +-
 3 files changed, 1 insertion(+), 10 deletions(-)

diff --git a/Docker/cnv-opt-reference-sample-set-selector/Dockerfile b/Docker/cnv-opt-reference-sample-set-selector/Dockerfile
index 92b7bcc..d854276 100644
--- a/Docker/cnv-opt-reference-sample-set-selector/Dockerfile
+++ b/Docker/cnv-opt-reference-sample-set-selector/Dockerfile
@@ -5,9 +5,4 @@ ARG CACHE_DATE=not_a_specified_date
 
 RUN Rscript -e "install.packages('ExomeDepth', repos = 'http://cran.us.r-project.org')"
 
-RUN pwd
-RUN pwd
-RUN pwd
-RUN pwd
-
 RUN Rscript -e "install.packages('REFERENCE.SAMPLE.SET.SELECTOR', repos = 'http://zsibio.ii.pw.edu.pl/nexus/repository/r-all')"
diff --git a/R/REFERENCE.SAMPLE.SET.SELECTOR/R/functions_REFERENCE.SAMPLE.SET.SELECTOR.R b/R/REFERENCE.SAMPLE.SET.SELECTOR/R/functions_REFERENCE.SAMPLE.SET.SELECTOR.R
index f0d64f5..4863a15 100644
--- a/R/REFERENCE.SAMPLE.SET.SELECTOR/R/functions_REFERENCE.SAMPLE.SET.SELECTOR.R
+++ b/R/REFERENCE.SAMPLE.SET.SELECTOR/R/functions_REFERENCE.SAMPLE.SET.SELECTOR.R
@@ -70,12 +70,8 @@ kmeans_select_groups <- function(Y, number_of_clusters){
 }
 
 kmeans_method <- function(investigated_sample, Y, kmeans_clusters){
-  print(kmeans_clusters)
   samples <- colnames(Y)
-  print(samples)
-  print(kmeans_clusters$cluster)
   cluster_id <- kmeans_clusters$cluster[investigated_sample]
-  print(cluster_id)
   reference_samples <- c()
   list_index <- 1
   for(i in kmeans_clusters$cluster) {
diff --git a/R/REFERENCE.SAMPLE.SET.SELECTOR/R/run_REFERENCE.SAMPLE.SET.SELECTOR.R b/R/REFERENCE.SAMPLE.SET.SELECTOR/R/run_REFERENCE.SAMPLE.SET.SELECTOR.R
index 12053e5..202c284 100644
--- a/R/REFERENCE.SAMPLE.SET.SELECTOR/R/run_REFERENCE.SAMPLE.SET.SELECTOR.R
+++ b/R/REFERENCE.SAMPLE.SET.SELECTOR/R/run_REFERENCE.SAMPLE.SET.SELECTOR.R
@@ -11,7 +11,7 @@ run_REFERENCE.SAMPLE.SET.SELECTOR <- function(select_method,
   target_length <- targets[,"st_bp"] - targets[,"ed_bp"]
   reference_samples <- list()
   if(select_method == "kmeans") {
-    kmeans_clusters <- kmeans_select_groups(Y, num_refs)
+    kmeans_clusters <- kmeans_select_groups(Y, num_refs)$clusters
   }
 
   for(i in 1:length(sampname)) {

From a7b6352455119e828291421d58a125119569ea48 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Wiktor=20Ku=C5=9Bmirek?= <kusmirekwiktor@gmail.com>
Date: Thu, 11 Oct 2018 09:02:38 +0200
Subject: [PATCH 102/114] CNV.SIMULATOR package init

---
 Docker/cnv-opt-cnv-simulator/Dockerfile     |   6 +
 R/CNV.SIMULATOR/DESCRIPTION                 |  18 +
 R/CNV.SIMULATOR/NAMESPACE                   |   2 +
 R/CNV.SIMULATOR/R/functions_CNV.SIMULATOR.R | 685 ++++++++++++++++++++
 R/CNV.SIMULATOR/R/run_CNV.SIMULATOR.R       |  32 +
 5 files changed, 743 insertions(+)
 create mode 100644 Docker/cnv-opt-cnv-simulator/Dockerfile
 create mode 100644 R/CNV.SIMULATOR/DESCRIPTION
 create mode 100644 R/CNV.SIMULATOR/NAMESPACE
 create mode 100644 R/CNV.SIMULATOR/R/functions_CNV.SIMULATOR.R
 create mode 100644 R/CNV.SIMULATOR/R/run_CNV.SIMULATOR.R

diff --git a/Docker/cnv-opt-cnv-simulator/Dockerfile b/Docker/cnv-opt-cnv-simulator/Dockerfile
new file mode 100644
index 0000000..eec7fc9
--- /dev/null
+++ b/Docker/cnv-opt-cnv-simulator/Dockerfile
@@ -0,0 +1,6 @@
+FROM biodatageeks/cnv-opt-codex
+MAINTAINER biodatageeks <team@biodatageeks.ii.pw.edu.pl>
+
+ARG CACHE_DATE=not_a_specified_date
+
+RUN Rscript -e "install.packages('CNV.SIMULATOR', repos = 'http://zsibio.ii.pw.edu.pl/nexus/repository/r-all')"
diff --git a/R/CNV.SIMULATOR/DESCRIPTION b/R/CNV.SIMULATOR/DESCRIPTION
new file mode 100644
index 0000000..0824435
--- /dev/null
+++ b/R/CNV.SIMULATOR/DESCRIPTION
@@ -0,0 +1,18 @@
+Package: CNV.SIMULATOR
+Title: CNV.SIMULATOR Package
+Version: 0.0.1
+Authors@R: c(
+    person("Tomasz", "Gambin", email = "tgambin@gmail.com", role = c("aut", "cre")),
+    person("Marek", "Wiewiórka", email = "marek.wiewiorka@gmail.com", role = c("aut")),
+    person("Wiktor", "Kuśmirek", email = "kusmirekwiktor@gmail.com", role = c("aut")))
+Description: An implementation of the CANOES package in R.
+Depends:
+    R (>= 3.2.3),
+    plyr (>= 1.8.4),
+    nnls (>= 1.4.0),
+    Hmisc (>= 4.0.0),
+    mgcv (>= 1.8.0)
+License: GPL-3
+Encoding: UTF-8
+LazyData: true
+RoxygenNote: 6.0.1.9000
diff --git a/R/CNV.SIMULATOR/NAMESPACE b/R/CNV.SIMULATOR/NAMESPACE
new file mode 100644
index 0000000..884a631
--- /dev/null
+++ b/R/CNV.SIMULATOR/NAMESPACE
@@ -0,0 +1,2 @@
+# Generated by roxygen2: fake comment so roxygen2 overwrites silently.
+exportPattern("^[^\\.]")
diff --git a/R/CNV.SIMULATOR/R/functions_CNV.SIMULATOR.R b/R/CNV.SIMULATOR/R/functions_CNV.SIMULATOR.R
new file mode 100644
index 0000000..b3077b4
--- /dev/null
+++ b/R/CNV.SIMULATOR/R/functions_CNV.SIMULATOR.R
@@ -0,0 +1,685 @@
+# Constants: number of HMM states and the index of each state
+NUM.ABNORMAL.STATES <- 2
+NUM.STATES <- 3
+DELETION <- 1
+NORMAL <- 2
+DUPLICATION <- 3
+
+# PlotCNV
+#     Plots count data for targets of interest
+#     highlights sample of interest in red, 
+#     highlights area of interest with a black line
+#     highlights probe locations with black dots
+# Arguments:
+#   counts: 
+#     count matrix, with column "target" with target numbers 
+#     and sample data in columns 6:end
+#   sample.name:
+#     sample of interest (will be highlighted in red in figure)
+#     (should correspond to a column in counts)
+#   targets:
+#     targets of interest in the form start.target..end.target
+#   offset:
+#     number of targets to add on either end (default=1)
+# Returns: 
+#   returns nothing
+PlotCNV <- function(counts, sample.name, targets, offset=1){
+  sample.name <- as.character(sample.name)
+  if (!sample.name %in% names(counts)){stop("No column for sample ", sample.name, " in counts matrix")}
+  if (!"target" %in% names(counts)[1:5]){
+    stop("counts matrix must have column named target")
+  }
+  target.bounds <- as.numeric(unlist(strsplit(as.character(targets), "..", fixed=TRUE)))
+  start.target <- target.bounds[1]
+  end.target <- target.bounds[2]
+  if (!start.target %in% counts$target){
+    stop("no data for start.target in counts matrix")
+  }
+  if (!end.target %in% counts$target){
+    stop("no data for end.target in counts matrix")
+  }
+  if ((start.target - offset) %in% counts$target){
+    start.target <- start.target - offset
+  }
+  if ((end.target + offset) %in% counts$target){
+    end.target <- end.target + offset
+  }
+  ref.sample.names <- setdiff(as.character(names(counts)[-seq(1,5)]), 
+                              sample.name)
+  data <- subset(counts, target >= start.target & target <= end.target)
+  sample.data <- data[, sample.name]
+  means <- apply(data[, ref.sample.names], 1, mean)
+  sd <- sqrt(apply(data[, ref.sample.names], 1, var))
+  refs.z.scores <- matrix(NA, nrow(data), length(ref.sample.names))
+  sample.z.score <- numeric(length = nrow(data))
+  for (i in seq_len(nrow(data))){  # the 0.000001 floor avoids division by a zero sd
+    refs.z.scores[i, ] <- as.numeric((data[i, ref.sample.names] - means[i]) / 
+                                       max(0.000001, sd[i]))
+    sample.z.score[i] <- (sample.data[i] - means[i]) / max(0.000001, sd[i])
+  }
+  ylim <- max(abs(refs.z.scores), abs(sample.z.score))
+  plot(seq(-6, 6), seq(-6, 6), 
+       xlim=c(data[1, "start"], data[dim(data)[1], "start"]), 
+       ylim=c(-ylim - 0.1, ylim + 0.1), type="n", xlab="", ylab="Z-score")
+  for (i in seq_along(ref.sample.names)){  # one grey line per reference sample
+    lines(data[, "start"], refs.z.scores[, i], col="#2f4f4f85")
+  }
+  lines(data[, "start"], sample.z.score, col="red", lwd=3)
+  points(data[, "start"], rep(-ylim - 0.05, length(data[, "start"])), pch=20)
+  lines( c(data[1 + offset, "start"], data[nrow(data) - offset, "end"]) , 
+         c(ylim+0.2, ylim+0.2), lwd=2)
+  title(main=paste("Sample ", sample.name, ", ",  # chromosome is looked up by target id, not by row number
+                   counts$chromosome[match(start.target, counts$target)], ":", 
+                   data$start[1], "-", data$end[nrow(data)], sep=""))
+}
+
+# CallCNVs
+#     Calls CNVs in sample of interest
+# Arguments:
+#   sample.name:
+#     sample to call CNVs in (should correspond to a column in counts)
+#   counts: 
+#     count matrix, first five columns should be 
+#       target: consecutive numbers for targets (integer)
+#       chromosome: chromosome number (integer-valued) 
+#         (support for sex chromosomes to come)
+#       start: start position of probe (integer)
+#       end: end position of probe (integer)
+#       gc: gc content (real between 0 and 1)
+#       subsequent columns should include counts for each probe for samples
+#   p:
+#     average rate of occurrence of CNVs (real) default is 1e-08
+#   D:
+#     expected distance between targets in a CNV (integer) default is 70,000
+#   Tnum:
+#     expected number of targets in a CNV (integer) default is 6
+#   numrefs
+#     maximum number of reference samples to use (integer) default is 30
+#     the weighted variance calculations will take a long time if too 
+#     many reference samples are used
+# Returns: 
+#   data frame with the following columns:
+#      SAMPLE: name of sample
+#      CNV: DEL or DUP
+#      INTERVAL: CNV coordinates in the form chr:start-stop
+#      KB: length of CNV in kilobases
+#      CHR: chromosome
+#      MID_BP: middle base pair of CNV
+#      TARGETS: target numbers of CNV in the form start..stop
+#      NUM_TARG: how many targets are in the CNV
+#      Q_SOME: a Phred-scaled quality score for the CNV
+CallCNVs <- function(sample.name, counts, p=1e-08, Tnum=6, D=70000, numrefs=30, get.dfs=FALSE, homdel.mean=0.2){
+  if (!sample.name %in% names(counts)){stop("No column for sample ", sample.name, " in counts matrix")}
+  if (length(setdiff(names(counts)[1:5], c("target", "chromosome", "start", "end", "gc"))) > 0){
+    stop("First five columns of counts matrix must be target, chromosome, start, end, gc")
+  }
+  if (length(setdiff(unique(counts$chromosome), 1:22)) > 0) {
+    # remove sex chromosomes
+    cat("Trying to remove sex chromosomes and 'chr' prefixes\n")
+    counts <- subset(counts, !chromosome %in% c("chrX", "chrY", "X", "Y"))
+    if (sum(grepl("chr", counts$chromosome))==length(counts$chromosome)){
+      counts$chromosome <- gsub("chr", "", counts$chromosome)
+    }
+    counts$chromosome <- as.numeric(counts$chromosome)
+    if (length(setdiff(unique(counts$chromosome), 1:22)) > 0) 
+      stop("chromosome must take value in range 1-22 (support for sex chromosomes to come)")
+  }
+  library(plyr)
+  counts <- arrange(counts, chromosome, start)
+  if (p <= 0){
+    stop("parameter p must be positive")
+  }
+  if (Tnum <= 0){
+    stop("parameter Tnum must be positive")
+  }
+  if (D <= 0){
+    stop("parameter D must be positive")
+  }
+  if (numrefs <= 0){
+    stop("parameter numrefs must be positive")
+  }
+  sample.names <- colnames(counts)[-seq(1,5)]
+  # find mean coverage of probes
+  mean.counts <- mean(apply(counts[, sample.names], 2, mean))
+  # normalize counts; round so we can use negative binomial
+  counts[, sample.names] <- apply(counts[, sample.names], 2, 
+        function(x, mean.counts) 
+                 round(x * mean.counts / mean(x)), mean.counts)
+  # calculate correlation of read count across samples
+  cor.matrix <- cor(counts[, sample.names], counts[, sample.names])
+  reference.samples <- setdiff(sample.names, sample.name)
+  correlations <- cor.matrix[sample.name, reference.samples]
+  reference.samples <- names(sort(correlations, 
+          decreasing=TRUE)[1:min(numrefs, length(correlations))])
+  sample.mean.counts <- mean(counts[, sample.name])
+  # reference.samples now holds at most numrefs samples, best correlated first
+  # normalize reference samples to sample of interest
+  counts[, reference.samples] <- apply(counts[, reference.samples], 2, 
+        function(x, sample.mean.counts) 
+                round(x * sample.mean.counts / 
+                mean(x)), sample.mean.counts)  
+  # select reference samples and weightings using non-negative least squares
+  b <- counts[, sample.name]
+  A <- as.matrix(counts[, reference.samples])
+  library(nnls)
+  # the weights are averaged over 50 NNLS fits on random subsets of <= 500 probes
+  est <- matrix(0, nrow=50, ncol=length(reference.samples))
+  set.seed(1)  # fixed seed keeps the subsampling reproducible (resets the global RNG)
+  for (i in 1:50){
+    d <- sample(nrow(A), min(500, nrow(A)))
+    est[i, ] <- nnls(A[d, ], b[d])$x
+  }
+  weights <- colMeans(est)
+  sample.weights <- weights / sum(weights)
+  library(Hmisc)
+  # calculate weighted mean of read count
+  # this is used to calculate emission probabilities
+  counts$mean <- apply(counts[, reference.samples], 
+                       1, wtd.mean, sample.weights)
+  # exclude probes with all zero counts, i.e. probes whose weighted reference
+  # mean is 0
+  nonzero.rows <- counts$mean > 0
+  # from here on counts only holds the remaining probes, so their target ids
+  # may no longer be consecutive
+
+  counts <- counts[nonzero.rows, ]
+  # get the distances between consecutive probes
+  distances <- GetDistances(counts)
+  # estimate the read count variance at each probe
+  var.estimate <- EstimateVariance(counts, reference.samples, 
+                                               sample.weights)
+  emission.probs <- EmissionProbs(counts[, sample.name], 
+                        counts$mean, var.estimate$var.estimate, 
+                        counts[, "target"])
+  if (get.dfs){
+    return(list(emission.probs=emission.probs, distances=distances))
+  }
+  # call CNVs with the Viterbi algorithm
+  viterbi.state <- Viterbi(emission.probs, distances, p, Tnum, D)  
+  # format the CNVs
+  cnvs <- PrintCNVs(sample.name, viterbi.state, 
+                         counts)
+  # if there aren't too many CNVs, calculate the Q_SOME
+  if (nrow(cnvs) > 0 && nrow(cnvs) <= 50){
+    qualities <- GenotypeCNVs(cnvs, sample.name, counts, p, Tnum, D, numrefs, 
+                          emission.probs=emission.probs, 
+                          distances=distances)
+    for (i in seq_len(nrow(cnvs))){
+      cnvs$Q_SOME[i] <- ifelse(cnvs$CNV[i]=="DEL", qualities[i, "SQDel"], 
+                               qualities[i, "SQDup"])
+    }
+  }
+  data <- as.data.frame(cbind(counts$target, counts$mean, var.estimate$var.estimate, counts[, sample.name]))
+  names(data) <- c("target", "countsmean", "varestimate", "sample")
+  if (nrow(cnvs) > 0){
+    cnvs <- CalcCopyNumber(data, cnvs, homdel.mean)
+  }
+  return(cnvs)
+}
+
+# GenotypeCNVs
+#     Genotype CNVs in sample of interest
+# Arguments:
+#   xcnvs
+#     data frame with the following columns, and one row for each
+#     CNV to genotype
+#      INTERVAL: CNV coordinates in the form chr:start-stop
+#      TARGETS: target numbers of CNV in the form start..stop
+#               these should correspond to the target numbers in counts
+#   sample.name:
+#     sample to genotype CNVs in (should correspond to a column in counts)
+#   counts: 
+#     count matrix, first five columns should be 
+#       target: consecutive numbers for targets (integer)
+#       chromosome: chromosome number (integer-valued) 
+#         (support for sex chromosomes to come)
+#       start: start position of probe (integer)
+#       end: end position of probe (integer)
+#       gc: gc content (real between 0 and 1)
+#       subsequent columns should include counts for each probe for samples
+#   p:
+#     average rate of occurrence of CNVs (real) default is 1e-08
+#   D:
+#     expected distance between targets in a CNV (integer) default is 70,000
+#   Tnum:
+#     expected number of targets in a CNV (integer) default is 6
+#   numrefs
+#     maximum number of reference samples to use (integer) default is 30
+#     the weighted variance calculations will take a long time if too 
+#     many reference samples are used
+#   emission.probs and distances are for internal use only
+# Returns: 
+#   data frame with the following columns and one row for each genotyped CNV:
+#      INTERVAL: CNV coordinates in the form chr:start-stop
+#      NQDel: a Phred-scaled quality score that sample.name has no deletion 
+#             in the interval
+#      SQDel: a Phred-scaled quality score that sample.name has a deletion 
+#             in the interval
+#      NQDup and SQDup: same, but for a duplication
+GenotypeCNVs <- function(xcnvs, sample.name, counts, p=1e-08, Tnum=6, 
+                    D=70000, numrefs=30,
+                    emission.probs=NULL, 
+                    distances=NULL){
+  if (!sample.name %in% names(counts)){stop("No column for sample ", sample.name, " in counts matrix")}
+  if (length(setdiff(names(counts)[1:5], c("target", "chromosome", "start", "end", "gc"))) > 0){
+    stop("First five columns of counts matrix must be target, chromosome, start, end, gc")
+  }
+  if (length(setdiff(unique(counts$chromosome), 1:22)) > 0) {
+    # remove sex chromosomes
+    cat("Trying to remove sex chromosomes and 'chr' prefixes\n")
+    counts <- subset(counts, !chromosome %in% c("chrX", "chrY", "X", "Y"))
+    if (sum(grepl("chr", counts$chromosome))==length(counts$chromosome)){
+      counts$chromosome <- gsub("chr", "", counts$chromosome)
+    }
+    counts$chromosome <- as.numeric(counts$chromosome)
+    if (length(setdiff(unique(counts$chromosome), 1:22)) > 0) 
+      stop("chromosome must take value in range 1-22 (support for sex chromosomes to come)")
+  }
+  library(plyr)
+  counts <- arrange(counts, chromosome, start)
+  if (p <= 0){
+    stop("parameter p must be positive")
+  }
+  if (Tnum <= 0){
+    stop("parameter Tnum must be positive")
+  }
+  if (D <= 0){
+    stop("parameter D must be positive")
+  }
+  if (numrefs <= 0){
+    stop("parameter numrefs must be positive")
+  }
+  num.cnvs <- nrow(xcnvs)
+  cnv.intervals <- as.character(xcnvs$INTERVAL)
+  # if no emission probs matrix is passed in, generate a new one with the caller's settings
+  if (is.null(emission.probs)){
+    l <- CallCNVs(sample.name, counts, p, Tnum=Tnum, D=D, numrefs=numrefs, get.dfs=TRUE)
+    emission.probs <- l[['emission.probs']]
+    distances <- l[['distances']]
+  }
+  forward.m <- GetForwardMatrix(emission.probs, distances, p, Tnum, D)
+  backward.m <- GetBackwardMatrix(emission.probs, distances, p, Tnum, D)
+  qualities <- matrix(0, nrow=num.cnvs, ncol=5, 
+                      dimnames=list(cnv.intervals, 
+                                    c("INTERVAL", "NQDel", "SQDel", "NQDup", "SQDup")))
+  for (i in seq_len(num.cnvs)){  # seq_len() skips the loop when xcnvs has no rows
+    interval <- as.character(xcnvs[i, "INTERVAL"])
+    targets <- as.numeric(strsplit(as.character(xcnvs[i, "TARGETS"]), ".", fixed=TRUE)[[1]][c(1,3)])
+    left.target <- targets[1]
+    right.target <- targets[2]
+    likelihoods <- GetModifiedLikelihood(forward.m, backward.m, 
+                                         emission.probs, distances, 
+                                         left.target, right.target, 
+                                         c(DUPLICATION, DELETION), p, Tnum, D)
+    modified.likelihood <- likelihoods[1]; 
+    unmodified.likelihood <- likelihoods[2]
+    Prob.All.Normal <- exp(modified.likelihood - unmodified.likelihood)
+    likelihoods <- GetModifiedLikelihood(forward.m, backward.m, 
+                                         emission.probs, distances, 
+                                         left.target, right.target, DELETION, p, Tnum, D)
+    modified.likelihood <- likelihoods[1]; 
+    unmodified.likelihood <- likelihoods[2]
+    Prob.No.Deletion <- exp(modified.likelihood - unmodified.likelihood)
+    likelihoods <- GetModifiedLikelihood(forward.m, backward.m, 
+                                         emission.probs, distances, 
+                                         left.target, right.target, DUPLICATION, p, Tnum, D)
+    modified.likelihood <- likelihoods[1]; 
+    unmodified.likelihood <- likelihoods[2]
+    Prob.No.Duplication <- exp(modified.likelihood - unmodified.likelihood)
+    # TODO: check if probabilities greater than 1 are numerical error or bug
+    Phred <- function(prob){
+      return(round(min(99, -10 * log10(1 - prob))))
+    }
+    qualities[i, "NQDel"] <- Phred(Prob.No.Deletion)       
+    qualities[i, "SQDel"] <- Phred(Prob.No.Duplication - Prob.All.Normal)
+    qualities[i, "NQDup"] <- Phred(Prob.No.Duplication)       
+    qualities[i, "SQDup"] <- Phred(Prob.No.Deletion - Prob.All.Normal)
+    qualities[i, "INTERVAL"] <- interval
+  }
+  qualities <- as.data.frame(qualities, stringsAsFactors=FALSE)
+  qualities$NQDel <- as.integer(qualities$NQDel)
+  qualities$NQDup <- as.integer(qualities$NQDup)
+  qualities$SQDel <- as.integer(qualities$SQDel)
+  qualities$SQDup <- as.integer(qualities$SQDup)
+  return(qualities)
+}
+
+# GetDistances
+#   Returns a data frame with the distance from the previous target to each
+#   target (0 for the first row). Every chromosome change adds 1e12 per
+#   chromosome step, a very big number that resets the HMM for each chromosome.
+#   Assumes counts is sorted by chromosome and start, with numeric chromosome.
+GetDistances <- function(counts){
+  chromosome <- counts[, "chromosome"]
+  startbase <- counts[, "start"]
+  # diff() copes with a single row, where indexing with 2:n would count down
+  distances <- c(0, diff(startbase) + 1000000000000 * diff(chromosome))
+  # for an empty table drop the leading 0 so both columns have equal length
+  distances <- distances[seq_along(startbase)]
+  return(data.frame(target=counts[, "target"], distance=distances))
+}
+
+EstimateVariance <- function(counts, ref.sample.names, sample.weights){  # returns data frame: target, var.estimate
+  library(Hmisc)
+  counts$var <- apply(counts[, ref.sample.names], 1, wtd.var, sample.weights, normwt=TRUE)
+  set.seed(1)  # fixed seed keeps the row subsample reproducible (resets the global RNG)
+  counts.subset <- counts[sample(nrow(counts), min(36000, nrow(counts))), ]
+  library(mgcv)
+  # the Gamma family needs a strictly positive response, so zero variances are bumped to 0.1
+  counts.subset$var[counts.subset$var==0] <- 0.1 
+  fit <- gam(var ~ s(mean) + s(gc), family=Gamma(link=log), data=counts.subset)
+  # we don't want variance less than Poisson
+  # we take maximum of genome-wide estimate, method of moments estimate
+  # and Poisson variance
+  v.estimate <- pmax(predict(fit, counts, type="response"), counts$var, 
+                     counts$mean * 1.01)
+  return(data.frame(target=counts$target, var.estimate=v.estimate))
+}
+
+EmissionProbs <- function(test.counts, target.means, 
+                                      var.estimate, targets){
+  num.targets <- length(test.counts)
+  # calculate the means for the deletion, normal and duplication states
+  state.target.means <- t(apply(data.frame(x=target.means), 1, function(x) c(x*1/2, x, x*3/2)))
+  # calculate the expected size (given the predicted variance)
+  size <- target.means ^ 2 / (var.estimate - target.means)
+  emission.probs <- matrix(NA, num.targets, 4)
+  colnames(emission.probs) <- c("target", "delprob", "normalprob", "dupprob")
+  # calculate the log emission probabilities given the read count;
+  # the size parameter is scaled like the mean (x 1/2 for a deletion,
+  # x 3/2 for a duplication)
+  size.del <- size / 2
+  size.dup <- size * 3 / 2
+  emission.probs[, "delprob"] <- dnbinom(
+    test.counts,
+    mu=state.target.means[, 1],
+    size=size.del, log=TRUE)
+  emission.probs[, "normalprob"] <- dnbinom(
+    test.counts,
+    mu=state.target.means[, 2],
+    size=size, log=TRUE)
+  emission.probs[, "dupprob"] <- dnbinom(
+    test.counts,
+    mu=state.target.means[, 3],
+    size=size.dup, log=TRUE)
+  emission.probs[, "target"] <- targets
+  # all three probabilities may be infinite as a result of extreme read count; only columns 2:4 are tested because the target id is always finite
+  row.all.inf <- which(apply(emission.probs[, 2:4, drop=FALSE], 1, function(x){all(is.infinite(x))}))
+  if (length(row.all.inf) > 0){
+    for (i in row.all.inf){
+      if (test.counts[i] >= state.target.means[i, 3]){
+        emission.probs[i, 2:4] <- c(-Inf, -Inf, -0.01)
+      }
+      else if (test.counts[i] <= state.target.means[i, 1]){
+        emission.probs[i, 2:4] <- c(-0.01, -Inf, -Inf)
+      }
+      else emission.probs[i, 2:4] <- c(-Inf, -0.01, -Inf)
+    }
+  }
+  return(emission.probs)
+}
+
+# Viterbi algorithm: returns a data frame with the most likely state per target
+Viterbi <- function(emission.probs.matrix, distances, p, Tnum, D){
+  targets <- emission.probs.matrix[, 1]
+  emission.probs.matrix <- as.matrix(emission.probs.matrix[, 2:4, drop=FALSE])
+  num.exons <- nrow(emission.probs.matrix)
+  viterbi.matrix <- matrix(NA, nrow=num.exons, ncol=NUM.STATES)
+  viterbi.pointers <- matrix(NA, nrow=num.exons, ncol=NUM.STATES)
+  # prior: 0.0075 chance of starting abnormal, split between both abnormal states
+  initial.state <- log(c(0.0075 / NUM.ABNORMAL.STATES, 1 - 0.0075, 0.0075 / NUM.ABNORMAL.STATES))
+  viterbi.matrix[1, ] <- initial.state + emission.probs.matrix[1,]
+  for (i in seq_len(num.exons)[-1]) {
+    # temp.matrix[j, k]: best score ending in state j at i - 1, then moving to k
+    temp.matrix <- viterbi.matrix[i - 1, ] + GetTransitionMatrix(distances$distance[i], p, Tnum, D)
+    viterbi.pointers[i, ] <- apply(temp.matrix, 2, which.max)
+    viterbi.matrix[i, ] <- apply(temp.matrix, 2, max) + emission.probs.matrix[i, ]
+  }
+  # trace the pointers back from the best final state
+  viterbi.states <- vector(length = num.exons)
+  viterbi.states[num.exons] <- which.max(viterbi.matrix[num.exons, ])
+  for (i in rev(seq_len(num.exons - 1))) {
+    viterbi.states[i] <- viterbi.pointers[i + 1, viterbi.states[i + 1]]
+  }
+  return(data.frame(target=targets, viterbi.state=viterbi.states))
+}
+
+# returns a transition matrix
+#                              to state
+#                    deletion   normal    duplication
+#           deletion   
+#from state   normal
+#        duplication
+GetTransitionMatrix <- function(distance, p, Tnum, D){
+  leave.rate <- 1 / Tnum
+  decay <- exp(-distance / D)
+  # probabilities of leaving an abnormal (deletion or duplication) state
+  stay.abnormal <- decay * (1 - leave.rate) + (1 - decay) * p
+  to.normal <- decay * leave.rate + (1 - decay) * (1 - 2 * p)
+  switch.abnormal <- (1 - decay) * p
+  # one vector per "from" state, ordered deletion, normal, duplication
+  from.deletion <- c(stay.abnormal, to.normal, switch.abnormal)
+  from.normal <- c(p, 1 - 2 * p, p)
+  from.duplication <- c(switch.abnormal, to.normal, stay.abnormal)
+  transition.probs <- c(from.deletion, from.normal, from.duplication)
+  # byrow=TRUE makes each "from" vector a row of the log-probability matrix
+  return(log(matrix(transition.probs, NUM.STATES, NUM.STATES, byrow=TRUE)))
+}
+
+# adds two log-space probabilities: log(p1 + p2) = max + log1p(exp(-|diff|));
+# an infinite argument (e.g. log(0) = -Inf) makes the other one the result
+AddTwoProbabilities <- function(x, y){
+  if (is.infinite(x)) return(y)
+  if (is.infinite(y)) return(x)
+  max(x, y) + log1p(exp(-abs(x - y)))
+}
+
+# adds multiple log-space probabilities (a single value is returned as is)
+SumProbabilities <- function(x){
+  sum.probs <- x[1]
+  for (i in seq_along(x)[-1]){  # empty for length 1, where 2:length(x) would count down
+    sum.probs <- AddTwoProbabilities(sum.probs, x[i])
+  }
+  return(sum.probs)
+}
+
+# log-likelihood of the data: forward plus backward log-probabilities at token x,
+# summed over the states (should give the same value regardless of the token)
+GetLikelihood <- function(forward.matrix, backward.matrix, x){
+  joint.log.probs <- forward.matrix[x, ] + backward.matrix[x, ]
+  return(SumProbabilities(joint.log.probs))
+}
+
+# get the forward log-probabilities (one row per target, one column per state)
+GetForwardMatrix <- function(emission.probs.matrix, distances, p, Tnum, D){
+  emission.probs.matrix <- as.matrix(emission.probs.matrix[, 2:4, drop=FALSE])  # keep one row a matrix
+  num.exons <- nrow(emission.probs.matrix)
+  forward.matrix <- matrix(NA, nrow=num.exons, ncol=NUM.STATES)   # matrix to hold forward probabilities
+  initial.state <- log(c(0.0075 / NUM.ABNORMAL.STATES, 1 - 0.0075, 0.0075 / NUM.ABNORMAL.STATES))
+  forward.matrix[1, ] <- initial.state + emission.probs.matrix[1, ]
+  for (i in seq_len(num.exons)[-1]){
+    # compute matrix with probability we were in state j and are now in state i
+    # in temp.matrix[j, i] (ignoring emission of current token)
+    temp.matrix <- forward.matrix[i - 1, ] + GetTransitionMatrix(distances$distance[i], p, Tnum, D)
+    # find the probability that we are in each of the three states
+    sum.probs <- apply(temp.matrix, 2, SumProbabilities)
+    forward.matrix[i, ] <- sum.probs + emission.probs.matrix[i, ]
+  }
+  return(forward.matrix)
+}
+
+# get the backward log-probabilities (one row per target, one column per state)
+GetBackwardMatrix <- function(emission.probs.matrix, distances, 
+                                  p, Tnum, D){
+  emission.probs.matrix <- as.matrix(emission.probs.matrix[, 2:4, drop=FALSE])  # keep one row a matrix
+  num.exons <- nrow(emission.probs.matrix)
+  backward.matrix <- matrix(NA, nrow=num.exons, ncol=NUM.STATES)   # matrix to hold backward probabilities
+  # base case: the last row is log(1) = 0 for every state
+  backward.matrix[num.exons, ] <- rep(0, NUM.STATES)
+  for (i in rev(seq_len(num.exons - 1))){
+    # temp.matrix[j, k]: move from state j to state k, emit token i + 1 in k, then continue
+    temp.matrix <- GetTransitionMatrix(distances$distance[i+1], p, Tnum, D) + 
+      matrix(backward.matrix[i + 1, ], NUM.STATES, NUM.STATES, byrow=TRUE) +
+      matrix(emission.probs.matrix[i+1, ], NUM.STATES, NUM.STATES, byrow=TRUE)
+    backward.matrix[i, ] <- apply(temp.matrix, 1, SumProbabilities)
+  }
+  return(backward.matrix)
+}
+
+# find the log-likelihood of the data given that certain states are disallowed
+# between start target and end target; returns c(modified, unmodified)
+GetModifiedLikelihood <- function(forward.matrix, backward.matrix, emission.probs.matrix, distances, 
+                                      start.target, end.target, disallowed.states, p, Tnum, D){
+  targets <- emission.probs.matrix[, 1]
+  emission.probs.matrix <- as.matrix(emission.probs.matrix[, 2:4])
+  # there may be missing targets in this sample, we genotype the largest stretch of 
+  # targets that lie in the CNV
+  left.target <- min(which(targets >= start.target))
+  right.target <- max(which(targets <= end.target))
+  num.exons <- dim(emission.probs.matrix)[1]
+  unmodified.likelihood <- GetLikelihood(forward.matrix, 
+                                             backward.matrix, min(right.target + 1, num.exons))
+  # if no target lies inside [start.target, end.target], left.target ends up
+  # greater than right.target and stopifnot() aborts (disabled alternative below)
+  #if (right.target >= left.target) return(c(NA, unmodified.likelihood))
+  stopifnot(right.target >= left.target)
+  modified.emission.probs.matrix <- emission.probs.matrix
+  modified.emission.probs.matrix[left.target:right.target, 
+                                 disallowed.states] <- -Inf
+  
+  # if the start target is the first target we need to recalculate the 
+  # forward probabilities
+  # for that target, using the modified emission probabilities
+  if (left.target == 1){
+    initial.state <- log(c(0.0075 / NUM.ABNORMAL.STATES, 1 - 0.0075, 0.0075 / NUM.ABNORMAL.STATES))
+    forward.matrix[1, ] <- initial.state + modified.emission.probs.matrix[1, ]
+    left.target <- left.target + 1
+  } 
+  for (i in seq(left.target, min(right.target + 1, num.exons))){
+    # compute matrix with probability we were in state j and are now in state i
+    # in temp.matrix[j, i] (ignoring emission of current token)
+    temp.matrix <- forward.matrix[i - 1, ] + GetTransitionMatrix(distances$distance[i], p, Tnum, D)
+    # rows inside the interval use the modified emissions; the row after it uses the original ones
+    sum.probs <- apply(temp.matrix, 2, SumProbabilities) 
+    if (!i == (right.target + 1)){
+      forward.matrix[i, ] <- sum.probs + modified.emission.probs.matrix[i, ]
+    } else{
+      forward.matrix[i, ] <- sum.probs + emission.probs.matrix[i, ]
+    }
+  }  
+  # find the modified likelihood of the sequence at the same token as the unmodified one
+  modified.likelihood <- GetLikelihood(forward.matrix, backward.matrix, min(right.target + 1, num.exons))
+  return(c(modified.likelihood, unmodified.likelihood))
+}
+
+SummarizeCNVs <- function(cnv.targets, counts, sample.name, state){
+  # cnv.targets$target holds the row numbers in counts covered by one CNV
+  first.row <- min(cnv.targets$target)
+  last.row <- max(cnv.targets$target)
+  chrom <- counts[first.row, "chromosome"]
+  start.base <- counts[first.row, "start"]
+  end.base <- counts[last.row, "end"]
+  start.target <- counts[first.row, "target"]
+  end.target <- counts[last.row, "target"]
+  return(data.frame(
+    sample.name=sample.name,
+    cnv.type=ifelse(state==3, "DUP", "DEL"),
+    cnv.interval=paste0(chrom, ":", start.base, "-", end.base),
+    cnv.kbs=(end.base - start.base) / 1000,
+    cnv.chromosome=chrom,
+    cnv.midbp=round((end.base - start.base) / 2) + start.base,
+    cnv.targets=paste0(start.target, "..", end.target),
+    num.targets=end.target - start.target + 1))
+}
+
+PrintCNVs <- function(test.sample.name, viterbi.state,
+                      nonzero.counts){
+  # Formats the Viterbi path of one sample as a table of CNV calls.
+  #   test.sample.name: sample the path belongs to
+  #   viterbi.state:    data frame with one viterbi.state value per probe row
+  #   nonzero.counts:   count table whose rows line up with viterbi.state
+  # States 1 (deletion) and 3 (duplication) are summarised separately; each run
+  # of consecutive rows in the same state becomes one call.
+  # Returns a data frame with the columns SAMPLE, CNV, INTERVAL, KB, CHR,
+  # MID_BP, TARGETS, NUM_TARG, MLCN and Q_SOME; it has zero rows (and the last
+  # two columns swapped) when nothing was called.
+  # label runs of consecutive row indices; a new group starts at every gap
+  consecutiveGroups <- function(sequence){
+    cumsum(c(1, diff(sequence) != 1))
+  }
+  num.duplications <- 0
+  num.deletions <- 0
+  for (state in c(1, 3)){
+    cnv.targets <- which(viterbi.state$viterbi.state == state)
+    if (length(cnv.targets) > 0){
+      groups <- consecutiveGroups(cnv.targets)
+      library(plyr)
+      cnvs.temp.df <- ddply(data.frame(target=cnv.targets, group=groups), 
+                            "group", SummarizeCNVs, nonzero.counts, test.sample.name, 
+                            state)
+      if (state == 1){
+        deletions.df <- cnvs.temp.df
+        if (!is.null(dim(deletions.df))){
+          num.deletions <- dim(deletions.df)[1]
+        }
+      } else {
+        duplications.df <- cnvs.temp.df
+        if (!is.null(dim(duplications.df))){
+          num.duplications <- dim(duplications.df)[1]
+        }
+      }
+    }
+  }
+  num.calls <- num.deletions + num.duplications
+  cat(num.calls, "CNVs called in sample", test.sample.name, "\n")
+  if (num.deletions == 0 && num.duplications == 0){
+    df <- data.frame(SAMPLE=character(0), CNV=character(0), INTERVAL=character(0), 
+                     KB=numeric(0), CHR=character(0), 
+                     MID_BP=numeric(), TARGETS=character(0), NUM_TARG=numeric(0), Q_SOME=numeric(0), MLCN=numeric(0))
+    return(df)
+  }
+  if (num.deletions > 0 && num.duplications > 0){
+    cnvs.df <- rbind(deletions.df, duplications.df)
+  } else {
+    # exactly one of the two data frames exists at this point
+    cnvs.df <- if (num.deletions > 0) deletions.df else duplications.df
+  }
+  xcnv <- cbind(cnvs.df[, c("sample.name", "cnv.type", "cnv.interval", 
+                      "cnv.kbs", "cnv.chromosome", "cnv.midbp", 
+                      "cnv.targets", "num.targets")], 0)
+  colnames(xcnv) <- c("SAMPLE", "CNV", "INTERVAL", "KB", "CHR", "MID_BP", "TARGETS",
+                      "NUM_TARG", "MLCN")
+  xcnv$Q_SOME <- NA
+  return(xcnv)
+}
+
+CalcCopyNumber <- function(data, cnvs, homdel.mean){  # fills cnvs$MLCN with the most likely copy number (0-6)
+  for (i in seq_len(nrow(cnvs))){  # seq_len() skips the loop when cnvs has no rows
+    cnv <- cnvs[i, ]
+    targets <- as.numeric(unlist(strsplit(as.character(cnv$TARGETS), "..", fixed=TRUE)))
+    cnv.data <- subset(data, target >= targets[1] & target <= targets[2])
+    state.target.means <- t(apply(data.frame(x=cnv.data$countsmean), 1, 
+                                  function(x) c(C1=x*1/2, C2=x, C3=x*3/2, 
+                                                C4=x * 2, C5=x * 5/2, C6=x*6/2)))
+    # calculate the expected size (given the predicted variance)
+    size <- cnv.data$countsmean ^ 2 / (cnv.data$varestimate - cnv.data$countsmean)
+    emission.probs <- matrix(NA, nrow(cnv.data), 7)
+    colnames(emission.probs) <- c("C0", "C1", "C2", "C3", "C4", "C5", "C6")
+    # copy number 0 is modelled as Poisson with mean homdel.mean, 1-6 as negative binomial
+    # calculate the emission probabilities given the read count
+    emission.probs[, 1] <- dpois(cnv.data$sample, homdel.mean, log=TRUE)
+    for (s in 1:6){
+      size.state <- size * s/2
+      emission.probs[, s+1] <- dnbinom(cnv.data$sample, mu=state.target.means[, s], 
+                                       size=size.state, log=TRUE)
+    }
+    cs <- colSums(emission.probs)
+    ml.state <- which.max(cs) - 1
+    if (ml.state==2){  # a called CNV is never reported as copy number 2
+      ml.state <- ifelse(cnv$CNV=="DEL", 1, 3)
+    }
+    cnvs$MLCN[i] <- ml.state
+  }  
+  return(cnvs)
+}
+
diff --git a/R/CNV.SIMULATOR/R/run_CNV.SIMULATOR.R b/R/CNV.SIMULATOR/R/run_CNV.SIMULATOR.R
new file mode 100644
index 0000000..65ddb36
--- /dev/null
+++ b/R/CNV.SIMULATOR/R/run_CNV.SIMULATOR.R
@@ -0,0 +1,32 @@
+Test <- function(){
+  # read in the data
+  gc <- read.table("gc.txt")$V2
+  canoes.reads <- read.table("canoes.reads.txt")
+  # rename the columns of canoes.reads
+  sample.names <- paste("S", seq(1:26), sep="")
+  names(canoes.reads) <- c("chromosome", "start", "end", sample.names)
+  # create a vector of consecutive target ids
+  target <- seq(1, nrow(canoes.reads))
+  # combine the data into one data frame
+  canoes.reads <- cbind(target, gc, canoes.reads)
+  # call CNVs in each sample
+  # create a vector to hold the results for each sample
+  xcnv.list <- vector('list', length(sample.names))
+  for (i in 1:length(sample.names)){
+    xcnv.list[[i]] <- CallCNVs(sample.names[i], canoes.reads) 
+  }
+  # combine the results into one data frame
+  xcnvs <- do.call('rbind', xcnv.list)
+  # inspect the first two CNV calls
+  print(head(xcnvs, 2))
+  # plot all the CNV calls to a pdf
+  pdf("CNVplots.pdf")
+  for (i in 1:nrow(xcnvs)){
+     PlotCNV(canoes.reads, xcnvs[i, "SAMPLE"], xcnvs[i, "TARGETS"])
+  }
+  dev.off()
+  # genotype all the CNVs calls made above in sample S2
+  genotyping.S2 <- GenotypeCNVs(xcnvs, "S2", canoes.reads)
+  # inspect the genotype scores for the first two CNV calls
+  print(head(genotyping.S2, 2))
+}

From 6dfcaa802870124cc601e5e076dc7731378e9e28 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Wiktor=20Ku=C5=9Bmirek?= <kusmirekwiktor@gmail.com>
Date: Thu, 11 Oct 2018 09:03:29 +0200
Subject: [PATCH 103/114] CNV.SIMULATOR package

---
 Jenkinsfile                                 |   1 +
 R/CNV.SIMULATOR/DESCRIPTION                 |   6 +-
 R/CNV.SIMULATOR/R/functions_CNV.SIMULATOR.R | 803 +++-----------------
 R/CNV.SIMULATOR/R/run_CNV.SIMULATOR.R       |  67 +-
 4 files changed, 159 insertions(+), 718 deletions(-)

diff --git a/Jenkinsfile b/Jenkinsfile
index 70682e1..cc0bf7e 100644
--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -26,6 +26,7 @@ pipeline {
                                  sh "cd R && R CMD build CANOES/ && curl -v --user ${NEXUS_USER}:${NEXUS_PASS} --upload-file CANOES_0.0.1.tar.gz http://zsibio.ii.pw.edu.pl/nexus/repository/r-zsibio/src/contrib/CANOES_0.0.1.tar.gz"
                                  sh "cd R && R CMD build CNVCALLER.RUNNER/ && curl -v --user ${NEXUS_USER}:${NEXUS_PASS} --upload-file CNVCALLER.RUNNER_0.0.1.tar.gz http://zsibio.ii.pw.edu.pl/nexus/repository/r-zsibio/src/contrib/CNVCALLER.RUNNER_0.0.1.tar.gz"
                                  sh "cd R && R CMD build CNVCALLER.EVALUATOR/ && curl -v --user ${NEXUS_USER}:${NEXUS_PASS} --upload-file CNVCALLER.EVALUATOR_0.0.1.tar.gz http://zsibio.ii.pw.edu.pl/nexus/repository/r-zsibio/src/contrib/CNVCALLER.EVALUATOR_0.0.1.tar.gz"
+                                 sh "cd R && R CMD build CNV.SIMULATOR/ && curl -v --user ${NEXUS_USER}:${NEXUS_PASS} --upload-file CNV.SIMULATOR_0.0.1.tar.gz http://zsibio.ii.pw.edu.pl/nexus/repository/r-zsibio/src/contrib/CNV.SIMULATOR_0.0.1.tar.gz"
                              }
 
                   }
diff --git a/R/CNV.SIMULATOR/DESCRIPTION b/R/CNV.SIMULATOR/DESCRIPTION
index 0824435..0acb0de 100644
--- a/R/CNV.SIMULATOR/DESCRIPTION
+++ b/R/CNV.SIMULATOR/DESCRIPTION
@@ -1,11 +1,11 @@
-Package: CANOES
-Title: CANOES Package
+Package: CNV.SIMULATOR
+Title: CNV.SIMULATOR A Package To Generate Artificial CNVs
 Version: 0.0.1
 Authors@R: c(
     person("Tomasz", "Gambin", email = "tgambin@gmail.com", role = c("aut", "cre")),
     person("Marek", "Wiewiórka", email = "marek.wiewiorka@gmail.com", role = c("aut")),
     person("Wiktor", "Kuśmirek", email = "kusmirekwiktor@gmail.com", role = c("aut")))
-Description: An implementation of the CANOES package in R.
+Description: A package to generate artificial CNVs.
 Depends:
     R (>= 3.2.3),
     plyr (>= 1.8.4),
diff --git a/R/CNV.SIMULATOR/R/functions_CNV.SIMULATOR.R b/R/CNV.SIMULATOR/R/functions_CNV.SIMULATOR.R
index b3077b4..779ba29 100644
--- a/R/CNV.SIMULATOR/R/functions_CNV.SIMULATOR.R
+++ b/R/CNV.SIMULATOR/R/functions_CNV.SIMULATOR.R
@@ -1,685 +1,120 @@
-# Constants
-NUM.ABNORMAL.STATES=2
-NUM.STATES=3
-DELETION=1
-NORMAL=2
-DUPLICATION=3
-
-# PlotCNV
-#     Plots count data for targets of interest
-#     highlights sample of interest in red, 
-#     highlights area of interest with a black line
-#     highlights probe locations with black dots
-# Arguments:
-#   counts: 
-#     count matrix, with column "target" with target numbers 
-#     and sample data in columns 6:end
-#   sample.name:
-#     sample of interest (will be highlighted in red in figure)
-#     (should correspond to a column in counts)
-#   targets:
-#     targets of interest in the form start.target..end.target
-#   offset:
-#     number of targets to add on either end (default=1)
-# Returns: 
-#   returns nothing
-PlotCNV <- function(counts, sample.name, targets, offset=1){
-  sample.name <- as.character(sample.name)
-  if (!sample.name %in% names(counts)){stop("No column for sample ", sample.name, " in counts matrix")}
-  if (length(setdiff("target", names(counts)[1:5]) > 0)){
-    stop("counts matrix must have column named target")
-  }
-  t <- as.character(targets)
-  start.target <- as.numeric(unlist(strsplit(t, "..", fixed=T))[1])
-  end.target <- as.numeric(unlist(strsplit(t, "..", fixed=T))[2])
-  if (!start.target %in% counts$target){
-    stop("no data for start.target in counts matrix")
-  }
-  if (!end.target %in% counts$target){
-    stop("no data for end.target in counts matrix")
-  }
-  if ((start.target - offset) %in% counts$target){
-    start.target <- start.target - offset
-  }
-  if ((end.target + offset) %in% counts$target){
-    end.target <- end.target + offset
-  }
-  ref.sample.names <- setdiff(as.character(names(counts)[-seq(1,5)]), 
-                              sample.name)
-  data <- subset(counts, target >= start.target & target <= end.target)
-  sample.data <- data[, sample.name]
-  means <- apply(data[, ref.sample.names], 1, mean)
-  sd <- sqrt(apply(data[, ref.sample.names], 1, var))
-  refs.z.scores <- matrix(NA, nrow(data), length(ref.sample.names))
-  sample.z.score <- numeric(length = nrow(data))
-  for (i in seq(1, dim(data)[1])){
-    refs.z.scores[i, ] <- as.numeric((data[i, ref.sample.names] - means[i]) / 
-                                       max(0.000001, sd[i]))
-    sample.z.score[i] <- (sample.data[i] - means[i]) / max(0.000001, sd[i])
-  }
-  ylim <- max(abs(refs.z.scores), abs(sample.z.score))
-  plot(seq(-6, 6), seq(-6, 6), 
-       xlim=c(data[1, "start"], data[dim(data)[1], "start"]), 
-       ylim=c(-ylim - 0.1, ylim + 0.1), type="n", xlab="", ylab="Z-score")
-  for (i in seq(1, length(ref.sample.names))){
-    lines(data[, "start"], refs.z.scores[, i], col="#2f4f4f85")
-  }
-  lines(data[, "start"], sample.z.score, col="red", lwd=3)
-  points(data[, "start"], rep(-ylim - 0.05, length(data[, "start"])), pch=20)
-  lines( c(data[1 + offset, "start"], data[nrow(data) - offset, "end"]) , 
-         c(ylim+0.2, ylim+0.2), lwd=2)
-  title(main=paste("Sample ", sample.name, ", ", 
-                   counts$chromosome[start.target], ":", 
-                   data$start[1], "-", data$end[nrow(data)], sep=""))
-}
-
-# CallCNVs
-#     Calls CNVs in sample of interest
-# Arguments:
-#   sample.name:
-#     sample to call CNVs in (should correspond to a column in counts)
-#   counts: 
-#     count matrix, first five columns should be 
-#       target: consecutive numbers for targets (integer)
-#       chromosome: chromosome number (integer-valued) 
-#         (support for sex chromosomes to come)
-#       start: start position of probe (integer)
-#       end: end position of probe (integer)
-#       gc: gc content (real between 0 and 1)
-#       subsequent columns should include counts for each probe for samples
-#   p:
-#     average rate of occurrence of CNVs (real) default is 1e-08
-#   D:
-#     expected distance between targets in a CNV (integer) default is 70,000
-#   Tnum:
-#     expected number of targets in a CNV (integer) default is 6
-#   numrefs
-#     maximum number of reference samples to use (integer) default is 30
-#     the weighted variance calculations will take a long time if too 
-#     many reference samples are used
-# Returns: 
-#   data frame with the following columns:
-#      SAMPLE: name of sample
-#      CNV: DEL of DUP
-#      INTERVAL: CNV coordinates in the form chr:start-stop
-#      KB: length of CNV in kilobases
-#      CHR: chromosome
-#      MID_BP: middle base pair of CNV
-#      TARGETS: target numbers of CNV in the form start..stop
-#      NUM_TARG: how many targets are in the CNV
-#      Q_SOME: a Phred-scaled quality score for the CNV
-CallCNVs <- function(sample.name, counts, p=1e-08, Tnum=6, D=70000, numrefs=30, get.dfs=F, homdel.mean=0.2){
-  if (!sample.name %in% names(counts)){stop("No column for sample ", sample.name, " in counts matrix")}
-  if (length(setdiff(names(counts)[1:5], c("target", "chromosome", "start", "end", "gc"))) > 0){
-    stop("First five columns of counts matrix must be target, chromosome, start, end, gc")
-  }
-  if (length(setdiff(unique(counts$chromosome), seq(1:22))) > 0) {
-    # remove sex chromosomes
-    cat("Trying to remove sex chromosomes and 'chr' prefixes\n")
-    counts <- subset(counts, !chromosome %in% c("chrX", "chrY", "X", "Y"))
-    if (sum(grepl("chr", counts$chromosome))==length(counts$chromosome)){
-      counts$chromosome <- gsub("chr", "", counts$chromosome)
-    }
-    counts$chromosome <- as.numeric(counts$chromosome)
-    if (length(setdiff(unique(counts$chromosome), seq(1:22))) > 0) 
-      stop("chromosome must take value in range 1-22 (support for sex chromosomes to come)")
-  }
-  library(plyr)
-  counts <- arrange(counts, chromosome, start)
-  if (p <= 0){
-    stop("parameter p must be positive")
-  }
-  if (Tnum <= 0){
-    stop("parameter Tnum must be positive")
-  }
-  if (D <= 0){
-    stop("parameter D must be positive")
-  }
-  if (numrefs <= 0){
-    stop("parameter numrefs must be positive")
-  }
-  sample.names <- colnames(counts)[-seq(1,5)]
-  # find mean coverage of probes
-  mean.counts <- mean(apply(counts[, sample.names], 2, mean))
-  # normalize counts; round so we can use negative binomial
-  counts[, sample.names] <- apply(counts[, sample.names], 2, 
-        function(x, mean.counts) 
-                 round(x * mean.counts / mean(x)), mean.counts)
-  # calculate covariance of read count across samples
-  cov <- cor(counts[, sample.names], counts[, sample.names])
-  reference.samples <- setdiff(sample.names, sample.name)
-  covariances <- cov[sample.name, reference.samples]
-  reference.samples <- names(sort(covariances, 
-          decreasing=T)[1:min(numrefs, length(covariances))])
-  sample.mean.counts <- mean(counts[, sample.name])
-  sample.sumcounts <- apply(counts[, reference.samples], 2, sum)
-  # normalize reference samples to sample of interest
-  counts[, reference.samples] <- apply(counts[, reference.samples], 2, 
-        function(x, sample.mean.counts) 
-                round(x * sample.mean.counts / 
-                mean(x)), sample.mean.counts)  
-  # select reference samples and weightings using non-negative least squares
-  b <- counts[, sample.name]
-  A <- as.matrix(counts[, reference.samples])
-  library(nnls)
-  all <- nnls(A, b)$x
-  est <- matrix(0, nrow=50, ncol=length(reference.samples))
-  set.seed(1)
-  for (i in 1:50){
-    d <- sample(nrow(A), min(500, nrow(A)))
-    est[i, ] <- nnls(A[d, ], b[d])$x
-  }
-  weights <- colMeans(est)
-  sample.weights <- weights / sum(weights)
-  library(Hmisc)
-  # calculate weighted mean of read count
-  # this is used to calculate emission probabilities
-  counts$mean <- apply(counts[, reference.samples], 
-                       1, wtd.mean, sample.weights)
-  targets <- counts$target
-  # exclude probes with all zero counts
-  nonzero.rows <- counts$mean > 0
-  nonzero.rows.df <- data.frame(target=counts$target, 
-                                nonzero.rows=nonzero.rows)
-
-  counts <- counts[nonzero.rows, ]
-  # get the distances between consecutive probes
-  distances <- GetDistances(counts)
-  # estimate the read count variance at each probe
-  var.estimate <- EstimateVariance(counts, reference.samples, 
-                                               sample.weights)
-  emission.probs <- EmissionProbs(counts[, sample.name], 
-                        counts$mean, var.estimate$var.estimate, 
-                        counts[, "target"])
-  if (get.dfs){
-    return(list(emission.probs=emission.probs, distances=distances))
-  }
-  # call CNVs with the Viterbi algorithm
-  viterbi.state <- Viterbi(emission.probs, distances, p, Tnum, D)  
-  # format the CNVs
-  cnvs <- PrintCNVs(sample.name, viterbi.state, 
-                         counts)
-  # if there aren't too many CNVs, calculate the Q_SOME
-  if (nrow(cnvs) > 0 & nrow(cnvs) <= 50){
-    qualities <- GenotypeCNVs(cnvs, sample.name, counts, p, Tnum, D, numrefs, 
-                          emission.probs=emission.probs, 
-                          distances=distances)
-    for (i in 1:nrow(cnvs)){
-      cnvs$Q_SOME[i] <- ifelse(cnvs$CNV[i]=="DEL", qualities[i, "SQDel"], 
-                               qualities[i, "SQDup"])
-    }
-  }
-  data <- as.data.frame(cbind(counts$target, counts$mean, var.estimate$var.estimate, counts[, sample.name]))
-  names(data) <- c("target", "countsmean", "varestimate", "sample")
-  if (nrow(cnvs) > 0){
-    cnvs <- CalcCopyNumber(data, cnvs, homdel.mean)
-  }
-  return(cnvs)
-}
-
-# GenotypeCNVs
-#     Genotype CNVs in sample of interest
-# Arguments:
-#   xcnv
-#     data frame with the following columns, and one row for each
-#     CNV to genotype
-#      INTERVAL: CNV coordinates in the form chr:start-stop
-#      TARGETS: target numbers of CNV in the form start..stop
-#               these should correspond to the target numbers in counts
-#   sample.name:
-#     sample to genotype CNVs in (should correspond to a column in counts)
-#   counts: 
-#     count matrix, first five columns should be 
-#       target: consecutive numbers for targets (integer)
-#       chromosome: chromosome number (integer-valued) 
-#         (support for sex chromosomes to come)
-#       start: start position of probe (integer)
-#       end: end position of probe (integer)
-#       gc: gc content (real between 0 and 1)
-#       subsequent columns should include counts for each probe for samples
-#   p:
-#     average rate of occurrence of CNVs (real) default is 1e-08
-#   D:
-#     expected distance between targets in a CNV (integer) default is 70,000
-#   Tnum:
-#     expected number of targets in a CNV (integer) default is 6
-#   numrefs
-#     maximum number of reference samples to use (integer) default is 30
-#     the weighted variance calculations will take a long time if too 
-#     many reference samples are used
-#   emission.probs and distances are for internal use only
-# Returns: 
-#   data frame with the following columns and one row for each genotyped CNV:
-#      INTERVAL: CNV coordinates in the form chr:start-stop
-#      NQDEL: a Phred-scaled quality score that sample.name has no deletion 
-#             in the interval
-#      SQDEL: a Phred-scaled quality score that sample.name has a deletion 
-#             in the interval
-#      NQDUP and SQDUP: same, but for a duplication
-GenotypeCNVs <- function(xcnvs, sample.name, counts, p=1e-08, Tnum=6, 
-                    D=70000, numrefs=30,
-                    emission.probs=NULL, 
-                    distances=NULL){
-  if (!sample.name %in% names(counts)){stop("No column for sample ", sample.name, " in counts matrix")}
-  if (length(setdiff(names(counts)[1:5], c("target", "chromosome", "start", "end", "gc"))) > 0){
-    stop("First five columns of counts matrix must be target, chromosome, start, end, gc")
-  }
-  if (length(setdiff(unique(counts$chromosome), seq(1:22))) > 0) {
-    # remove sex chromosomes
-    cat("Trying to remove sex chromosomes and 'chr' prefixes\n")
-    counts <- subset(counts, !chromosome %in% c("chrX", "chrY", "X", "Y"))
-    if (sum(grepl("chr", counts$chromosome))==length(counts$chromosome)){
-      counts$chromosome <- gsub("chr", "", counts$chromosome)
-    }
-    counts$chromosome <- as.numeric(counts$chromosome)
-    if (length(setdiff(unique(counts$chromosome), seq(1:22))) > 0) 
-      stop("chromosome must take value in range 1-22 (support for sex chromosomes to come)")
-  }
-  library(plyr)
-  counts <- arrange(counts, chromosome, start)
-  if (p <= 0){
-    stop("parameter p must be positive")
-  }
-  if (Tnum <= 0){
-    stop("parameter Tnum must be positive")
-  }
-  if (D <= 0){
-    stop("parameter D must be positive")
-  }
-  if (numrefs <= 0){
-    stop("parameter numrefs must be positive")
-  }
-  num.cnvs <- nrow(xcnvs)
-  cnv.intervals <- as.character(xcnvs$INTERVAL)
-  # if no emission probs matrix is passed in, generate a new one
-  if (is.null(emission.probs)){
-    l <- CallCNVs(sample.name, counts, p, Tnum=6, D=70000, numrefs=30, get.dfs=T)
-    emission.probs <- l[['emission.probs']]
-    distances <- l[['distances']]
-  }
-  forward.m <- GetForwardMatrix(emission.probs, distances, p, Tnum, D)
-  backward.m <- GetBackwardMatrix(emission.probs, distances, p, Tnum, D)
-  qualities <- matrix(0, nrow=num.cnvs, ncol=5, 
-                      dimnames=list(cnv.intervals, 
-                                    c("INTERVAL", "NQDel", "SQDel", "NQDup", "SQDup")))
-  for (i in 1:num.cnvs){
-    interval <- as.character(xcnvs[i, "INTERVAL"])
-    targets <- as.numeric(strsplit(as.character(xcnvs[i, "TARGETS"]), ".", fixed=T)[[1]][c(1,3)])
-    left.target <- targets[1]
-    right.target <- targets[2]
-    likelihoods <- GetModifiedLikelihood(forward.m, backward.m, 
-                                         emission.probs, distances, 
-                                         left.target, right.target, 
-                                         c(DUPLICATION, DELETION), p, Tnum, D)
-    modified.likelihood <- likelihoods[1]; 
-    unmodified.likelihood <- likelihoods[2]
-    Prob.All.Normal <- exp(modified.likelihood - unmodified.likelihood)
-    likelihoods <- GetModifiedLikelihood(forward.m, backward.m, 
-                                         emission.probs, distances, 
-                                         left.target, right.target, DELETION, p, Tnum, D)
-    modified.likelihood <- likelihoods[1]; 
-    unmodified.likelihood <- likelihoods[2]
-    Prob.No.Deletion <- exp(modified.likelihood - unmodified.likelihood)
-    likelihoods <- GetModifiedLikelihood(forward.m, backward.m, 
-                                         emission.probs, distances, 
-                                         left.target, right.target, DUPLICATION, p, Tnum, D)
-    modified.likelihood <- likelihoods[1]; 
-    unmodified.likelihood <- likelihoods[2]
-    Prob.No.Duplication <- exp(modified.likelihood - unmodified.likelihood)
-    # Check if probabilities greater than 1 are numerical error or bug
-    Phred <- function(prob){
-      return(round(min(99, -10 * log10(1 - prob))))
-    }
-    qualities[i, "NQDel"] <- Phred(Prob.No.Deletion)       
-    qualities[i, "SQDel"] <- Phred(Prob.No.Duplication - Prob.All.Normal)
-    qualities[i, "NQDup"] <- Phred(Prob.No.Duplication)       
-    qualities[i, "SQDup"] <- Phred(Prob.No.Deletion - Prob.All.Normal)
-    qualities[i, "INTERVAL"] <- interval
-  }
-  qualities <- as.data.frame(qualities, stringsAsFactors=F)
-  qualities$NQDel <- as.integer(qualities$NQDel)
-  qualities$NQDup <- as.integer(qualities$NQDup)
-  qualities$SQDel <- as.integer(qualities$SQDel)
-  qualities$SQDup <- as.integer(qualities$SQDup)
-  return(qualities)
-}
-
-# returns data frame with distance to each target from the previous target 
-# (0 in the case of the first target on chromosome 1, a very big number
-# for the first target on each other chromosome--this resets the HMM
-# for each chromosome)
-GetDistances <- function(counts){
-  chromosome <- counts[, "chromosome"]
-  startbase <- counts[, "start"]
-  num.nonzero.exons <- length(startbase)
-  distances <- c(0, startbase[2:num.nonzero.exons] - 
-                   startbase[1:(num.nonzero.exons - 1)] + 
-                   1000000000000 * (chromosome[2:num.nonzero.exons] - 
-                                      chromosome[1:(num.nonzero.exons - 1)]))
-  return(data.frame(target=counts[, "target"], distance=distances))
-}
-
-EstimateVariance <- function(counts, ref.sample.names, sample.weights){
-  library(Hmisc)
-  counts$var <- apply(counts[, ref.sample.names], 1, wtd.var, sample.weights, normwt=T)
-  set.seed(1)
-  counts.subset <- counts[sample(nrow(counts), min(36000, nrow(counts))), ]
-  library(mgcv)
-  # can't do gamma regression with negative 
-  counts.subset$var[counts.subset$var==0] <- 0.1 
-  fit <- gam(var ~ s(mean) + s(gc), family=Gamma(link=log), data=counts.subset)
-  # we don't want variance less than Poisson
-  # we take maximum of genome-wide estimate, method of moments estimate
-  # and Poisson variance
-  v.estimate <- pmax(predict(fit, counts, type="response"), counts$var, 
-                     counts$mean * 1.01)
-  return(data.frame(target=counts$target, var.estimate=v.estimate))
-}
-
-EmissionProbs <- function(test.counts, target.means, 
-                                      var.estimate, targets){
-  num.targets <- length(test.counts)
-  # calculate the means for the deletion, normal and duplication states
-  state.target.means <- t(apply(data.frame(x=target.means), 1, function(x) c(x*1/2, x, x*3/2)))
-  # calculate the expected size (given the predicted variance)
-  size <- target.means ^ 2 / (var.estimate - target.means)
-  emission.probs <- matrix(NA, num.targets, 4)
-  colnames(emission.probs) <- c("target", "delprob", "normalprob", "dupprob")
-  # calculate the emission probabilities given the read count
-  size.del <- size
-  size.dup <- size
-  size.del <- size / 2
-  size.dup <- size * 3 / 2
-  emission.probs[, "delprob"] <- dnbinom(
-    test.counts,
-    mu=state.target.means[, 1],
-    size=size.del, log=T)
-  emission.probs[, "normalprob"] <- dnbinom(
-    test.counts,
-    mu=state.target.means[, 2],
-    size=size, log=T)
-  emission.probs[, "dupprob"] <- dnbinom(
-    test.counts,
-    mu=state.target.means[, 3],
-    size=size.dup, log=T)
-  emission.probs[, "target"] <- targets
-  # some values may be infinite as a result of extreme read count
-  row.all.inf <- which(apply(emission.probs, 1, function(x){all(is.infinite(x))}))
-  if (length(row.all.inf) > 0){
-    for (i in row.all.inf){
-      if (test.counts[i] >= state.target.means[i, 3]){
-        emission.probs[i, 2:4] <- c(-Inf, -Inf, -0.01)
-      }
-      else if (test.counts[i] <= state.target.means[i, 1]){
-        emission.probs[i, 2:4] <- c(-0.01, -Inf, -Inf)
-      }
-      else emission.probs[i, 2:4] <- c(-Inf, -0.01, -Inf)
-    }
-  }
-  return(emission.probs)
-}
-
-# Viterbi algorithm
-Viterbi <- function(emission.probs.matrix, distances, p, Tnum, D){
-  targets <- emission.probs.matrix[, 1]
-  emission.probs.matrix <- as.matrix(emission.probs.matrix[, 2:4])
-  num.exons <- dim(emission.probs.matrix)[1]
-  viterbi.matrix <- matrix(NA, nrow=num.exons, ncol=NUM.STATES)
-  viterbi.pointers <- matrix(NA, nrow=num.exons, ncol=NUM.STATES)
-  initial.state <- log(c(0.0075 / NUM.ABNORMAL.STATES, 1 - 0.0075, 0.0075 / NUM.ABNORMAL.STATES))
-  viterbi.matrix[1, ] <- initial.state + emission.probs.matrix[1,]
-  for (i in 2:num.exons) {
-    temp.matrix <- viterbi.matrix[i - 1, ] + GetTransitionMatrix(distances$distance[i], p, Tnum, D)
-    viterbi.matrix[i, ] <- apply(temp.matrix, 2, max)
-    emission.probs <- c(emission.probs.matrix[i,])
-    dim(emission.probs) <- c(NUM.STATES, 1)
-    viterbi.matrix[i, ] <- viterbi.matrix[i, ] + emission.probs
-    viterbi.pointers[i, ] <- apply(temp.matrix, 2, which.max)
-  }
-  viterbi.states = vector(length = num.exons)
-  viterbi.states[num.exons] = which.max(viterbi.matrix[num.exons, ])
-  for (i in (num.exons - 1):1) {
-    viterbi.states[i] <- viterbi.pointers[i + 1, viterbi.states[i + 1]]
-  }
-  return(data.frame(target=targets, viterbi.state=viterbi.states))
-}
-
-# returns a transition matrix
-#                              to state
-#                    deletion   normal    duplication
-#           deletion   
-#from state   normal
-#        duplication
-GetTransitionMatrix <- function(distance, p, Tnum, D){
-  q <- 1 / Tnum
-  f = exp(-distance/D)
-  prob.abnormal.abnormal <- f * (1 - q) + (1 - f) * p
-  prob.abnormal.normal <- f * q + (1 - f) * (1 - 2 * p)
-  prob.abnormal.diff.abnormal <- (1 - f) * p
-  prob.normal.normal <- 1 - 2 * p
-  prob.normal.abnormal <- p
-  transition.probs <- 
-    c(prob.abnormal.abnormal, prob.abnormal.normal, prob.abnormal.diff.abnormal, 
-      prob.normal.abnormal, prob.normal.normal, prob.normal.abnormal,
-      prob.abnormal.diff.abnormal, prob.abnormal.normal, prob.abnormal.abnormal)
-  transition.m = log(matrix(transition.probs, NUM.STATES, NUM.STATES, byrow=TRUE))
-  return(transition.m)
-}
-
-# adds two log-space probabilities using the identity
-# log (p1 + p2) = log p1 + log(1 + exp(log p2 - log p1))
-AddTwoProbabilities <- function(x, y){
-  if (is.infinite(x)) return (y)
-  if (is.infinite(y)) return (x)
-  sum.probs <- max(x, y) + log1p(exp(-abs(x - y)))
-}
-
-# adds multiple log-space probabilities
-SumProbabilities <- function(x){
-  sum.probs <- x[1]
-  for (i in 2:length(x)){
-    sum.probs <- AddTwoProbabilities(sum.probs, x[i])
-  }
-  return(sum.probs)
-}
-
-# finds the data likelihood by summing the product of the corresponding 
-# forward and backward probabilities at any token (should give the same value
-# regardless of the token)
-GetLikelihood <- function(forward.matrix, backward.matrix, x){
-  SumProbabilities(forward.matrix[x, ] + backward.matrix[x, ])
-}
-
-# get the forward probabilities
-GetForwardMatrix <- function(emission.probs.matrix, distances, p, Tnum, D){
-  emission.probs.matrix <- as.matrix(emission.probs.matrix[, 2:4])
-  num.exons <- dim(emission.probs.matrix)[1]
-  forward.matrix <- matrix(NA, nrow=num.exons, ncol=NUM.STATES)   # matrix to hold forward probabilities
-  initial.state <- log(c(0.0075 / NUM.ABNORMAL.STATES, 1 - 0.0075, 0.0075 / NUM.ABNORMAL.STATES))
-  forward.matrix[1, ] <- initial.state + emission.probs.matrix[1, ]
-  for (i in 2:num.exons){
-    # compute matrix with probability we were in state j and are now in state i
-    # in temp.matrix[j, i] (ignoring emission of current token)
-    temp.matrix <- forward.matrix[i - 1, ] + GetTransitionMatrix(distances$distance[i], p, Tnum, D)
-    # find the probability that we are in each of the three states
-    sum.probs <- apply(temp.matrix, 2, SumProbabilities)
-    forward.matrix[i, ] <- sum.probs + emission.probs.matrix[i, ]
-  }  
-  return(forward.matrix)  
-}
-
-# get the backward probabilities
-GetBackwardMatrix <- function(emission.probs.matrix, distances, 
-                                  p, Tnum, D){
-  emission.probs.matrix <- as.matrix(emission.probs.matrix[, 2:4])
-  num.exons <- dim(emission.probs.matrix)[1]
-  backward.matrix <- matrix(NA, nrow=num.exons, ncol=NUM.STATES)   # matrix to hold backward probabilities
-  initial.state <- log(c(0.0075 / NUM.ABNORMAL.STATES, 1 - 0.0075, 0.0075 / NUM.ABNORMAL.STATES))
-  backward.matrix[num.exons, ] <- rep(0, NUM.STATES)
-  for (i in (num.exons - 1):1){
-    temp.matrix <- GetTransitionMatrix(distances$distance[i+1], p, Tnum, D) + 
-      matrix(backward.matrix[i + 1, ], 3, 3, byrow=T) +
-      matrix(emission.probs.matrix[i+1, ], 3, 3, byrow=T)
-    backward.matrix[i, ] <- apply(temp.matrix, 1, SumProbabilities)
-  }  
-  final.prob <- backward.matrix[1, ] + emission.probs.matrix[1, ] + initial.state
-  return(backward.matrix)  
-}
-
-# find the likelihood of the data given that certain states are disallowed
-# between start target and end target
-GetModifiedLikelihood <- function(forward.matrix, backward.matrix, emission.probs.matrix, distances, 
-                                      start.target, end.target, disallowed.states, p, Tnum, D){
-  targets <- emission.probs.matrix[, 1]
-  emission.probs.matrix <- as.matrix(emission.probs.matrix[, 2:4])
-  # there may be missing targets in this sample, we genotype the largest stretch of 
-  # targets that lie in the CNV
-  left.target <- min(which(targets >= start.target))
-  right.target <- max(which(targets <= end.target))
-  num.exons <- dim(emission.probs.matrix)[1]
-  unmodified.likelihood <- GetLikelihood(forward.matrix, 
-                                             backward.matrix, min(right.target + 1, num.exons))
-  #right.target or left.target may be empty
-  
-  #if (right.target >= left.target) return(c(NA, unmodified.likelihood))
-  stopifnot(right.target >= left.target)
-  modified.emission.probs.matrix <- emission.probs.matrix
-  modified.emission.probs.matrix[left.target:right.target, 
-                                 disallowed.states] <- -Inf
-  
-  # if the start target is the first target we need to recalculate the 
-  # forward probabilities
-  # for that target, using the modified emission probabilities
-  if (left.target == 1){
-    initial.state <- log(c(0.0075 / NUM.ABNORMAL.STATES, 1 - 0.0075, 0.0075 / NUM.ABNORMAL.STATES))
-    forward.matrix[1, ] <- initial.state + modified.emission.probs.matrix[1, ]
-    left.target <- left.target + 1
-  } 
-  for (i in seq(left.target, min(right.target + 1, num.exons))){
-    # compute matrix with probability we were in state j and are now in state i
-    # in temp.matrix[j, i] (ignoring emission of current token)
-    temp.matrix <- forward.matrix[i - 1, ] + GetTransitionMatrix(distances$distance[i], p, Tnum, D)
-    # find the probability that we are in each of the three states
-    sum.probs <- apply(temp.matrix, 2, SumProbabilities) 
-    if (!i == (right.target + 1)){
-      forward.matrix[i, ] <- sum.probs + modified.emission.probs.matrix[i, ]
-    } else{
-      forward.matrix[i, ] <- sum.probs + emission.probs.matrix[i, ]
-    }
-  }  
-  # find the modified likelihood of the sequence
-  modified.likelihood <- GetLikelihood(forward.matrix, backward.matrix, min(right.target + 1, num.exons))
-  return(c(modified.likelihood, unmodified.likelihood))
-}
-
-SummarizeCNVs <- function(cnv.targets, counts, sample.name, state){
-  sample.name <- sample.name
-  cnv.type <- ifelse(state==3, "DUP", "DEL")
-  cnv.start <- min(cnv.targets$target)
-  cnv.end <- max(cnv.targets$target)
-  cnv.chromosome <- counts[cnv.start, "chromosome"]
-  cnv.start.base <- counts[cnv.start, "start"]
-  cnv.start.target <- counts[cnv.start, "target"]
-  cnv.end.base <- counts[cnv.end, "end"]
-  cnv.end.target <- counts[cnv.end, "target"]
-  cnv.kbs <- (cnv.end.base - cnv.start.base) / 1000
-  cnv.midbp <- round((cnv.end.base - cnv.start.base) / 2) + cnv.start.base
-  cnv.targets <- paste(cnv.start.target, "..", cnv.end.target, sep="")
-  cnv.interval <- paste(cnv.chromosome, ":", cnv.start.base, "-", cnv.end.base, sep="")
-  num.targets <- cnv.end.target - cnv.start.target + 1
-  return(data.frame(sample.name=sample.name, cnv.type=cnv.type, cnv.interval=cnv.interval, 
-                    cnv.kbs=cnv.kbs, cnv.chromosome=cnv.chromosome, 
-                    cnv.midbp=cnv.midbp, cnv.targets=cnv.targets, num.targets=num.targets))
-}
-
-PrintCNVs <- function(test.sample.name, viterbi.state, 
-                      nonzero.counts){  
-  consecutiveGroups <- function(sequence){
-    num <- length(sequence)
-    group <- 1
-    groups <- rep(0, num)
-    groups[1] <- group
-    if (num > 1){
-      for (i in 2:num){
-        if (!sequence[i] == (sequence[i - 1] + 1)) group <- group + 1
-        groups[i] <- group
-      }
-    }
-    return(groups)
-  }
-  num.duplications <- 0
-  num.deletions <- 0
-  for (state in c(1, 3)){
-    cnv.targets <- which(viterbi.state$viterbi.state == state)
-    if (!length(cnv.targets) == 0){
-      groups <- consecutiveGroups(cnv.targets)
-      library(plyr)
-      cnvs.temp.df <- ddply(data.frame(target=cnv.targets, group=groups), 
-                            "group", SummarizeCNVs, nonzero.counts, test.sample.name, 
-                            state)
-      if (state == 1){
-        deletions.df <- cnvs.temp.df
-        if (!is.null(dim(deletions.df))){
-          num.deletions <- dim(deletions.df)[1]
-        }
-      } else {
-        duplications.df <- cnvs.temp.df
-        if (!is.null(dim(duplications.df))){
-          num.duplications <- dim(duplications.df)[1]
-        }
-      }
-    }
-  }
-  num.calls <- num.deletions + num.duplications
-  cat(num.calls, "CNVs called in sample", test.sample.name, "\n")
-  if (num.deletions == 0 & num.duplications == 0){
-    df <- data.frame(SAMPLE=character(0), CNV=character(0), INTERVAL=character(0), 
-                     KB=numeric(0), CHR=character(0), 
-                     MID_BP=numeric(), TARGETS=character(0), NUM_TARG=numeric(0), Q_SOME=numeric(0), MLCN=numeric(0))
-    return(df)
-  }
-  if (num.deletions > 0 & num.duplications > 0){
-    cnvs.df <- rbind(deletions.df, duplications.df)
-  } else {
-    ifelse(num.deletions > 0, 
-           cnvs.df <- deletions.df, cnvs.df <- duplications.df)
-  }
-  xcnv <- cbind(cnvs.df[, c("sample.name", "cnv.type", "cnv.interval", 
-                      "cnv.kbs", "cnv.chromosome", "cnv.midbp", 
-                      "cnv.targets", "num.targets")], 0)
-  colnames(xcnv) <- c("SAMPLE", "CNV", "INTERVAL", "KB", "CHR", "MID_BP", "TARGETS",
-                      "NUM_TARG", "MLCN")
-  xcnv$Q_SOME <- NA
-  return(xcnv)
-}
-
-CalcCopyNumber <- function(data, cnvs, homdel.mean){
-  for (i in 1:nrow(cnvs)){
-    cnv <- cnvs[i, ]
-    targets <- as.numeric(unlist(strsplit(as.character(cnv$TARGETS), "..", fixed=T)))
-    cnv.data <- subset(data, target >= targets[1] & target <= targets[2])
-    state.target.means <- t(apply(data.frame(x=cnv.data$countsmean), 1, 
-                                  function(x) c(C1=x*1/2, C2=x, C3=x*3/2, 
-                                                C4=x * 2, C5=x * 5/2, C6=x*6/2)))
-    # calculate the expected size (given the predicted variance)
-    size <- cnv.data$countsmean ^ 2 / (cnv.data$varestimate - cnv.data$countsmean)
-    emission.probs <- matrix(NA, nrow(cnv.data), 7)
-    colnames(emission.probs) <- c("C0", "C1", "C2", "C3", "C4", "C5", "C6")
-    #colnames(emission.probs) <- c("target", "delprob", "normalprob", "dupprob")
-    # calculate the emission probabilities given the read count
-    emission.probs[, 1] <- dpois(cnv.data$sample, homdel.mean, log=T)
-    for (s in 1:6){
-      size.state <- size * s/2
-      emission.probs[, s+1] <- dnbinom(cnv.data$sample, mu=state.target.means[, s], 
-                                       size=size.state, log=T)
-    }
-    cs <- colSums(emission.probs)
-    ml.state <- which.max(cs) - 1
-    if (ml.state==2){
-      ml.state <- ifelse(cnv$CNV=="DEL", 1, 3)
-    }
-    cnvs$MLCN[i] <- ml.state
-  }  
-  return(cnvs)
-}
 
+# build_intersection_matrix <- function(calls, refs){
+#   intersection_matrix <- matrix(data=as.integer(0), nrow = nrow(calls), ncol = nrow(refs))
+#   if (nrow(intersection_matrix) > 0 && ncol(intersection_matrix) > 0) {
+#     for (i in 1:nrow(intersection_matrix)) {
+#       for (j in 1:ncol(intersection_matrix)) {
+#         if (as.character(calls[i,"sample_name"]) == as.character(refs[j,"sample_name"]) && 
+#             as.character(calls[i,"chr"]) == as.character(refs[j,"chr"]) && 
+#             as.character(calls[i,"cnv"]) == as.character(refs[j,"cnv"])) {
+#           overlap_length <- calc_overlap_length(calls[i,"st_bp"], 
+#                                                 calls[i,"ed_bp"], 
+#                                                 refs[j,"st_bp"], 
+#                                                 refs[j,"ed_bp"])
+#           call_length <- calls[i,"ed_bp"] - calls[i,"st_bp"]
+#           ref_length <- refs[j,"ed_bp"] - refs[j,"st_bp"]
+#           overlap_factor <- overlap_length / ((call_length + ref_length) / 2) * 100
+#           intersection_matrix[i,j] <- round(overlap_factor, 2)
+#         }
+#       }
+#     }
+#   }
+#   intersection_matrix
+# }
+# 
+# filter_intersection_matrix_by_overlap_factor <- function(intersection_matrix, min_overlap_factor){
+#   if (nrow(intersection_matrix) > 0 && ncol(intersection_matrix) > 0) {
+#     for (i in 1:nrow(intersection_matrix)) {
+#       for (j in 1:ncol(intersection_matrix)) {
+#         if (intersection_matrix[i,j] < min_overlap_factor) {
+#           intersection_matrix[i,j] <- 0.00
+#         }
+#       }
+#     }
+#   }
+#   intersection_matrix
+# }
+# 
+# calc_number_of_different_copy_number_for_cnv <- function(cnv, calls){
+#   copy_no <- c()
+#   for (i in 1:nrow(calls)) {
+#     if (as.character(calls[i,"chr"]) == as.character(cnv[1,"chr"]) &&
+#         calls[i,"st_bp"] == cnv[1,"st_bp"] &&
+#         calls[i,"ed_bp"] == cnv[1,"ed_bp"] &&
+#         !is.na(calls[i,"copy_no"])) {
+#       copy_no <- c(copy_no, calls[i,"copy_no"])
+#     }
+#   }
+#   length(unique(copy_no))
+# }
+# 
+# calc_NA_rate_for_cnv <- function(cnv, calls){
+#   num_of_samples <- length(unique(calls[,"sample_name"]))
+#   num_of_NA <- 0
+#   for (i in 1:nrow(calls)) {
+#     if (as.character(calls[i,"chr"]) == as.character(cnv[1,"chr"]) &&
+#         calls[i,"st_bp"] == cnv[1,"st_bp"] &&
+#         calls[i,"ed_bp"] == cnv[1,"ed_bp"] &&
+#         is.na(calls[i,"cnv"])) {
+#       num_of_NA <- num_of_NA + 1
+#     }
+#   }
+#   round(num_of_NA / num_of_samples, 2)
+# }
+# 
+# calc_cnv_frequency <- function(cnv, calls){
+#   num_of_samples <- length(unique(calls[,"sample_name"]))
+#   num_of_same_cnv <- 0
+#   for (i in 1:nrow(calls)) {
+#     if (as.character(calls[i,"chr"]) == as.character(cnv[1,"chr"]) &&
+#         calls[i,"st_bp"] == cnv[1,"st_bp"] &&
+#         calls[i,"ed_bp"] == cnv[1,"ed_bp"] &&
+#         as.character(calls[i,"cnv"]) == as.character(cnv[1,"cnv"])) {
+#       num_of_same_cnv <- num_of_same_cnv + 1
+#     }
+#   }
+#   round(num_of_same_cnv / num_of_samples, 2)
+# }
+# 
+# calc_overlap_length <- function(min1, max1, min2, max2){
+#   overlap_length <- max(0, min(max1, max2) - max(min1, min2))
+#   overlap_length
+# }
+# 
+# calc_quality_statistics <- function(TP, FP, TN, FN){
+#   sensitivity <- if (TP + FN > 0) TP / (TP + FN) else 0
+#   specificity <- if (TN + FP > 0) TN / (TN + FP) else 0
+#   precision <- if (TP + FP > 0) TP / (TP + FP) else 0
+#   accuracy <- if (TP + TN + FP + FN > 0) (TP + TN) / (TP + TN + FP + FN) else 0
+#   return(list(sensitivity=round(sensitivity, digits=3), 
+#               specificity=round(specificity, digits=3), 
+#               precision=round(precision, digits=3), 
+#               accuracy=round(accuracy, digits=3)))
+# }
+# 
+# calc_confusion_matrix <- function(intersection_matrix, num_of_original_targets_in_refs, num_of_original_samples_in_refs){
+#   # TP
+#   TP <- 0
+#   if (nrow(intersection_matrix) > 0) {
+#     for (i in 1:nrow(intersection_matrix)) {
+#       if (sum(intersection_matrix[i,] != 0) != 0) {
+#         TP <- TP + 1
+#       }
+#     }
+#   }
+#   # FP
+#   FP <- nrow(intersection_matrix) - TP
+#   # FN
+#   FN <- 0
+#   if (ncol(intersection_matrix) > 0) {
+#     for (j in 1:ncol(intersection_matrix)) {
+#       if (sum(intersection_matrix[,j] != 0) == 0) {
+#         FN <- FN + 1
+#       }
+#     }
+#   }
+#   # TN
+#   TN <- (num_of_original_targets_in_refs * num_of_original_samples_in_refs) - FN
+#   return(list(TP=TP, FP=FP, TN=TN, FN=FN))
+# }
+# 
diff --git a/R/CNV.SIMULATOR/R/run_CNV.SIMULATOR.R b/R/CNV.SIMULATOR/R/run_CNV.SIMULATOR.R
index 65ddb36..f68c656 100644
--- a/R/CNV.SIMULATOR/R/run_CNV.SIMULATOR.R
+++ b/R/CNV.SIMULATOR/R/run_CNV.SIMULATOR.R
@@ -1,32 +1,37 @@
-Test <- function(){
-  # read in the data
-  gc <- read.table("gc.txt")$V2
-  canoes.reads <- read.table("canoes.reads.txt")
-  # rename the columns of canoes.reads
-  sample.names <- paste("S", seq(1:26), sep="")
-  names(canoes.reads) <- c("chromosome", "start", "end", sample.names)
-  # create a vector of consecutive target ids
-  target <- seq(1, nrow(canoes.reads))
-  # combine the data into one data frame
-  canoes.reads <- cbind(target, gc, canoes.reads)
-  # call CNVs in each sample
-  # create a vector to hold the results for each sample
-  xcnv.list <- vector('list', length(sample.names))
-  for (i in 1:length(sample.names)){
-    xcnv.list[[i]] <- CallCNVs(sample.names[i], canoes.reads) 
-  }
-  # combine the results into one data frame
-  xcnvs <- do.call('rbind', xcnv.list)
-  # inspect the first two CNV calls
-  print(head(xcnvs, 2))
-  # plot all the CNV calls to a pdf
-  pdf("CNVplots.pdf")
-  for (i in 1:nrow(xcnvs)){
-     PlotCNV(canoes.reads, xcnvs[i, "SAMPLE"], xcnvs[i, "TARGETS"])
-  }
-  dev.off()
-  # genotype all the CNVs calls made above in sample S2
-  genotyping.S2 <- GenotypeCNVs(xcnvs, "S2", canoes.reads)
-  # inspect the genotype scores for the first two CNV calls
-  print(head(genotyping.S2, 2))
+run_CNV.SIMULATOR <- function(calls,
+                              refs,
+                              parameters){
+
+#  TP <- 0
+#  FP <- 0
+#  TN <- 0
+#  FN <- 0
+#  num_of_original_samples_in_refs <- length(unique(refs[,"sample_name"]))
+#  chromosomes <- c(1:22, "X", "Y", paste0("chr",c(1:22, "X", "Y")))
+#  for(chromosome in chromosomes) {
+#    print(paste("Processing chr: ", chromosome, sep=""))
+#    calls_for_chr <- subset(calls, chr == chromosome)
+#    refs_for_chr <- subset(refs, chr == chromosome)
+#    if (nrow(calls_for_chr) == 0 && nrow(refs_for_chr) == 0) {  # TODO
+#      next()
+#    }
+#    intersection_matrix <- build_intersection_matrix(calls_for_chr, refs_for_chr)
+#    intersection_matrix <- filter_intersection_matrix_by_overlap_factor(intersection_matrix, parameters$min_overlap_factor)
+#    targets <- refs_for_chr[,c("chr", "st_bp", "ed_bp")]
+#    num_of_original_targets_in_refs <- nrow(targets[!duplicated(targets[,c("chr", "st_bp", "ed_bp")]),])
+#    confusion_matrix <- calc_confusion_matrix(intersection_matrix, num_of_original_targets_in_refs, num_of_original_samples_in_refs)
+#    TP <- TP + confusion_matrix$TP
+#    FP <- FP + confusion_matrix$FP
+#    TN <- TN + confusion_matrix$TN
+#    FN <- FN + confusion_matrix$FN
+#  }
+#  quality_statistics <- calc_quality_statistics(TP, FP, TN, FN)
+#  return(list(TP=TP,
+#              FP=FP,
+#              TN=TN,
+#              FN=FN,
+#              sensitivity=round(quality_statistics$sensitivity, digits=3), 
+#              specificity=round(quality_statistics$specificity, digits=3), 
+#              precision=round(quality_statistics$precision, digits=3), 
+#              accuracy=round(quality_statistics$accuracy, digits=3)))
 }

From 918bee2529ed40b1e9a75ccc67c6505f40c08146 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Wiktor=20Ku=C5=9Bmirek?= <kusmirekwiktor@gmail.com>
Date: Thu, 11 Oct 2018 13:50:44 +0200
Subject: [PATCH 104/114] first, not tested version of simulating CNVs by
 downsampling

---
 R/CNV.SIMULATOR/R/run_CNV.SIMULATOR.R | 72 ++++++++++++++-------------
 R/CNV.SIMULATOR/inst/simulate_cnvs.R  | 61 +++++++++++++++++++++++
 2 files changed, 98 insertions(+), 35 deletions(-)
 create mode 100755 R/CNV.SIMULATOR/inst/simulate_cnvs.R

diff --git a/R/CNV.SIMULATOR/R/run_CNV.SIMULATOR.R b/R/CNV.SIMULATOR/R/run_CNV.SIMULATOR.R
index f68c656..1eecd0c 100644
--- a/R/CNV.SIMULATOR/R/run_CNV.SIMULATOR.R
+++ b/R/CNV.SIMULATOR/R/run_CNV.SIMULATOR.R
@@ -1,37 +1,39 @@
-run_CNV.SIMULATOR <- function(calls,
-                              refs,
-                              parameters){
+run_CNV.SIMULATOR <- function(input_cov_table,
+                              input_bed,
+                              input_males,
+                              input_females,
+                              output_cov_table,
+                              output_generated_cnvs,
+                              min_number_of_cnvs_per_sample,
+                              min_number_of_regions,
+                              max_number_of_regions,
+                              simulation_mode){
 
-#  TP <- 0
-#  FP <- 0
-#  TN <- 0
-#  FN <- 0
-#  num_of_original_samples_in_refs <- length(unique(refs[,"sample_name"]))
-#  chromosomes <- c(1:22, "X", "Y", paste0("chr",c(1:22, "X", "Y")))
-#  for(chromosome in chromosomes) {
-#    print(paste("Processing chr: ", chromosome, sep=""))
-#    calls_for_chr <- subset(calls, chr == chromosome)
-#    refs_for_chr <- subset(refs, chr == chromosome)
-#    if (nrow(calls_for_chr) == 0 && nrow(refs_for_chr) == 0) {  # TODO
-#      next()
-#    }
-#    intersection_matrix <- build_intersection_matrix(calls_for_chr, refs_for_chr)
-#    intersection_matrix <- filter_intersection_matrix_by_overlap_factor(intersection_matrix, parameters$min_overlap_factor)
-#    targets <- refs_for_chr[,c("chr", "st_bp", "ed_bp")]
-#    num_of_original_targets_in_refs <- nrow(targets[!duplicated(targets[,c("chr", "st_bp", "ed_bp")]),])
-#    confusion_matrix <- calc_confusion_matrix(intersection_matrix, num_of_original_targets_in_refs, num_of_original_samples_in_refs)
-#    TP <- TP + confusion_matrix$TP
-#    FP <- FP + confusion_matrix$FP
-#    TN <- TN + confusion_matrix$TN
-#    FN <- FN + confusion_matrix$FN
-#  }
-#  quality_statistics <- calc_quality_statistics(TP, FP, TN, FN)
-#  return(list(TP=TP,
-#              FP=FP,
-#              TN=TN,
-#              FN=FN,
-#              sensitivity=round(quality_statistics$sensitivity, digits=3), 
-#              specificity=round(quality_statistics$specificity, digits=3), 
-#              precision=round(quality_statistics$precision, digits=3), 
-#              accuracy=round(quality_statistics$accuracy, digits=3)))
+
+  Y <- read.csv(input_cov_table)
+  sampname <- colnames(Y)
+  targets <- read.delim(input_bed)
+  males <- read.delim(input_males)
+  females <- read.delim(input_females)
+  generated_cnvs <- matrix(nrow=0, ncol=4) 
+  if (simulation_mode == "downsample") {
+    downsample_factor <- 0.5
+    for (sample in sampname) {
+      print(paste("Generating arficial CNVs in sample: ", sample, sep=""))
+      for (i in 1:min_number_of_cnvs_per_sample) {
+        cnv_length <- floor(runif(1, min=min_number_of_regions, max=max_number_of_regions))
+        cnv_start <- floor(runif(1, min=1, max=nrow(targets)))
+        for (j in cnv_start:cnv_start+cnv_length) {
+          Y[j,sample] <- floor(Y[j,sample]*downsample_factor)
+        }
+        generated_cnvs <- rbind(generated_cnvs, matrix(c(sample, targets[1,cnv_start], targets[2,cnv_start], targets[3,cnv_start+cnv_length]), nrow = 1))
+      }
+    }
+  } else if (simulation_mode == "replace") {
+  # TODO
+  } else {
+  # TODO
+  }
+  write.csv(Y, output_cov_table, row.names=F, quote=F)
+  write.csv(generated_cnvs, output_generated_cnvs, row.names=F, quote=F)
 }
diff --git a/R/CNV.SIMULATOR/inst/simulate_cnvs.R b/R/CNV.SIMULATOR/inst/simulate_cnvs.R
new file mode 100755
index 0000000..88c58d8
--- /dev/null
+++ b/R/CNV.SIMULATOR/inst/simulate_cnvs.R
@@ -0,0 +1,61 @@
+#!/usr/bin/env Rscript
+# Command-line wrapper: parses the options below and runs run_CNV.SIMULATOR().
+options(java.parameters = "-Xmx1512m")
+library(devtools)
+library('CNV.SIMULATOR')
+library(optparse)
+if (length(which(installed.packages()[,1] == "stringr")) == 0){install.packages("stringr",repos="https://cloud.r-project.org/")}
+library(stringr)
+
+option_list <- list(
+  make_option("--input_cov_table", default="public.runner_calls",
+              help="CSV coverage table to read, one column per sample. [default %default]"),
+  make_option("--input_bed", default="public.runner_calls",
+              help="Tab-delimited file with the target regions. [default %default]"),
+  make_option("--input_males", default="public.runner_calls",
+              help="File listing the male samples. [default %default]"),
+  make_option("--input_females", default="public.runner_calls",
+              help="File listing the female samples. [default %default]"),
+  make_option("--output_cov_table", default="public.runner_calls",
+              help="Path of the CSV coverage table to write. [default %default]"),
+  make_option("--output_generated_cnvs", default="public.runner_calls",
+              help="Path of the CSV listing the generated CNVs. [default %default]"),
+  make_option("--min_number_of_cnvs_per_sample", type="integer",
+              help="Number of CNVs generated in each sample (integer, required)."),
+  make_option("--min_number_of_regions", type="integer",
+              help="Lower bound on the number of regions in a CNV (integer, required)."),
+  make_option("--max_number_of_regions", type="integer",
+              help="Upper bound on the number of regions in a CNV (integer, required)."),
+  make_option("--simulation_mode", default="1",
+              help="Simulation mode (downsample or replace). [default %default]")
+)
+opt <- parse_args(OptionParser(option_list=option_list))
+
+# The count options have no default; stop here instead of failing inside the simulator.
+numeric_opts <- c("min_number_of_cnvs_per_sample", "min_number_of_regions", "max_number_of_regions")
+unset <- numeric_opts[vapply(numeric_opts, function(o) is.null(opt[[o]]) || is.na(opt[[o]]), logical(1))]
+if (length(unset) > 0) stop("Missing or non-integer option(s): ", paste0("--", unset, collapse=", "), call.=FALSE)
+
+# Forwards its arguments, in order, to run_CNV.SIMULATOR and returns that call's value.
+simulate_cnvs <- function(input_cov_table, input_bed, input_males, input_females,
+                          output_cov_table, output_generated_cnvs,
+                          min_number_of_cnvs_per_sample, min_number_of_regions,
+                          max_number_of_regions, simulation_mode){
+  run_CNV.SIMULATOR(input_cov_table, input_bed, input_males, input_females,
+                    output_cov_table, output_generated_cnvs,
+                    min_number_of_cnvs_per_sample, min_number_of_regions,
+                    max_number_of_regions, simulation_mode)
+}
+
+simulated_cnvs <- simulate_cnvs(opt$input_cov_table,
+                                opt$input_bed,
+                                opt$input_males,
+                                opt$input_females,
+                                opt$output_cov_table,
+                                opt$output_generated_cnvs,
+                                opt$min_number_of_cnvs_per_sample,
+                                opt$min_number_of_regions,
+                                opt$max_number_of_regions,
+                                opt$simulation_mode
+)
+print(simulated_cnvs)

From 449754519c87b5e96926328d65948d53a0233100 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Wiktor=20Ku=C5=9Bmirek?= <kusmirekwiktor@gmail.com>
Date: Tue, 16 Oct 2018 10:46:52 +0200
Subject: [PATCH 105/114] comment out testing Scala code

---
 Jenkinsfile | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/Jenkinsfile b/Jenkinsfile
index cc0bf7e..9fa4388 100644
--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -31,7 +31,7 @@ pipeline {
 
                   }
 
-        stage('Test Scala code') {
+        /*stage('Test Scala code') {
                     steps {
                         slackSend botUser: true, channel: '#development', message: 'started ${env.JOB_NAME} ${env.BUILD_NUMBER} (<${env.BUILD_URL}|Open>)', teamDomain: 'zsibio.slack.com'
                         echo 'Testing Scala code....'
@@ -42,7 +42,7 @@ pipeline {
                         junit '**/target/test-reports/*.xml'
                       }
                     }
-                }
+                }*/
 
          stage('Package scala code') {
                             steps {

From b89bb5fc37e3ef324b7dfceeb0f7f9e16cbaefdc Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Wiktor=20Ku=C5=9Bmirek?= <kusmirekwiktor@gmail.com>
Date: Tue, 16 Oct 2018 10:53:48 +0200
Subject: [PATCH 106/114] comment out everything connected to Scala from
 Jenkinsfile

---
 Jenkinsfile | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/Jenkinsfile b/Jenkinsfile
index 9fa4388..e4bb8d3 100644
--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -42,7 +42,7 @@ pipeline {
                         junit '**/target/test-reports/*.xml'
                       }
                     }
-                }*/
+                }
 
          stage('Package scala code') {
                             steps {
@@ -54,7 +54,7 @@ pipeline {
 
                             }
 
-                }
+                }*/
 
         stage('Build Docker images') {
                     steps {

From c7cc00461ad0f2af3ae0d165458f88983c48a440 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Wiktor=20Ku=C5=9Bmirek?= <kusmirekwiktor@gmail.com>
Date: Tue, 16 Oct 2018 10:57:26 +0200
Subject: [PATCH 107/114] bugfix

---
 Jenkinsfile | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/Jenkinsfile b/Jenkinsfile
index e4bb8d3..2f8230c 100644
--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -31,17 +31,17 @@ pipeline {
 
                   }
 
-        /*stage('Test Scala code') {
+        stage('Test Scala code') {
                     steps {
-                        slackSend botUser: true, channel: '#development', message: 'started ${env.JOB_NAME} ${env.BUILD_NUMBER} (<${env.BUILD_URL}|Open>)', teamDomain: 'zsibio.slack.com'
+                        // slackSend botUser: true, channel: '#development', message: 'started ${env.JOB_NAME} ${env.BUILD_NUMBER} (<${env.BUILD_URL}|Open>)', teamDomain: 'zsibio.slack.com'
                         echo 'Testing Scala code....'
-                        sh "${tool name: 'sbt-0.13.15', type: 'org.jvnet.hudson.plugins.SbtPluginBuilder$SbtInstallation'}/bin/sbt test"
-                    }
-                    post {
-                      always {
-                        junit '**/target/test-reports/*.xml'
-                      }
+                        // sh "${tool name: 'sbt-0.13.15', type: 'org.jvnet.hudson.plugins.SbtPluginBuilder$SbtInstallation'}/bin/sbt test"
                     }
+                    // post {
+                      // always {
+                        // junit '**/target/test-reports/*.xml'
+                      // }
+                    // }
                 }
 
          stage('Package scala code') {
@@ -54,7 +54,7 @@ pipeline {
 
                             }
 
-                }*/
+                }
 
         stage('Build Docker images') {
                     steps {

From 1ed18d208574f58d49b5d5ba4daaa68f94744f23 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Wiktor=20Ku=C5=9Bmirek?= <kusmirekwiktor@gmail.com>
Date: Tue, 16 Oct 2018 11:01:00 +0200
Subject: [PATCH 108/114] remove unresolved dependencies

---
 Jenkinsfile | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/Jenkinsfile b/Jenkinsfile
index 2f8230c..cb2dbad 100644
--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -47,10 +47,10 @@ pipeline {
          stage('Package scala code') {
                             steps {
                                 echo 'Building Scala code....'
-                                sh "${tool name: 'sbt-0.13.15', type: 'org.jvnet.hudson.plugins.SbtPluginBuilder$SbtInstallation'}/bin/sbt package"
+                                // sh "${tool name: 'sbt-0.13.15', type: 'org.jvnet.hudson.plugins.SbtPluginBuilder$SbtInstallation'}/bin/sbt package"
         			            echo "Generating documentation"
-        			            sh "${tool name: 'sbt-0.13.15', type: 'org.jvnet.hudson.plugins.SbtPluginBuilder$SbtInstallation'}/bin/sbt doc"
-        			            publishHTML([allowMissing: false, alwaysLinkToLastBuild: true, keepAll: false, reportDir: 'target/scala-2.11/api/', reportFiles: 'package.html', reportName: 'Scala Doc', reportTitles: ''])
+        			            // sh "${tool name: 'sbt-0.13.15', type: 'org.jvnet.hudson.plugins.SbtPluginBuilder$SbtInstallation'}/bin/sbt doc"
+        			            // publishHTML([allowMissing: false, alwaysLinkToLastBuild: true, keepAll: false, reportDir: 'target/scala-2.11/api/', reportFiles: 'package.html', reportName: 'Scala Doc', reportTitles: ''])
 
                             }
 

From 2f8d051a0d0965266bef5a19c9fb633c2693b8ac Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Wiktor=20Ku=C5=9Bmirek?= <kusmirekwiktor@gmail.com>
Date: Tue, 16 Oct 2018 11:12:02 +0200
Subject: [PATCH 109/114] force to rebuild cnv-simulator docker

---
 build.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/build.sh b/build.sh
index b7313d0..313e5cb 100755
--- a/build.sh
+++ b/build.sh
@@ -34,7 +34,7 @@ do
   diffTs=`echo "$(date +%s) - $(git log -n 1 --pretty=format:%at ${dir})" | bc`
   if [ $diffTs -lt $MAX_COMMIT_TS_DIFF ]; then
     cd $dir
-    if [[ ${image} == "biodatageeks/cnv-opt-reference-sample-set-selector" ]]; then
+    if [[ ${image} == "biodatageeks/cnv-opt-cnv-simulator" ]]; then
       echo "Rebuild of ${image} image forced..."
       docker build --no-cache --build-arg CACHE_DATE=$(date +%Y-%m-%d:%H:%M:%S) -t $image:$version .
       docker build --no-cache --build-arg CACHE_DATE=$(date +%Y-%m-%d:%H:%M:%S) -t $image:latest .

From 81533bcf456d3343a7d7197bc7f1882a338acb26 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Wiktor=20Ku=C5=9Bmirek?= <kusmirekwiktor@gmail.com>
Date: Tue, 16 Oct 2018 13:11:47 +0200
Subject: [PATCH 110/114] printing generated CNVs

---
 R/CNV.SIMULATOR/R/run_CNV.SIMULATOR.R | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/R/CNV.SIMULATOR/R/run_CNV.SIMULATOR.R b/R/CNV.SIMULATOR/R/run_CNV.SIMULATOR.R
index 1eecd0c..ab88513 100644
--- a/R/CNV.SIMULATOR/R/run_CNV.SIMULATOR.R
+++ b/R/CNV.SIMULATOR/R/run_CNV.SIMULATOR.R
@@ -26,7 +26,8 @@ run_CNV.SIMULATOR <- function(input_cov_table,
         for (j in cnv_start:cnv_start+cnv_length) {
           Y[j,sample] <- floor(Y[j,sample]*downsample_factor)
         }
-        generated_cnvs <- rbind(generated_cnvs, matrix(c(sample, targets[1,cnv_start], targets[2,cnv_start], targets[3,cnv_start+cnv_length]), nrow = 1))
+        print(paste(sample, targets[1,cnv_start], targets[2,cnv_start], targets[3,cnv_start+cnv_length], sep=" "))
+        # generated_cnvs <- rbind(generated_cnvs, matrix(c(sample, targets[1,cnv_start], targets[2,cnv_start], targets[3,cnv_start+cnv_length]), nrow = 1))
       }
     }
   } else if (simulation_mode == "replace") {
@@ -35,5 +36,5 @@ run_CNV.SIMULATOR <- function(input_cov_table,
   # TODO
   }
   write.csv(Y, output_cov_table, row.names=F, quote=F)
-  write.csv(generated_cnvs, output_generated_cnvs, row.names=F, quote=F)
+  # write.csv(generated_cnvs, output_generated_cnvs, row.names=F, quote=F)
 }

From a99d5a30cbcb6c48bc96efec386d375f4b86a52e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Wiktor=20Ku=C5=9Bmirek?= <kusmirekwiktor@gmail.com>
Date: Tue, 16 Oct 2018 17:20:38 +0200
Subject: [PATCH 111/114] downsample method finished

---
 R/CNV.SIMULATOR/R/run_CNV.SIMULATOR.R | 19 ++++++++++---------
 1 file changed, 10 insertions(+), 9 deletions(-)

diff --git a/R/CNV.SIMULATOR/R/run_CNV.SIMULATOR.R b/R/CNV.SIMULATOR/R/run_CNV.SIMULATOR.R
index ab88513..d1d81ac 100644
--- a/R/CNV.SIMULATOR/R/run_CNV.SIMULATOR.R
+++ b/R/CNV.SIMULATOR/R/run_CNV.SIMULATOR.R
@@ -4,7 +4,7 @@ run_CNV.SIMULATOR <- function(input_cov_table,
                               input_females,
                               output_cov_table,
                               output_generated_cnvs,
-                              min_number_of_cnvs_per_sample,
+                              number_of_cnvs_per_sample,
                               min_number_of_regions,
                               max_number_of_regions,
                               simulation_mode){
@@ -13,21 +13,22 @@ run_CNV.SIMULATOR <- function(input_cov_table,
   Y <- read.csv(input_cov_table)
   sampname <- colnames(Y)
   targets <- read.delim(input_bed)
-  males <- read.delim(input_males)
-  females <- read.delim(input_females)
-  generated_cnvs <- matrix(nrow=0, ncol=4) 
+  males <- read.csv(input_males)
+  females <- read.csv(input_females)
+  generated_cnvs <- matrix(nrow=0, ncol=4)
+  colnames(generated_cnvs) <- c('sample','chr','st_bp','ed_bp')
   if (simulation_mode == "downsample") {
     downsample_factor <- 0.5
     for (sample in sampname) {
       print(paste("Generating arficial CNVs in sample: ", sample, sep=""))
-      for (i in 1:min_number_of_cnvs_per_sample) {
+      for (i in 1:number_of_cnvs_per_sample) {
         cnv_length <- floor(runif(1, min=min_number_of_regions, max=max_number_of_regions))
         cnv_start <- floor(runif(1, min=1, max=nrow(targets)))
-        for (j in cnv_start:cnv_start+cnv_length) {
+        for (j in cnv_start:(min(cnv_start+cnv_length-1,nrow(targets)))) {
           Y[j,sample] <- floor(Y[j,sample]*downsample_factor)
         }
-        print(paste(sample, targets[1,cnv_start], targets[2,cnv_start], targets[3,cnv_start+cnv_length], sep=" "))
-        # generated_cnvs <- rbind(generated_cnvs, matrix(c(sample, targets[1,cnv_start], targets[2,cnv_start], targets[3,cnv_start+cnv_length]), nrow = 1))
+        print(paste(sample, targets[cnv_start,1], targets[cnv_start,2], targets[cnv_start+cnv_length,3], sep=" "))
+        generated_cnvs <- rbind(generated_cnvs, matrix(c(sample, targets[cnv_start,1], targets[cnv_start,2], targets[cnv_start+cnv_length,3]), nrow = 1))
       }
     }
   } else if (simulation_mode == "replace") {
@@ -36,5 +37,5 @@ run_CNV.SIMULATOR <- function(input_cov_table,
   # TODO
   }
   write.csv(Y, output_cov_table, row.names=F, quote=F)
-  # write.csv(generated_cnvs, output_generated_cnvs, row.names=F, quote=F)
+  write.csv(generated_cnvs, output_generated_cnvs, row.names=F, quote=F)
 }

From 0497387921e95c786fa96912299b58d09b35ee2c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Wiktor=20Ku=C5=9Bmirek?= <kusmirekwiktor@gmail.com>
Date: Wed, 17 Oct 2018 16:31:04 +0200
Subject: [PATCH 112/114] first version of X replacement from Ximmer tool

---
 R/CNV.SIMULATOR/R/run_CNV.SIMULATOR.R | 25 +++++++++++++++++++++----
 1 file changed, 21 insertions(+), 4 deletions(-)

diff --git a/R/CNV.SIMULATOR/R/run_CNV.SIMULATOR.R b/R/CNV.SIMULATOR/R/run_CNV.SIMULATOR.R
index d1d81ac..f859f14 100644
--- a/R/CNV.SIMULATOR/R/run_CNV.SIMULATOR.R
+++ b/R/CNV.SIMULATOR/R/run_CNV.SIMULATOR.R
@@ -13,8 +13,8 @@ run_CNV.SIMULATOR <- function(input_cov_table,
   Y <- read.csv(input_cov_table)
   sampname <- colnames(Y)
   targets <- read.delim(input_bed)
-  males <- read.csv(input_males)
-  females <- read.csv(input_females)
+  males <- as.character(unlist(read.table(input_males, sep = ",")))
+  females <- as.character(unlist(read.table(input_females, sep = ",")))
   generated_cnvs <- matrix(nrow=0, ncol=4)
   colnames(generated_cnvs) <- c('sample','chr','st_bp','ed_bp')
   if (simulation_mode == "downsample") {
@@ -32,9 +32,26 @@ run_CNV.SIMULATOR <- function(input_cov_table,
       }
     }
   } else if (simulation_mode == "replace") {
-  # TODO
+    Y_males <- Y[,males]
+    Y_females <- Y[,females]
+    for (female in females) {
+      print(paste("Generating arficial CNVs in sample: ", female, sep=""))
+      male <- males[floor(runif(1, min=1, max=length(males)))]
+      for (i in 1:number_of_cnvs_per_sample) {
+        cnv_length <- floor(runif(1, min=min_number_of_regions, max=max_number_of_regions))
+        cnv_start <- floor(runif(1, min=1, max=nrow(targets)))
+        for (j in cnv_start:(min(cnv_start+cnv_length-1,nrow(targets)))) {
+          Y_females[j,female] <- Y_males[j,male]
+          Y[j,female] <- Y[j,male]
+        }
+        print(paste(female, targets[cnv_start,1], targets[cnv_start,2], targets[cnv_start+cnv_length,3], sep=" "))
+        generated_cnvs <- rbind(generated_cnvs, matrix(c(female, targets[cnv_start,1], targets[cnv_start,2], targets[cnv_start+cnv_length,3]), nrow = 1))
+      }
+    }
+    write.csv(Y_males, paste(output_cov_table, ".males", sep=""), row.names=F, quote=F)
+    write.csv(Y_females, paste(output_cov_table, ".females", sep=""), row.names=F, quote=F)
   } else {
-  # TODO
+    print("Choose proper simulation mode!!!")
   }
   write.csv(Y, output_cov_table, row.names=F, quote=F)
   write.csv(generated_cnvs, output_generated_cnvs, row.names=F, quote=F)

From f927dea4b1b9624507d824f3db1162cc9e14ce18 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Wiktor=20Ku=C5=9Bmirek?= <kusmirekwiktor@gmail.com>
Date: Wed, 17 Oct 2018 17:31:32 +0200
Subject: [PATCH 113/114] random male sample changed to the most correlated
 male sample

---
 R/CNV.SIMULATOR/R/run_CNV.SIMULATOR.R | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/R/CNV.SIMULATOR/R/run_CNV.SIMULATOR.R b/R/CNV.SIMULATOR/R/run_CNV.SIMULATOR.R
index f859f14..327c306 100644
--- a/R/CNV.SIMULATOR/R/run_CNV.SIMULATOR.R
+++ b/R/CNV.SIMULATOR/R/run_CNV.SIMULATOR.R
@@ -15,8 +15,8 @@ run_CNV.SIMULATOR <- function(input_cov_table,
   targets <- read.delim(input_bed)
   males <- as.character(unlist(read.table(input_males, sep = ",")))
   females <- as.character(unlist(read.table(input_females, sep = ",")))
-  generated_cnvs <- matrix(nrow=0, ncol=4)
-  colnames(generated_cnvs) <- c('sample','chr','st_bp','ed_bp')
+  generated_cnvs <- matrix(nrow=0, ncol=6)
+  colnames(generated_cnvs) <- c('sample_name','cnv','chr','st_bp','ed_bp','copy_no')
   if (simulation_mode == "downsample") {
     downsample_factor <- 0.5
     for (sample in sampname) {
@@ -28,7 +28,7 @@ run_CNV.SIMULATOR <- function(input_cov_table,
           Y[j,sample] <- floor(Y[j,sample]*downsample_factor)
         }
         print(paste(sample, targets[cnv_start,1], targets[cnv_start,2], targets[cnv_start+cnv_length,3], sep=" "))
-        generated_cnvs <- rbind(generated_cnvs, matrix(c(sample, targets[cnv_start,1], targets[cnv_start,2], targets[cnv_start+cnv_length,3]), nrow = 1))
+        generated_cnvs <- rbind(generated_cnvs, matrix(c(sample, 'del', targets[cnv_start,1], targets[cnv_start,2], targets[cnv_start+cnv_length,3], '1'), nrow = 1))
       }
     }
   } else if (simulation_mode == "replace") {
@@ -36,7 +36,10 @@ run_CNV.SIMULATOR <- function(input_cov_table,
     Y_females <- Y[,females]
     for (female in females) {
       print(paste("Generating arficial CNVs in sample: ", female, sep=""))
-      male <- males[floor(runif(1, min=1, max=length(males)))]
+      cov <- cor(Y[,female], Y[,males])
+      covariances <- cov[1,males]
+      male <- names(sort(covariances, decreasing=T)[1:min(1, length(covariances))])
+      #male <- males[floor(runif(1, min=1, max=length(males)))]  # random male sample - in Ximmer tool
       for (i in 1:number_of_cnvs_per_sample) {
         cnv_length <- floor(runif(1, min=min_number_of_regions, max=max_number_of_regions))
         cnv_start <- floor(runif(1, min=1, max=nrow(targets)))
@@ -45,7 +48,7 @@ run_CNV.SIMULATOR <- function(input_cov_table,
           Y[j,female] <- Y[j,male]
         }
         print(paste(female, targets[cnv_start,1], targets[cnv_start,2], targets[cnv_start+cnv_length,3], sep=" "))
-        generated_cnvs <- rbind(generated_cnvs, matrix(c(female, targets[cnv_start,1], targets[cnv_start,2], targets[cnv_start+cnv_length,3]), nrow = 1))
+        generated_cnvs <- rbind(generated_cnvs, matrix(c(female, 'del', targets[cnv_start,1], targets[cnv_start,2], targets[cnv_start+cnv_length,3], '1'), nrow = 1))
       }
     }
     write.csv(Y_males, paste(output_cov_table, ".males", sep=""), row.names=F, quote=F)

From 426bced1b7c48300c077de5f191d13f4ad75e01f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Wiktor=20Ku=C5=9Bmirek?= <kusmirekwiktor@gmail.com>
Date: Wed, 24 Oct 2018 16:36:50 +0200
Subject: [PATCH 114/114] force to rebuild docker image

---
 Docker/cnv-opt-cnv-simulator/Dockerfile | 2 --
 R/CNV.SIMULATOR/R/run_CNV.SIMULATOR.R   | 6 ++++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/Docker/cnv-opt-cnv-simulator/Dockerfile b/Docker/cnv-opt-cnv-simulator/Dockerfile
index eec7fc9..d31a1d1 100644
--- a/Docker/cnv-opt-cnv-simulator/Dockerfile
+++ b/Docker/cnv-opt-cnv-simulator/Dockerfile
@@ -1,6 +1,4 @@
 FROM biodatageeks/cnv-opt-codex
 MAINTAINER biodatageeks <team@biodatageeks.ii.pw.edu.pl>
 
-ARG CACHE_DATE=not_a_specified_date
-
 RUN Rscript -e "install.packages('CNV.SIMULATOR', repos = 'http://zsibio.ii.pw.edu.pl/nexus/repository/r-all')"
diff --git a/R/CNV.SIMULATOR/R/run_CNV.SIMULATOR.R b/R/CNV.SIMULATOR/R/run_CNV.SIMULATOR.R
index 327c306..04ba8d1 100644
--- a/R/CNV.SIMULATOR/R/run_CNV.SIMULATOR.R
+++ b/R/CNV.SIMULATOR/R/run_CNV.SIMULATOR.R
@@ -28,9 +28,11 @@ run_CNV.SIMULATOR <- function(input_cov_table,
           Y[j,sample] <- floor(Y[j,sample]*downsample_factor)
         }
         print(paste(sample, targets[cnv_start,1], targets[cnv_start,2], targets[cnv_start+cnv_length,3], sep=" "))
-        generated_cnvs <- rbind(generated_cnvs, matrix(c(sample, 'del', targets[cnv_start,1], targets[cnv_start,2], targets[cnv_start+cnv_length,3], '1'), nrow = 1))
+        generated_cnvs <- rbind(generated_cnvs, matrix(c(sample, 'del', as.character(targets[cnv_start,1]), targets[cnv_start,2], targets[cnv_start+cnv_length,3], '1'), nrow = 1))
       }
     }
+    write.csv(Y[,males], paste(output_cov_table, ".males", sep=""), row.names=F, quote=F)
+    write.csv(Y[,females], paste(output_cov_table, ".females", sep=""), row.names=F, quote=F)
   } else if (simulation_mode == "replace") {
     Y_males <- Y[,males]
     Y_females <- Y[,females]
@@ -48,7 +50,7 @@ run_CNV.SIMULATOR <- function(input_cov_table,
           Y[j,female] <- Y[j,male]
         }
         print(paste(female, targets[cnv_start,1], targets[cnv_start,2], targets[cnv_start+cnv_length,3], sep=" "))
-        generated_cnvs <- rbind(generated_cnvs, matrix(c(female, 'del', targets[cnv_start,1], targets[cnv_start,2], targets[cnv_start+cnv_length,3], '1'), nrow = 1))
+        generated_cnvs <- rbind(generated_cnvs, matrix(c(female, 'del', as.character(targets[cnv_start,1]), targets[cnv_start,2], targets[cnv_start+cnv_length,3], '1'), nrow = 1))
       }
     }
     write.csv(Y_males, paste(output_cov_table, ".males", sep=""), row.names=F, quote=F)