icgc-argo-workflows
diff --git a/‎.gitignore‎
Lines changed: 1 addition & 0 deletions b/‎.gitignore‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎bam-merge-sort-markdup/.dockerignore‎
Lines changed: 5 additions & 0 deletions b/‎bam-merge-sort-markdup/.dockerignore‎
Lines changed: 5 additions & 0 deletions
diff --git a/‎bam-merge-sort-markdup/Dockerfile‎
Lines changed: 11 additions & 0 deletions b/‎bam-merge-sort-markdup/Dockerfile‎
Lines changed: 11 additions & 0 deletions
diff --git a/‎bam-merge-sort-markdup/main.nf‎
Lines changed: 100 additions & 0 deletions b/‎bam-merge-sort-markdup/main.nf‎
Lines changed: 100 additions & 0 deletions
diff --git a/‎bam-merge-sort-markdup/main.py‎
Lines changed: 118 additions & 0 deletions b/‎bam-merge-sort-markdup/main.py‎
Lines changed: 118 additions & 0 deletions
diff --git a/‎bam-merge-sort-markdup/nextflow.config‎
Lines changed: 4 additions & 0 deletions b/‎bam-merge-sort-markdup/nextflow.config‎
Lines changed: 4 additions & 0 deletions
diff --git a/‎bam-merge-sort-markdup/pkg.json‎
Lines changed: 42 additions & 0 deletions b/‎bam-merge-sort-markdup/pkg.json‎
Lines changed: 42 additions & 0 deletions
@@ -67,3 +67,4 @@ docs/_build
 .nextflow*
 work
 outdir
+output
@@ -0,0 +1,5 @@
+.gitignore
+.nextflow*
+tests
+work
+outdir
@@ -0,0 +1,11 @@
+# Image for the bam-merge-sort-markdup tool: the shared base image plus this
+# package's Python entry script(s).
+FROM quay.io/icgc-argo/dna-seq-processing-tools:base-docker.0.2.1
+
+# key="value" form; the whitespace-separated `LABEL key value` form is legacy syntax
+LABEL org.opencontainers.image.source="https://github.com/icgc-argo/dna-seq-processing-tools"
+
+# /tools goes first on PATH so the copied scripts can be invoked by name (e.g. main.py)
+ENV PATH="/tools:${PATH}"
+
+COPY *.py /tools/
+
+# With no arguments the container runs `/usr/bin/env /bin/bash`; a command given
+# at run time replaces CMD and is executed through env.
+ENTRYPOINT ["/usr/bin/env"]
+
+CMD ["/bin/bash"]
@@ -0,0 +1,100 @@
+#!/usr/bin/env nextflow
+
+/*
+  Copyright (C) 2021,  icgc-argo
+
+  This program is free software: you can redistribute it and/or modify
+  it under the terms of the GNU Affero General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU Affero General Public License for more details.
+
+  You should have received a copy of the GNU Affero General Public License
+  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+  Authors:
+    Junjun Zhang
+    Linda Xiang
+*/
+
+/********************************************************************/
+/* this block is auto-generated based on info from pkg.json where   */
+/* changes can be made if needed, do NOT modify this block manually */
+nextflow.enable.dsl = 2
+version = '0.2.0'  // package version
+
+container = [
+    'ghcr.io': 'ghcr.io/icgc-argo/dna-seq-processing-tools.bam-merge-sort-markdup'
+]
+default_container_registry = 'ghcr.io'
+/********************************************************************/
+
+
+// universal params go here
+params.container_registry = ""
+params.container_version = ""
+params.container = ""
+
+params.cpus = 1
+params.mem = 1  // GB
+params.publish_dir = ""  // set to empty string will disable publishDir
+
+
+// tool-specific params go here; add / change as needed
+params.aligned_lane_bams = ""
+params.ref_genome_gz = ""
+params.aligned_basename = "grch38-aligned.merged"
+params.markdup = true
+params.output_format = "cram"
+params.lossy = false
+params.tempdir = "NO_DIR"
+
+include { getSecondaryFiles } from './wfpr_modules/github.com/icgc-argo/data-processing-utility-tools/helper-functions@1.0.1/main'
+
+// Runs main.py in the tool container to merge the lane-level aligned BAMs
+// (marking duplicates when params.markdup is set) and emits the merged
+// BAM/CRAM, its index and, when one is produced, the duplicates-metrics tarball.
+process bamMergeSortMarkdup {
+  container "${params.container ?: container[params.container_registry ?: default_container_registry]}:${params.container_version ?: version}"
+  // publishDir's `enabled` option takes a boolean, so the path string is converted
+  // explicitly: an empty params.publish_dir disables publishing, any other value enables it
+  publishDir "${params.publish_dir}/${task.process.replaceAll(':', '_')}", mode: "copy", enabled: params.publish_dir ? true : false
+
+  cpus params.cpus
+  memory "${params.mem} GB"
+
+  input:
+    path aligned_lane_bams              // one or more lane-level BAMs to merge
+    path ref_genome_gz                  // reference passed to main.py via -r
+    path ref_genome_gz_secondary_file   // staged next to the reference; not referenced in the script
+    val tempdir                         // 'NO_DIR' means "do not pass -t to main.py"
+
+  output:
+    path "${params.aligned_basename}.{bam,cram}", emit: merged_seq
+    path "${params.aligned_basename}.{bam.bai,cram.crai}", emit: merged_seq_idx
+    path "${params.aligned_basename}.duplicates_metrics.tgz", optional: true, emit: duplicates_metrics
+
+  script:
+    // optional flags collapse to an empty string when the feature is off
+    arg_markdup = params.markdup ? "-d" : ""
+    arg_lossy = params.lossy ? "-l" : ""
+    arg_tempdir = tempdir != 'NO_DIR' ? "-t ${tempdir}" : ""
+    """
+    main.py \
+      -i ${aligned_lane_bams} \
+      -r ${ref_genome_gz} \
+      -n ${params.cpus} \
+      -b ${params.aligned_basename} ${arg_markdup} \
+      -o ${params.output_format} ${arg_lossy} ${arg_tempdir}
+    """
+}
+
+
+// Entry point for this main script, so it can be run directly without cloning the repo, e.g.:
+//   nextflow run <git_acc>/<repo>/<pkg_name>/<main_script>.nf -r <pkg_name>.v<pkg_version> --params-file xxx
+workflow {
+  // all lane-level BAMs gathered into a single list emission
+  lane_bams_ch = Channel.fromPath(params.aligned_lane_bams, checkIfExists: true).collect()
+
+  // the reference plus its secondary files ('fai', 'gzi') resolved by getSecondaryFiles
+  ref_genome = file(params.ref_genome_gz)
+  ref_secondary_ch = Channel.fromPath(
+    getSecondaryFiles(params.ref_genome_gz, ['fai', 'gzi']), checkIfExists: true
+  ).collect()
+
+  bamMergeSortMarkdup(lane_bams_ch, ref_genome, ref_secondary_ch, params.tempdir)
+}
@@ -0,0 +1,118 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+"""
+  Copyright (C) 2021,  icgc-argo
+
+  This program is free software: you can redistribute it and/or modify
+  it under the terms of the GNU Affero General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU Affero General Public License for more details.
+
+  You should have received a copy of the GNU Affero General Public License
+  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+  Authors:
+    Junjun Zhang
+    Linda Xiang
+"""
+
+import sys
+import subprocess
+import argparse
+from multiprocessing import cpu_count
+import json
+import os
+
+def run_cmd(cmd):
+  """Run `cmd` through the shell, echo its output, and exit on failure.
+
+  The command's captured stdout is printed to stdout and its captured stderr
+  to stderr after the command has finished.
+
+  Args:
+    cmd: shell command line as a string (pipes and redirections are allowed).
+
+  Returns:
+    Tuple (stdout, stderr) with the command's output as bytes.
+
+  Raises:
+    SystemExit: if the command could not be started or returned a non-zero
+      code. The exit status is the command's return code, or 1 when no
+      return code is available.
+  """
+  # bytes defaults so the .decode() calls below also work when Popen itself fails
+  stdout, stderr, p, success = b'', b'', None, True
+  try:
+    p = subprocess.Popen([cmd],
+                          stdout=subprocess.PIPE,
+                          stderr=subprocess.PIPE,
+                          shell=True)
+    stdout, stderr = p.communicate()
+  except Exception as e:
+    print('Execution failed: %s' % e, file=sys.stderr)
+    success = False
+
+  if p and p.returncode != 0:
+    print('Execution failed, none zero code returned.', file=sys.stderr)
+    success = False
+
+  print(stdout.decode("utf-8"))
+  print(stderr.decode("utf-8"), file=sys.stderr)
+
+  if not success:
+    # p is None when the process never started; fall back to a generic status 1
+    sys.exit(p.returncode if p is not None and p.returncode else 1)
+
+  return stdout, stderr
+
+def main():
+    """Merge lane-level BAMs (optionally marking duplicates) and write BAM or CRAM.
+
+    Parses the command-line options, assembles a shell pipeline of the form
+    `<merge or markdup> | <bam or cram writer>`, and runs it with run_cmd().
+    When a duplicates-metrics file was written, it is bundled together with a
+    small JSON file naming the tool version into
+    `<output_base>.duplicates_metrics.tgz`.
+
+    Exits via sys.exit() when the tempdir does not exist or a command fails.
+    """
+    parser = argparse.ArgumentParser(description='Merge and markdup')
+    parser.add_argument('-i','--input-bams', dest='input_bams',
+                        type=str, help='Input bam file', nargs='+', required=True)
+    parser.add_argument('-b','--output-base', dest='output_base',
+                        type=str, help='Output merged file basename', required=True)
+    parser.add_argument('-r', '--reference', dest='reference',
+                        type=str, help='reference fasta', required=True)
+    parser.add_argument('-t', '--tempdir', dest='tempdir', type=str, default=".",
+                        help='Specify directory for temporary files')
+    parser.add_argument("-n", "--cpus", dest='cpus', type=int, default=cpu_count())
+    parser.add_argument("-d", "--mdup", dest='mdup', action='store_true')
+    parser.add_argument("-l", "--lossy", dest='lossy', action='store_true')
+    parser.add_argument("-o", "--output-format", dest='output_format', default='cram', choices=['bam', 'cram'])
+
+    args = parser.parse_args()
+
+    cmd = []
+
+    if not os.path.isdir(args.tempdir):
+        sys.exit('Error: specified tempdir %s does not exist!' % args.tempdir)
+
+    # first pipeline stage: writes the merged stream to /dev/stdout
+    if args.mdup:
+        # each input is passed as its own I=<file> argument; metrics go to the M= file
+        merge = 'bammarkduplicates2 markthreads=%s tmpfile=%s/tmp level=0 O=/dev/stdout M=%s I=%s ' % \
+                (str(args.cpus), args.tempdir, args.output_base + ".duplicates_metrics.txt", ' I='.join(args.input_bams))
+    else:
+        merge = 'samtools merge --no-PG -uf -@ %s /dev/stdout %s ' % (str(args.cpus), ' '.join(args.input_bams))
+
+    # second pipeline stage: CRAM writer (lossy via cramtools, otherwise samtools)
+    if args.lossy:
+        # '\\*8' yields a literal backslash-star, so the shell hands `*8` to cramtools unexpanded
+        cram = 'java -jar /tools/cramtools.jar cram -R %s --capture-all-tags --lossy-quality-score-spec \\*8 --preserve-read-names -O %s' % (args.reference, args.output_base + ".cram")
+    else:
+        cram = 'samtools view -C -T %s -@ %s --write-index /dev/stdin -o %s ' % (args.reference, args.cpus, args.output_base + ".cram")
+
+    bam = 'samtools view -b -h -@ %s --write-index /dev/stdin -o %s##idx##%s ' % (args.cpus, args.output_base + ".bam", args.output_base + ".bam.bai")
+    crai1 = 'samtools index -@ %s %s %s ' % (args.cpus, args.output_base + ".cram", args.output_base + ".cram.crai")
+
+    # build command
+    if args.output_format == 'bam':
+        # --lossy has no effect on BAM output
+        cmd.append('|'.join([merge, bam]))
+
+    elif args.output_format == 'cram':
+        cmd.append('|'.join([merge, cram]))
+        # the cramtools command carries no --write-index, so the index is built in a separate step
+        if args.lossy: cmd.append(crai1)
+    else:
+        sys.exit("Unsupported sequence format!")
+
+    for c in cmd:
+        run_cmd(c)
+
+    # the metrics file only exists when the markdup stage ran
+    if os.path.isfile(os.path.join(os.getcwd(), args.output_base + ".duplicates_metrics.txt")):
+        stdout, _ = run_cmd('bammarkduplicates2  -v 2>&1  | grep "biobambam2 version"')
+        # version is taken as the last space-separated token, minus a trailing period
+        version = stdout.decode("utf-8").split(' ')[-1].strip().rstrip('.')
+        with open("%s.duplicates_metrics.extra_info.json" % args.output_base, "w") as j:
+            j.write(json.dumps({  "tool": "biobambam2:bammarkduplicates2@%s" % version }, indent=2))
+
+        # the glob picks up both the metrics .txt and the extra_info .json
+        tgz = 'tar czf %s.duplicates_metrics.tgz %s.duplicates_metrics.*' % (args.output_base, args.output_base)
+        run_cmd(tgz)
+
+
+if __name__ == "__main__":
+    main()
+
@@ -0,0 +1,4 @@
+// Docker settings for this package: processes run in containers.
+docker {
+    enabled = true
+    // `\$` keeps a literal `$`, so `$(id -u):$(id -g)` is passed through in the docker
+    // options; presumably the shell launching docker substitutes the caller's uid:gid
+    // so output files are not root-owned - TODO confirm
+    runOptions = '-u \$(id -u):\$(id -g)'
+}
@@ -0,0 +1,42 @@
+{
+    "name": "bam-merge-sort-markdup",
+    "version": "0.2.0",
+    "description": "Merge multiple lane-level aligned BAMs, Mark duplicated reads and Sort reads by genomic coordinates",
+    "main": "main.nf",
+    "deprecated": false,
+    "keywords": [
+        "seq",
+        "merge",
+        "markduplicate",
+        "sort"
+    ],
+    "repository": {
+        "type": "git",
+        "url": "https://github.com/icgc-argo/dna-seq-processing-tools.git"
+    },
+    "container": {
+        "registries": [
+            {
+                "registry": "ghcr.io",
+                "type": "docker",
+                "org": "icgc-argo",
+                "default": true
+            }
+        ]
+    },
+    "dependencies": [
+        "github.com/icgc-argo/data-processing-utility-tools/helper-functions@1.0.1"
+    ],
+    "devDependencies": [],
+    "contributors": [
+        {
+          "name": "Junjun Zhang"
+        },
+        {
+          "name": "Linda Xiang"
+        }
+    ],
+    "license": "GNU Affero General Public License v3",
+    "bugReport": "https://github.com/icgc-argo/dna-seq-processing-tools/issues",
+    "homepage": "https://github.com/icgc-argo/dna-seq-processing-tools#readme"
+}
-Original file line number
+Diff line change
 .nextflow*
 work
 outdir
 +output