|
| 1 | +#!/usr/bin/env python3 |
| 2 | +# -*- coding: utf-8 -*- |
| 3 | + |
| 4 | +""" |
| 5 | + Copyright (C) 2021, icgc-argo |
| 6 | +
|
| 7 | + This program is free software: you can redistribute it and/or modify |
| 8 | + it under the terms of the GNU Affero General Public License as published by |
| 9 | + the Free Software Foundation, either version 3 of the License, or |
| 10 | + (at your option) any later version. |
| 11 | +
|
| 12 | + This program is distributed in the hope that it will be useful, |
| 13 | + but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 14 | + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
| 15 | + GNU Affero General Public License for more details. |
| 16 | +
|
| 17 | + You should have received a copy of the GNU Affero General Public License |
| 18 | + along with this program. If not, see <http://www.gnu.org/licenses/>. |
| 19 | +
|
| 20 | + Authors: |
| 21 | + Junjun Zhang |
| 22 | + Linda Xiang |
| 23 | +""" |
| 24 | + |
| 25 | +import sys |
| 26 | +import subprocess |
| 27 | +import argparse |
| 28 | +from multiprocessing import cpu_count |
| 29 | +import json |
| 30 | +import os |
| 31 | + |
def run_cmd(cmd):
    """Run ``cmd`` through the shell, echo its output, and exit on failure.

    The command's captured stdout is printed to this process's stdout and its
    captured stderr to this process's stderr once the command has finished.

    Args:
        cmd: Shell command line to execute; it is handed to the shell as-is,
            so it may contain pipes and redirections.

    Returns:
        A ``(stdout, stderr)`` tuple holding the raw bytes captured from the
        command.

    Raises:
        SystemExit: with the command's return code when that is non-zero, or
            with 1 when the command could not be started or awaited.
    """
    # Start from empty *bytes* so the decode calls below still work when the
    # process could not be launched and nothing was captured.
    stdout, stderr, proc, success = b'', b'', None, True
    try:
        # With shell=True the command is passed as a single string rather
        # than wrapped in a one-element list.
        proc = subprocess.Popen(cmd,
                                stdout=subprocess.PIPE,
                                stderr=subprocess.PIPE,
                                shell=True)
        stdout, stderr = proc.communicate()
    except Exception as e:
        print('Execution failed: %s' % e, file=sys.stderr)
        success = False

    # proc stays None when Popen itself raised; there is no return code then.
    returncode = proc.returncode if proc is not None else None

    if proc is not None and returncode != 0:
        print('Execution failed, none zero code returned.', file=sys.stderr)
        success = False

    # errors="replace" keeps undecodable tool output from raising here.
    print(stdout.decode("utf-8", errors="replace"))
    print(stderr.decode("utf-8", errors="replace"), file=sys.stderr)

    if not success:
        # Fall back to 1 when no (or a zero) return code is available.
        sys.exit(returncode if returncode else 1)

    return stdout, stderr
| 55 | + |
def main():
    """Merge input BAMs, optionally marking duplicates, into one BAM or CRAM.

    Command-line options are read with ``argparse``. A shell pipeline of the
    form ``<merge> | <convert>`` is built and executed through ``run_cmd``:

    * merge step: ``bammarkduplicates2`` when ``--mdup`` is given, otherwise
      ``samtools merge``; both write to ``/dev/stdout``.
    * convert step: ``samtools view`` for BAM or CRAM output, or
      ``cramtools cram`` when ``--lossy`` is given with CRAM output, in which
      case a separate ``samtools index`` command is run afterwards.

    If ``<output_base>.duplicates_metrics.txt`` exists afterwards, the
    biobambam2 version is written to
    ``<output_base>.duplicates_metrics.extra_info.json`` and all
    ``<output_base>.duplicates_metrics.*`` files are packed into
    ``<output_base>.duplicates_metrics.tgz``.

    Raises:
        SystemExit: if the temporary directory does not exist, if argument
            parsing fails, or (via ``run_cmd``) if a command fails.
    """
    # Local import keeps this function self-contained with respect to the
    # module-level import block.
    import shlex

    parser = argparse.ArgumentParser(description='Merge and markdup')
    parser.add_argument('-i', '--input-bams', dest='input_bams',
                        type=str, help='Input bam file', nargs='+', required=True)
    parser.add_argument('-b', '--output-base', dest='output_base',
                        type=str, help='Output merged file basename', required=True)
    parser.add_argument('-r', '--reference', dest='reference',
                        type=str, help='reference fasta', required=True)
    parser.add_argument('-t', '--tempdir', dest='tempdir', type=str, default=".",
                        help='Specify directory for temporary files')
    parser.add_argument("-n", "--cpus", dest='cpus', type=int, default=cpu_count())
    parser.add_argument("-d", "--mdup", dest='mdup', action='store_true')
    parser.add_argument("-l", "--lossy", dest='lossy', action='store_true')
    parser.add_argument("-o", "--output-format", dest='output_format',
                        default='cram', choices=['bam', 'cram'])

    args = parser.parse_args()

    if not os.path.isdir(args.tempdir):
        sys.exit('Error: specified tempdir %s does not exist!' % args.tempdir)

    # Every user-supplied path is shell-quoted before being interpolated, so
    # names containing spaces or shell metacharacters reach the tools intact.
    quote = shlex.quote
    base = args.output_base
    metrics_file = base + ".duplicates_metrics.txt"
    cram_file = base + ".cram"
    bam_file = base + ".bam"

    if args.mdup:
        inputs = ' '.join('I=' + quote(bam) for bam in args.input_bams)
        merge = 'bammarkduplicates2 markthreads=%s tmpfile=%s level=0 O=/dev/stdout M=%s %s ' % (
            args.cpus, quote(args.tempdir + '/tmp'), quote(metrics_file), inputs)
    else:
        inputs = ' '.join(quote(bam) for bam in args.input_bams)
        merge = 'samtools merge --no-PG -uf -@ %s /dev/stdout %s ' % (args.cpus, inputs)

    # Build only the commands the requested output format needs. The shell
    # reports the status of the last command in a pipeline, so that is the
    # status run_cmd sees for "merge | convert".
    cmd = []
    if args.output_format == 'bam':
        bam = 'samtools view -b -h -@ %s --write-index /dev/stdin -o %s ' % (
            args.cpus, quote(bam_file + "##idx##" + bam_file + ".bai"))
        cmd.append('|'.join([merge, bam]))

    elif args.output_format == 'cram':
        if args.lossy:
            # '\\*8' produces a literal backslash-asterisk so the shell does
            # not glob-expand the quality score spec.
            cram = ('java -jar /tools/cramtools.jar cram -R %s --capture-all-tags '
                    '--lossy-quality-score-spec \\*8 --preserve-read-names -O %s'
                    % (quote(args.reference), quote(cram_file)))
        else:
            cram = 'samtools view -C -T %s -@ %s --write-index /dev/stdin -o %s ' % (
                quote(args.reference), args.cpus, quote(cram_file))
        cmd.append('|'.join([merge, cram]))

        if args.lossy:
            # The cramtools command line carries no index option, so the CRAM
            # is indexed in a separate samtools step.
            cmd.append('samtools index -@ %s %s %s ' % (
                args.cpus, quote(cram_file), quote(cram_file + ".crai")))
    else:
        # Defensive only: argparse 'choices' already restricts the format.
        sys.exit("Unsupported sequence format!")

    for c in cmd:
        run_cmd(c)

    # A relative path is resolved against the current working directory.
    if os.path.isfile(metrics_file):
        stdout, _ = run_cmd('bammarkduplicates2 -v 2>&1 | grep "biobambam2 version"')
        # Take the last space-separated word of the matched output and drop
        # surrounding whitespace and any trailing period.
        version = stdout.decode("utf-8").split(' ')[-1].strip().rstrip('.')
        with open("%s.duplicates_metrics.extra_info.json" % base, "w") as j:
            j.write(json.dumps({"tool": "biobambam2:bammarkduplicates2@%s" % version}, indent=2))

        # The glob suffix stays outside the quotes so the shell expands it.
        tgz = 'tar czf %s %s.duplicates_metrics.*' % (
            quote(base + ".duplicates_metrics.tgz"), quote(base))
        run_cmd(tgz)
| 115 | + |
# Run the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()
| 118 | + |
0 commit comments