Skip to content

Commit 96698e9

Browse files
authored
Merge pull request #124 from icgc-argo/bam-merge-sort-markdup@0.2.0
[release]
2 parents ab6282b + db7a54c commit 96698e9

40 files changed

Lines changed: 11250 additions & 0 deletions

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -67,3 +67,4 @@ docs/_build
6767
.nextflow*
6868
work
6969
outdir
70+
output
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
.gitignore
2+
.nextflow*
3+
tests
4+
work
5+
outdir

bam-merge-sort-markdup/Dockerfile

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
FROM quay.io/icgc-argo/dna-seq-processing-tools:base-docker.0.2.1
2+
3+
LABEL org.opencontainers.image.source https://github.com/icgc-argo/dna-seq-processing-tools
4+
5+
ENV PATH="/tools:${PATH}"
6+
7+
COPY *.py /tools/
8+
9+
ENTRYPOINT ["/usr/bin/env"]
10+
11+
CMD ["/bin/bash"]

bam-merge-sort-markdup/main.nf

Lines changed: 100 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,100 @@
1+
#!/usr/bin/env nextflow
2+
3+
/*
4+
Copyright (C) 2021, icgc-argo
5+
6+
This program is free software: you can redistribute it and/or modify
7+
it under the terms of the GNU Affero General Public License as published by
8+
the Free Software Foundation, either version 3 of the License, or
9+
(at your option) any later version.
10+
11+
This program is distributed in the hope that it will be useful,
12+
but WITHOUT ANY WARRANTY; without even the implied warranty of
13+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14+
GNU Affero General Public License for more details.
15+
16+
You should have received a copy of the GNU Affero General Public License
17+
along with this program. If not, see <http://www.gnu.org/licenses/>.
18+
19+
Authors:
20+
Junjun Zhang
21+
Linda Xiang
22+
*/
23+
24+
/********************************************************************/
25+
/* this block is auto-generated based on info from pkg.json where */
26+
/* changes can be made if needed, do NOT modify this block manually */
27+
nextflow.enable.dsl = 2
28+
version = '0.2.0' // package version
29+
30+
container = [
31+
'ghcr.io': 'ghcr.io/icgc-argo/dna-seq-processing-tools.bam-merge-sort-markdup'
32+
]
33+
default_container_registry = 'ghcr.io'
34+
/********************************************************************/
35+
36+
37+
// universal params go here
38+
params.container_registry = ""
39+
params.container_version = ""
40+
params.container = ""
41+
42+
params.cpus = 1
43+
params.mem = 1 // GB
44+
params.publish_dir = "" // set to empty string will disable publishDir
45+
46+
47+
// tool specific params go here, add / change as needed
48+
params.aligned_lane_bams = ""
49+
params.ref_genome_gz = ""
50+
params.aligned_basename = "grch38-aligned.merged"
51+
params.markdup = true
52+
params.output_format = "cram"
53+
params.lossy = false
54+
params.tempdir = "NO_DIR"
55+
56+
include { getSecondaryFiles } from './wfpr_modules/github.com/icgc-argo/data-processing-utility-tools/helper-functions@1.0.1/main'
57+
58+
/*
 * Runs main.py on the staged lane-level BAMs to produce a single merged
 * BAM or CRAM (per params.output_format) plus its index. When the script
 * writes a duplicates metrics tarball it is emitted as an optional output.
 */
process bamMergeSortMarkdup {
  container "${params.container ?: container[params.container_registry ?: default_container_registry]}:${params.container_version ?: version}"
  // `enabled` is a boolean option: convert the publish_dir string explicitly
  // so an empty string disables publishing and any non-empty path enables it.
  publishDir "${params.publish_dir}/${task.process.replaceAll(':', '_')}", mode: "copy", enabled: params.publish_dir ? true : false

  cpus params.cpus
  memory "${params.mem} GB"

  input:
    path aligned_lane_bams
    path ref_genome_gz
    path ref_genome_gz_secondary_file  // staged next to the reference; not passed on the command line
    val tempdir

  output:
    path "${params.aligned_basename}.{bam,cram}", emit: merged_seq
    path "${params.aligned_basename}.{bam.bai,cram.crai}", emit: merged_seq_idx
    path "${params.aligned_basename}.duplicates_metrics.tgz", optional: true, emit: duplicates_metrics

  script:
    // Optional flags are rendered as empty strings when the feature is off.
    arg_markdup = params.markdup ? "-d" : ""
    arg_lossy = params.lossy ? "-l" : ""
    // 'NO_DIR' is the sentinel meaning "let main.py use its default tempdir".
    arg_tempdir = tempdir != 'NO_DIR' ? "-t ${tempdir}" : ""
    """
    main.py \
      -i ${aligned_lane_bams} \
      -r ${ref_genome_gz} \
      -n ${params.cpus} \
      -b ${params.aligned_basename} ${arg_markdup} \
      -o ${params.output_format} ${arg_lossy} ${arg_tempdir}
    """
}
89+
90+
91+
// this provides an entry point for this main script, so it can be run directly without cloning the repo
92+
// using this command: nextflow run <git_acc>/<repo>/<pkg_name>/<main_script>.nf -r <pkg_name>.v<pkg_version> --params-file xxx
93+
// Entry workflow: stages the inputs named by params and runs the process once.
workflow {
  bamMergeSortMarkdup(
    // all lane-level BAMs are collected so the process sees them in one task
    Channel.fromPath(params.aligned_lane_bams, checkIfExists: true).collect(),
    // fail early on a wrong reference path, consistent with the other inputs
    file(params.ref_genome_gz, checkIfExists: true),
    // reference index files ('fai', 'gzi') resolved by the imported helper
    Channel.fromPath(getSecondaryFiles(params.ref_genome_gz, ['fai', 'gzi']), checkIfExists: true).collect(),
    params.tempdir
  )
}

bam-merge-sort-markdup/main.py

Lines changed: 118 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,118 @@
1+
#!/usr/bin/env python3
2+
# -*- coding: utf-8 -*-
3+
4+
"""
5+
Copyright (C) 2021, icgc-argo
6+
7+
This program is free software: you can redistribute it and/or modify
8+
it under the terms of the GNU Affero General Public License as published by
9+
the Free Software Foundation, either version 3 of the License, or
10+
(at your option) any later version.
11+
12+
This program is distributed in the hope that it will be useful,
13+
but WITHOUT ANY WARRANTY; without even the implied warranty of
14+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15+
GNU Affero General Public License for more details.
16+
17+
You should have received a copy of the GNU Affero General Public License
18+
along with this program. If not, see <http://www.gnu.org/licenses/>.
19+
20+
Authors:
21+
Junjun Zhang
22+
Linda Xiang
23+
"""
24+
25+
import sys
26+
import subprocess
27+
import argparse
28+
from multiprocessing import cpu_count
29+
import json
30+
import os
31+
32+
def run_cmd(cmd):
    """Run a shell command, echo its output, and exit the program on failure.

    Args:
        cmd: Command line handed to the shell; pipes and redirections are
            allowed because the command runs with ``shell=True``.

    Returns:
        A ``(stdout, stderr)`` tuple holding the raw bytes captured from
        the command.

    Raises:
        SystemExit: If the command could not be started or finished with a
            non-zero status. The child's return code is reused when there
            is one, otherwise the exit status is 1.
    """
    # Bytes defaults keep the decode() calls below valid even when the
    # process could not be started and communicate() never ran.
    stdout, stderr, p, success = b'', b'', None, True
    try:
        # With shell=True the command is a single string.
        p = subprocess.Popen(cmd,
                             stdout=subprocess.PIPE,
                             stderr=subprocess.PIPE,
                             shell=True)
        stdout, stderr = p.communicate()
    except Exception as e:
        print('Execution failed: %s' % e, file=sys.stderr)
        success = False

    if p is not None and p.returncode != 0:
        print('Execution failed, none zero code returned.', file=sys.stderr)
        success = False

    # Tool output is only echoed here, so undecodable bytes must not abort.
    print(stdout.decode("utf-8", errors="replace"))
    print(stderr.decode("utf-8", errors="replace"), file=sys.stderr)

    if not success:
        # p is None when Popen itself raised; fall back to a generic failure.
        sys.exit(p.returncode if p is not None and p.returncode else 1)

    return stdout, stderr
55+
56+
def _build_commands(args):
    """Build the shell command lines that produce the merged BAM or CRAM.

    Args:
        args: Parsed command-line namespace created in ``main``.

    Returns:
        List of shell command strings, to be run in order.
    """
    import shlex  # local import keeps this block self-contained

    cpus = str(args.cpus)
    reference = shlex.quote(args.reference)
    bam_out = shlex.quote(args.output_base + ".bam")
    bai_out = shlex.quote(args.output_base + ".bam.bai")
    cram_out = shlex.quote(args.output_base + ".cram")
    crai_out = shlex.quote(args.output_base + ".cram.crai")

    # First pipeline stage: writes the merged stream to /dev/stdout.
    if args.mdup:
        merge = 'bammarkduplicates2 markthreads=%s tmpfile=%s level=0 O=/dev/stdout M=%s %s ' % (
            cpus,
            shlex.quote(args.tempdir + '/tmp'),
            shlex.quote(args.output_base + ".duplicates_metrics.txt"),
            ' '.join('I=' + shlex.quote(bam) for bam in args.input_bams))
    else:
        merge = 'samtools merge --no-PG -uf -@ %s /dev/stdout %s ' % (
            cpus, ' '.join(shlex.quote(bam) for bam in args.input_bams))

    # Second pipeline stage: consumes the stream and writes the final file.
    if args.lossy:
        # '\\*8' puts a literal backslash in the command so the shell does
        # not glob-expand the quality score spec.
        cram = 'java -jar /tools/cramtools.jar cram -R %s --capture-all-tags --lossy-quality-score-spec \\*8 --preserve-read-names -O %s' % (
            reference, cram_out)
    else:
        cram = 'samtools view -C -T %s -@ %s --write-index /dev/stdin -o %s ' % (
            reference, cpus, cram_out)

    bam = 'samtools view -b -h -@ %s --write-index /dev/stdin -o %s##idx##%s ' % (
        cpus, bam_out, bai_out)
    crai = 'samtools index -@ %s %s %s ' % (cpus, cram_out, crai_out)

    # NOTE(review): the stages are joined with a plain '|', so the exit
    # status seen by run_cmd is presumably that of the last stage only.
    # Confirm whether a failing merge stage needs to be detected separately.
    if args.output_format == 'bam':
        return ['|'.join([merge, bam])]
    if args.output_format == 'cram':
        commands = ['|'.join([merge, cram])]
        if args.lossy:
            # The lossy command is not given --write-index, so the index is
            # built in a separate step.
            commands.append(crai)
        return commands
    sys.exit("Unsupported sequence format!")


def main():
    """Parse the command line, run the merge pipeline and pack the metrics.

    The input BAMs are merged (optionally marking duplicates) and written
    as BAM or CRAM. If a duplicates metrics file was produced, it is
    bundled together with a small JSON file recording the tool version
    into ``<output_base>.duplicates_metrics.tgz``.

    Raises:
        SystemExit: If the temp directory does not exist or any command
            run through ``run_cmd`` fails.
    """
    import shlex  # local import keeps this block self-contained

    parser = argparse.ArgumentParser(description='Merge and markdup')
    parser.add_argument('-i', '--input-bams', dest='input_bams',
                        type=str, help='Input bam file', nargs='+', required=True)
    parser.add_argument('-b', '--output-base', dest='output_base',
                        type=str, help='Output merged file basename', required=True)
    parser.add_argument('-r', '--reference', dest='reference',
                        type=str, help='reference fasta', required=True)
    parser.add_argument('-t', '--tempdir', dest='tempdir', type=str, default=".",
                        help='Specify directory for temporary files')
    parser.add_argument("-n", "--cpus", dest='cpus', type=int, default=cpu_count())
    parser.add_argument("-d", "--mdup", dest='mdup', action='store_true')
    parser.add_argument("-l", "--lossy", dest='lossy', action='store_true')
    parser.add_argument("-o", "--output-format", dest='output_format', default='cram', choices=['bam', 'cram'])

    args = parser.parse_args()

    if not os.path.isdir(args.tempdir):
        sys.exit('Error: specified tempdir %s does not exist!' % args.tempdir)

    for c in _build_commands(args):
        run_cmd(c)

    # The metrics file only exists when duplicates were marked.
    if os.path.isfile(os.path.join(os.getcwd(), args.output_base + ".duplicates_metrics.txt")):
        stdout, _ = run_cmd('bammarkduplicates2 -v 2>&1 | grep "biobambam2 version"')
        # The version is taken as the last space-separated token of the line.
        version = stdout.decode("utf-8").split(' ')[-1].strip().rstrip('.')
        with open("%s.duplicates_metrics.extra_info.json" % args.output_base, "w") as j:
            j.write(json.dumps({"tool": "biobambam2:bammarkduplicates2@%s" % version}, indent=2))

        # Quote only the basename; the trailing '.*' must stay unquoted so
        # the shell expands it to the metrics files.
        base = shlex.quote(args.output_base)
        tgz = 'tar czf %s.duplicates_metrics.tgz %s.duplicates_metrics.*' % (base, base)
        run_cmd(tgz)
114+
115+
116+
if __name__ == "__main__":
117+
main()
118+
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
docker {
2+
enabled = true
3+
runOptions = '-u \$(id -u):\$(id -g)'
4+
}

bam-merge-sort-markdup/pkg.json

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
{
2+
"name": "bam-merge-sort-markdup",
3+
"version": "0.2.0",
4+
"description": "Merge multiple lane-level aligned BAMs, Mark duplicated reads and Sort reads by genomic coordinates",
5+
"main": "main.nf",
6+
"deprecated": false,
7+
"keywords": [
8+
"seq",
9+
"merge",
10+
"markduplicate",
11+
"sort"
12+
],
13+
"repository": {
14+
"type": "git",
15+
"url": "https://github.com/icgc-argo/dna-seq-processing-tools.git"
16+
},
17+
"container": {
18+
"registries": [
19+
{
20+
"registry": "ghcr.io",
21+
"type": "docker",
22+
"org": "icgc-argo",
23+
"default": true
24+
}
25+
]
26+
},
27+
"dependencies": [
28+
"github.com/icgc-argo/data-processing-utility-tools/helper-functions@1.0.1"
29+
],
30+
"devDependencies": [],
31+
"contributors": [
32+
{
33+
"name": "Junjun Zhang"
34+
},
35+
{
36+
"name": "Linda Xiang"
37+
}
38+
],
39+
"license": "GNU Affero General Public License v3",
40+
"bugReport": "https://github.com/icgc-argo/dna-seq-processing-tools/issues",
41+
"homepage": "https://github.com/icgc-argo/dna-seq-processing-tools#readme"
42+
}

0 commit comments

Comments
 (0)