Skip to content

Commit edd1ba4

Browse files
authored
Merge pull request #129 from icgc-argo-workflows/seq-data-to-lane-fastq@0.1.0
[release]
2 parents 654df1d + 7106202 commit edd1ba4

22 files changed

Lines changed: 617 additions & 0 deletions
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
.gitignore
2+
.nextflow*
3+
tests
4+
work
5+
outdir

seq-data-to-lane-fastq/Dockerfile

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
# Image for the seq-data-to-lane-fastq tool: Ubuntu 20.04 with Python 3, samtools
# built from source, and the package's Python scripts on PATH.
FROM ubuntu:20.04

# Links the published image back to its source repository.
LABEL org.opencontainers.image.source="https://github.com/icgc-argo-workflows/dna-seq-processing-tools"

# Keep apt/dpkg from prompting for input during package installation.
ENV DEBIAN_FRONTEND=noninteractive

# Python tooling, curl, and development libraries (presumably the samtools build
# dependencies used below). The apt package index is removed in the same layer so
# it does not end up in the image.
RUN apt-get update -y && \
    apt-get install -y software-properties-common python3-pip python3-dev curl && \
    apt-get install -y libz-dev pkg-config libtool m4 autotools-dev automake libncurses5-dev libbz2-dev liblzma-dev && \
    rm -rf /var/lib/apt/lists/*

# install samtools
ARG SAMTOOLS_VERSION=1.15
# -f makes curl fail on an HTTP error instead of saving the error page as the tarball.
# The tarball and build tree are deleted after `make install` has put the binaries
# under /usr/local.
RUN cd /tmp \
    && curl -fsSL -o samtools-$SAMTOOLS_VERSION.tar.bz2 --retry 10 https://github.com/samtools/samtools/releases/download/$SAMTOOLS_VERSION/samtools-$SAMTOOLS_VERSION.tar.bz2 \
    && bunzip2 -c samtools-$SAMTOOLS_VERSION.tar.bz2 | tar xf - \
    && cd samtools-$SAMTOOLS_VERSION \
    && ./configure --prefix=/usr/local \
    && make \
    && make install \
    && cd /tmp \
    && rm -rf samtools-$SAMTOOLS_VERSION samtools-$SAMTOOLS_VERSION.tar.bz2

# Scripts copied into /tools are resolved by name through PATH.
ENV PATH="/tools:${PATH}"

COPY *.py /tools/

# Create an `ubuntu` user and group (uid/gid 1000) with a home directory.
# NOTE(review): no USER instruction follows, so the image still runs as root unless
# the caller overrides the user; confirm this is intended.
RUN groupadd -g 1000 ubuntu && \
    useradd -l -u 1000 -g ubuntu ubuntu && \
    install -d -m 0755 -o ubuntu -g ubuntu /home/ubuntu

# With /usr/bin/env as the entrypoint, the command given at run time is looked up
# on PATH; with no command, the default CMD starts bash.
ENTRYPOINT ["/usr/bin/env"]

CMD ["/bin/bash"]

seq-data-to-lane-fastq/README.md

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
# Nextflow Package `seq-data-to-lane-fastq`
2+
3+
A simple wrapper written in `nextflow` for the sequencing processing tool to convert all input sequencing data into unaligned and lane level fastq files.
4+
The tool supports both aligned BAM and unaligned FASTQ inputs, with paired-end or single-end reads.
5+
6+
## Package development
7+
8+
The initial version of this package was created by the WorkFlow Package Manager CLI tool, please refer to
9+
the [documentation](https://wfpm.readthedocs.io) for details on the development procedure including
10+
versioning, updating, CI testing and releasing.
11+
12+
13+
## Inputs
14+
### Required
15+
- `metadata_json`: JSON file containing donor/sample/specimen/experiment/read_groups/files metadata
16+
- `seq_files`: Sequencing reads in aligned BAM or unaligned FASTQ formats. Supported input format: {BAM, *.fq.gz, *.fastq.gz, *.fq.bz2, *.fastq.bz2}
17+
18+
### Optional
19+
- `reads_max_discard_fraction`: Max fraction of reads allowed to be discarded when reverting aligned BAM to unaligned
20+
- `tempdir`: Specify directory for temporary files
21+
- `cpus`: Set the number of CPUs for running the tool
22+
- `mem`: Set the memory (in GB) for running the tool
23+
- `publish_dir`: Specify directory for getting output files
24+
25+
## Outputs
26+
- `lane_fastq`: All fastq files
27+
- `file_pair_map_csv`: CSV file containing three columns per lane: `read_group_id`, `file_r1`, `file_r2`
28+
29+
## Usage
30+
31+
### Run the package directly
32+
33+
With inputs prepared, you should be able to run the package directly using the following command.
34+
Please replace the params file with a real one (with all required parameters and input files). Example
35+
params file(s) can be found in the `tests` folder.
36+
37+
```
38+
nextflow run icgc-argo-workflows/dna-seq-processing-tools/seq-data-to-lane-fastq/main.nf -r seq-data-to-lane-fastq.v0.1.0 -params-file <your-params-json-file>
39+
```
40+
41+
### Import the package as a dependency
42+
43+
To import this package into another package as a dependency, please follow these steps at the
44+
importing package side:
45+
46+
1. add this package's URI `github.com/icgc-argo-workflows/dna-seq-processing-tools/seq-data-to-lane-fastq@0.1.0` in the `dependencies` list of the `pkg.json` file
47+
2. run `wfpm install` to install the dependency
48+
3. add the `include` statement in the main Nextflow script to import the dependent package from this path: `./wfpr_modules/github.com/icgc-argo-workflows/dna-seq-processing-tools/seq-data-to-lane-fastq@0.1.0/main.nf`

seq-data-to-lane-fastq/main.nf

Lines changed: 94 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,94 @@
1+
#!/usr/bin/env nextflow
2+
3+
/*
4+
Copyright (C) 2021, icgc-argo
5+
6+
This program is free software: you can redistribute it and/or modify
7+
it under the terms of the GNU Affero General Public License as published by
8+
the Free Software Foundation, either version 3 of the License, or
9+
(at your option) any later version.
10+
11+
This program is distributed in the hope that it will be useful,
12+
but WITHOUT ANY WARRANTY; without even the implied warranty of
13+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14+
GNU Affero General Public License for more details.
15+
16+
You should have received a copy of the GNU Affero General Public License
17+
along with this program. If not, see <http://www.gnu.org/licenses/>.
18+
19+
Authors:
20+
Linda Xiang
21+
*/
22+
23+
/********************************************************************/
24+
/* this block is auto-generated based on info from pkg.json where */
25+
/* changes can be made if needed, do NOT modify this block manually */
26+
nextflow.enable.dsl = 2
27+
version = '0.1.0' // package version
28+
29+
container = [
30+
'ghcr.io': 'ghcr.io/icgc-argo-workflows/dna-seq-processing-tools.seq-data-to-lane-fastq'
31+
]
32+
default_container_registry = 'ghcr.io'
33+
/********************************************************************/
34+
35+
36+
// universal params go here
37+
params.container_registry = "" // key into the container map; empty falls back to default_container_registry
38+
params.container_version = "" // image tag; empty falls back to the package version
39+
params.container = "" // image name override; empty uses the registry lookup instead
40+
41+
params.cpus = 1 // CPUs requested for the task; also passed to main.py as -n
42+
params.mem = 1 // GB
43+
params.publish_dir = "" // set to empty string will disable publishDir
44+
45+
46+
// tool specific params go here, add / change as needed
47+
params.metadata_json = "" // metadata JSON file, passed to main.py as -p
48+
params.seq_files = "" // input sequencing files, resolved with Channel.fromPath and passed to main.py as -s
49+
params.reads_max_discard_fraction = 0.05 // passed to main.py as -d
50+
params.tempdir = "NO_DIR" // "NO_DIR" is a sentinel: no -t option is passed to main.py
51+
52+
53+
/*
 * Runs main.py on the metadata JSON and the staged sequencing files, writing its
 * results into ./out.
 *
 * Inputs:
 *   metadata_json - metadata file, passed to main.py as -p
 *   seq           - sequencing file(s), passed to main.py as -s
 *
 * Outputs:
 *   lane_fastq        - files in out/ ending in fq, fastq, fq.gz or fastq.gz
 *   file_pair_map_csv - out/rgs_file_pair_map.csv
 */
process seqDataToLaneFastq {
  // Image resolution: an explicit params.container wins; otherwise the image is looked up
  // by registry. The tag is params.container_version, or the package version when unset.
  container "${params.container ?: container[params.container_registry ?: default_container_registry]}:${params.container_version ?: version}"
  publishDir "${params.publish_dir}/${task.process.replaceAll(':', '_')}", mode: "copy", enabled: params.publish_dir ? true : false

  cpus params.cpus
  memory "${params.mem} GB"

  input:  // input, make update as needed
    path metadata_json
    path seq

  output:  // output, make update as needed
    path "out/*{fq,fastq,fq.gz,fastq.gz}", emit: lane_fastq
    path "out/rgs_file_pair_map.csv", emit: file_pair_map_csv

  script:
    // Declared with `def` so the variable is local to this task's script block
    // instead of being an implicitly shared, undeclared variable.
    // The -t option is only added when a real temp directory was given ("NO_DIR" = unset).
    def arg_tempdir = params.tempdir != 'NO_DIR' ? "-t ${params.tempdir}" : ""

    """
    mkdir -p out

    main.py \
      -p ${metadata_json} \
      -s ${seq} \
      -d ${params.reads_max_discard_fraction} \
      -n ${params.cpus} \
      -o out ${arg_tempdir}

    """
}
85+
86+
87+
// this provides an entry point for this main script, so it can be run directly without cloning the repo
88+
// using this command: nextflow run <git_acc>/<repo>/<pkg_name>/<main_script>.nf -r <pkg_name>.v<pkg_version> -params-file xxx
89+
workflow {
  // Resolve the metadata JSON to a single file object.
  def metadata_file = file(params.metadata_json)

  // Gather every path matched by params.seq_files into one list, so the process
  // is invoked once with all sequencing files.
  def all_seq_files = Channel.fromPath(params.seq_files).collect()

  seqDataToLaneFastq(metadata_file, all_seq_files)
}

0 commit comments

Comments
 (0)