diff --git a/Snakefile_step2 b/Snakefile_step2 index 7aeff73..0aaa604 100644 --- a/Snakefile_step2 +++ b/Snakefile_step2 @@ -3,7 +3,7 @@ configfile: 'result/config.yaml' ## INPUTS / OUTPUTS SUFFIX = ["R1_trim", "R2_trim"] SAMPLES = config["SampleList"] - + TRIM1 = expand('result/1_trim/{sample}_R1_trim.fastq.gz', sample=SAMPLES) TRIM2 = expand('result/1_trim/{sample}_R2_trim.fastq.gz', sample=SAMPLES) QC_trim_html = expand("result/qc/2_fastqc_trim/{sample}_{suf}_fastqc.html", sample=SAMPLES, suf=SUFFIX) @@ -24,8 +24,8 @@ rule all: *( ['result/5_combined/combined_picard.tsv'] if config["picard"] else [] ), 'log/benchmark/STAR_index.benchmark.txt', 'log/benchmark/STAR_load.benchmark.txt' - - + + ##----------------------------------------## ## 1. Adapter removal & quality filtering ## ##----------------------------------------## @@ -36,23 +36,19 @@ rule adapterremoval: R2 = lambda wildcards: config["SampleList"][wildcards.sample]["R2"] output: R1 = "result/1_trim/{sample}_R1_trim.fastq.gz", - R2 = "result/1_trim/{sample}_R2_trim.fastq.gz", - single = "result/1_trim/{sample}_singleton.truncated.gz", - discard = "result/1_trim/{sample}.settings", - set = "result/1_trim/{sample}_discarded.gz" + R2 = "result/1_trim/{sample}_R2_trim.fastq.gz" params: trim5p = config["trim5p"], trimAdapt = config["trimAdapt"], adapter1 = config["adapter1"], adapter2 = config["adapter2"] - threads: config["threads"] * 0.1 + threads: max(1, int(config["threads"]*0.1)) message: "Adapter removal & quality filtering" - - run: + run: if params.trimAdapt == True: - shell("AdapterRemoval --file1 {input.R1} --file2 {input.R2} --output1 {output.R1} --output2 {output.R2} --singleton {output.single} --discarded {output.discard} --settings {output.set} --threads {threads} --gzip --maxns 1 --minlength 15 --trimqualities --minquality 30 --trim5p {params.trim5p} --adapter1 {params.adapter1} --adapter2 {params.adapter2}") + shell("cutadapt -g {params.adapter1} -a {params.adapter2} --cut {params.trim5p} --quality-cutoff 30 --max-n 1 --minimum-length 15 -o {output.R1} -p {output.R2} --cores {threads} {input.R1} {input.R2}") if params.trimAdapt == False: - shell("AdapterRemoval --file1 {input.R1} --file2 {input.R2} --output1 {output.R1} --output2 {output.R2} --singleton {output.single} --discarded {output.discard} --settings {output.set} --threads {threads} --gzip --maxns 1 --minlength 15 --trimqualities --minquality 30 --trim5p {params.trim5p}") + shell("cutadapt --cut {params.trim5p} --quality-cutoff 30 --max-n 1 --minimum-length 15 -o {output.R1} -p {output.R2} --cores {threads} {input.R1} {input.R2} ") if params.trimAdapt != True and params.trimAdapt != False: print("Error: Config trimAdapt parameter must be set to True or False.") @@ -65,7 +61,7 @@ rule fastqc_trim: output: HTML = 'result/qc/2_fastqc_trim/{sample}_{suf}_fastqc.html', ZIP = 'result/qc/2_fastqc_trim/{sample}_{suf}_fastqc.zip' - threads: config["threads"] * 0.1 + threads: max(1, int(config["threads"]*0.1)) message: "Quality assessment, FastQC" shell: """ @@ -80,54 +76,65 @@ STAR_index_SA = 'ref/release' + config["release"] + '/STARindex/SA' if (not os.path.exists(STAR_index_SA)): rule STAR_index: input: expand('result/qc/2_fastqc_trim/{sample}_{suf}_fastqc.html', sample=SAMPLES, suf=SUFFIX) - output: + output: IDX_done = touch('log/benchmark/STAR_index.benchmark.txt') params: release = config["release"], genome = config["genome"] - threads: config["threads"] * 0.5 + threads: max(1, int(config["threads"]*0.5)) message: "Generating genome index" run: shell("scripts/STAR_index.sh {params.release} {params.genome} {threads}") else: rule skip_index: input: expand('result/qc/2_fastqc_trim/{sample}_{suf}_fastqc.html', sample=SAMPLES, suf=SUFFIX) - output: + output: IDX_done = touch('log/benchmark/STAR_index.benchmark.txt') message: "Genome index already present. Skipping indexing rule." rule STAR_load: input: IDX_done = 'log/benchmark/STAR_index.benchmark.txt' - output: + output: LOAD_done = touch('log/benchmark/STAR_load.benchmark.txt') params: release = config["release"] - threads: config["threads"] * 0.5 + threads: max(1, int(config["threads"]*0.5)) message: 'Loading genome index' run: shell("STAR --genomeDir 'ref/release{params.release}/STARindex' --runThreadN {threads} --runRNGseed 8756 --genomeLoad LoadAndExit") - + rule STAR_align: input: R1 = "result/1_trim/{sample}_R1_trim.fastq.gz", R2 = "result/1_trim/{sample}_R2_trim.fastq.gz", LOAD_done = 'log/benchmark/STAR_load.benchmark.txt' - output: + output: BAM = 'result/2_bam/{sample}_Aligned.sortedByCoord.out.bam' params: OUT = 'result/2_bam/{sample}_', release = config["release"] - threads: config["threads"] * 0.25 + threads: max(1, int(config["threads"]*0.25)) message: 'Aligning reads' - run: - shell("STAR --genomeDir 'ref/release{params.release}/STARindex' --readFilesIn {input.R1} {input.R2} --readFilesCommand zcat --outFileNamePrefix {params.OUT} --outSAMtype BAM SortedByCoordinate --runThreadN {threads} --runRNGseed 8756 --genomeLoad LoadAndKeep --limitBAMsortRAM 20000000000") - + shell: + """ + STAR \ + --genomeDir ref/release{params.release}/STARindex \ + --readFilesCommand zcat \ + --readFilesIn {input.R1} {input.R2} \ + --outFileNamePrefix {params.OUT} \ + --outSAMtype BAM SortedByCoordinate \ + --runThreadN {threads} \ + --runRNGseed 8756 \ + --genomeLoad LoadAndKeep \ + --limitBAMsortRAM 20000000000 + """ + rule STAR_remove: input: LOAD_done = 'log/benchmark/STAR_load.benchmark.txt', BAM = expand('result/2_bam/{sample}_Aligned.sortedByCoord.out.bam', sample=SAMPLES) - output: + output: REMOVE_done = touch('log/benchmark/STAR_remove.benchmark.txt') params: release = config["release"] @@ -143,20 +150,20 @@ rule STAR_remove: ##----------------------------------------## rule align_filter: - input: + input: BAM = 'result/2_bam/{sample}_Aligned.sortedByCoord.out.bam', REMOVE_done = 'log/benchmark/STAR_remove.benchmark.txt' - output: + output: BAM2 = 'result/3_bam_filter/{sample}_Aligned.sortedByCoord.filter.bam', BAI = 'result/3_bam_filter/{sample}_Aligned.sortedByCoord.filter.bam.bai' - threads: config["threads"] * 0.1 + threads: max(1, int(config["threads"]*0.1)) message: "Filtering alignments" shell: """ samtools view {input.BAM} -h -b -f 3 -F 1284 -q 30 -@ {threads} > {output.BAM2} - samtools index -@ {threads} {output.BAM2} > {output.BAI} + samtools index -@ {threads} {output.BAM2} > {output.BAI} """ - + ##----------------------------------------## ## 5. ALIGNMENT QC ## ##----------------------------------------## @@ -164,7 +171,7 @@ rule align_filter: rule flagstat: input: 'result/3_bam_filter/{sample}_Aligned.sortedByCoord.filter.bam' output: 'result/qc/3_flagstat/{sample}_flagstat.tsv' - threads: config["threads"] * 0.1 + threads: max(1, int(config["threads"]*0.1)) message: "Quality control, flagstat" run: shell("samtools flagstat -@ {threads} {input} > {output}") @@ -197,7 +204,7 @@ rule fcount: params: release = config["release"], genome = config["genome"] - threads: config["threads"] * 0.1 + threads: max(1, int(config["threads"]*0.1)) message: "Counting features" run: shell("featureCounts -T {threads} -g gene_id -t exon -p -a 'ref/release{params.release}/STARref/{params.genome}.{params.release}.gtf' -o {output} {input.bam}") diff --git a/Snakefile_step2_single b/Snakefile_step2_single index 1b466e7..2542196 100644 --- a/Snakefile_step2_single +++ b/Snakefile_step2_single @@ -2,7 +2,7 @@ configfile: 'result/config.yaml' ## INPUTS / OUTPUTS SAMPLES = config["SampleList"] - + TRIM1 = expand('result/1_trim/{sample}_R1_trim.fastq.gz', sample=SAMPLES) QC_trim_html = expand("result/qc/2_fastqc_trim/{sample}_R1_trim_fastqc.html", sample=SAMPLES) QC_trim_zip = expand("result/qc/2_fastqc_trim/{sample}_R1_trim_fastqc.zip", sample=SAMPLES) @@ -21,7 +21,7 @@ rule all: *( ['result/5_combined/combined_picard.tsv'] if config["picard"] else [] ), 'log/benchmark/STAR_index.benchmark.txt', 'log/benchmark/STAR_load.benchmark.txt' - + ##----------------------------------------## ## 1. Adapter removal & quality filtering ## ##----------------------------------------## @@ -30,21 +30,19 @@ rule adapterremoval: input: R1 = lambda wildcards: config["SampleList"][wildcards.sample]["R1"] output: - R1 = "result/1_trim/{sample}_R1_trim.fastq.gz", - discard = "result/1_trim/{sample}.settings", - set = "result/1_trim/{sample}_discarded.gz" + R1 = "result/1_trim/{sample}_R1_trim.fastq.gz" params: trim5p = config["trim5p"], trimAdapt = config["trimAdapt"], adapter1 = config["adapter1"] - threads: config["threads"] * 0.1 + threads: max(1, int(config["threads"]*0.1)) message: "Adapter removal & quality filtering" - run: + run: if params.trimAdapt == True: - shell("AdapterRemoval --file1 {input.R1} --output1 {output.R1} --discarded {output.discard} --settings {output.set} --threads {threads} --gzip --maxns 1 --minlength 15 --trimqualities --minquality 30 --trim5p {params.trim5p} --adapter1 {params.adapter1}") + shell("cutadapt -g {params.adapter1} --cut {params.trim5p} --quality-cutoff 30 --max-n 1 --minimum-length 15 -o {output.R1} --cores {threads} {input.R1}") if params.trimAdapt == False: - shell("AdapterRemoval --file1 {input.R1} --output1 {output.R1} --discarded {output.discard} --settings {output.set} --threads {threads} --gzip --maxns 1 --minlength 15 --trimqualities --minquality 30 --trim5p {params.trim5p}") + shell("cutadapt --quality-cutoff 30 --max-n 1 --minimum-length 15 -o {output.R1} --cores {threads} {input.R1}") if params.trimAdapt != True and params.trimAdapt != False: print("Error: Config trimAdapt parameter must be set to True or False.") @@ -57,7 +55,7 @@ rule fastqc_trim: output: HTML = 'result/qc/2_fastqc_trim/{sample}_R1_trim_fastqc.html', ZIP = 'result/qc/2_fastqc_trim/{sample}_R1_trim_fastqc.zip' - threads: config["threads"] * 0.1 + threads: max(1, int(config["threads"]*0.1)) message: "Quality assessment, FastQC" shell: """ @@ -72,53 +70,64 @@ STAR_index_SA = 'ref/release' + config["release"] + '/STARindex/SA' if (not os.path.exists(STAR_index_SA)): rule STAR_index: input: expand('result/qc/2_fastqc_trim/{sample}_R1_trim_fastqc.html', sample=SAMPLES) - output: + output: IDX_done = touch('log/benchmark/STAR_index.benchmark.txt') params: release = config["release"], genome = config["genome"] - threads: config["threads"] * 0.5 + threads: max(1, int(config["threads"]*0.5)) message: "Generating genome index" run: shell("scripts/STAR_index.sh {params.release} {params.genome} {threads}") else: rule skip_index: input: expand('result/qc/2_fastqc_trim/{sample}_R1_trim_fastqc.html', sample=SAMPLES) - output: + output: IDX_done = touch('log/benchmark/STAR_index.benchmark.txt') message: "Genome index already present. Skipping indexing rule." rule STAR_load: input: IDX_done = 'log/benchmark/STAR_index.benchmark.txt' - output: + output: LOAD_done = touch('log/benchmark/STAR_load.benchmark.txt') params: release = config["release"] - threads: config["threads"] * 0.5 + threads: max(1, int(config["threads"]*0.5)) message: 'Loading genome index' run: shell("STAR --genomeDir 'ref/release{params.release}/STARindex' --runThreadN {threads} --runRNGseed 8756 --genomeLoad LoadAndExit") - + rule STAR_align: input: R1 = "result/1_trim/{sample}_R1_trim.fastq.gz", LOAD_done = 'log/benchmark/STAR_load.benchmark.txt' - output: + output: BAM = 'result/2_bam/{sample}_Aligned.sortedByCoord.out.bam' params: OUT = 'result/2_bam/{sample}_', release = config["release"] - threads: config["threads"] * 0.25 + threads: max(1, int(config["threads"]*0.25)) message: 'Aligning reads' - run: - shell("STAR --genomeDir 'ref/release{params.release}/STARindex' --readFilesIn {input.R1} --readFilesCommand zcat --outFileNamePrefix {params.OUT} --outSAMtype BAM SortedByCoordinate --runThreadN {threads} --runRNGseed 8756 --genomeLoad LoadAndKeep --limitBAMsortRAM 20000000000") - + shell: + """ + STAR \ + --genomeDir ref/release{params.release}/STARindex \ + --readFilesCommand zcat \ + --readFilesIn {input.R1} \ + --outFileNamePrefix {params.OUT} \ + --outSAMtype BAM SortedByCoordinate \ + --runThreadN {threads} \ + --runRNGseed 8756 \ + --genomeLoad LoadAndKeep \ + --limitBAMsortRAM 20000000000 + """ + rule STAR_remove: input: LOAD_done = 'log/benchmark/STAR_load.benchmark.txt', BAM = expand('result/2_bam/{sample}_Aligned.sortedByCoord.out.bam', sample=SAMPLES) - output: + output: REMOVE_done = touch('log/benchmark/STAR_remove.benchmark.txt') params: release = config["release"] @@ -134,11 +143,11 @@ rule STAR_remove: ##----------------------------------------## rule align_filter: - input: + input: BAM = 'result/2_bam/{sample}_Aligned.sortedByCoord.out.bam', REMOVE_done = 'log/benchmark/STAR_remove.benchmark.txt' output: 'result/3_bam_filter/{sample}_Aligned.sortedByCoord.filter.bam' - threads: config["threads"] * 0.1 + threads: max(1, int(config["threads"]*0.1)) message: "Filtering alignments" run: shell("samtools view {input.BAM} -b -h -F 1284 -q 30 -@ {threads} > {output}") @@ -151,7 +160,7 @@ rule align_filter: rule flagstat: input: 'result/3_bam_filter/{sample}_Aligned.sortedByCoord.filter.bam' output: 'result/qc/3_flagstat/{sample}_flagstat.tsv' - threads: config["threads"] * 0.1 + threads: max(1, int(config["threads"]*0.1)) message: "Quality control, flagstat" run: shell("samtools flagstat -@ {threads} {input} > {output}") @@ -184,7 +193,7 @@ rule fcount: params: release = config["release"], genome = config["genome"] - threads: config["threads"] * 0.1 + threads: max(1, int(config["threads"]*0.1)) message: "Counting features" run: shell("featureCounts -T {threads} -g gene_id -t exon -p -a 'ref/release{params.release}/STARref/{params.genome}.{params.release}.gtf' -o {output} {input.bam}") diff --git a/environment/Hissss_env.yaml b/environment/Hissss_env.yaml index df60ab9..98a6232 100644 --- a/environment/Hissss_env.yaml +++ b/environment/Hissss_env.yaml @@ -1,6 +1,7 @@ channels: - bioconda - conda-forge + - defaults dependencies: - snakemake-minimal >=5.24.1 - jinja2 >=2.11 @@ -12,8 +13,12 @@ dependencies: - bwa >=0.7 - pysam >=0.15 - fastqc >=0.11.9 - - adapterremoval >=2.3.2 - bedtools >=2.30.0 - picard >=2.27.1 - star >=2.7.9a - subread >=2.0.1 + - python >=3.11 + - pandas >=1.6 + - pip + - pip: + - cutadapt>=5.1 diff --git a/scripts/create_config.sh b/scripts/create_config.sh index 5ff0431..efe713c 100755 --- a/scripts/create_config.sh +++ b/scripts/create_config.sh @@ -6,7 +6,7 @@ SampleList="data/*_R1*" -sudo echo "SampleList:" > result/config.yaml +echo "SampleList:" > result/config.yaml spacer1=": " @@ -25,11 +25,11 @@ sample2=`echo "${sample/R1/R2}"` sample_name=`echo "$(basename $sample)" | grep -o '^.*_L[0-9][0-9][0-9]' | sed 's/_L[0-9][0-9][0-9]$//'` # Add sample name to config -sudo echo " " "$sample_name$spacer1" >> result/config.yaml -sudo echo " sample: '"$sample_name"'" >> result/config.yaml +echo " " "$sample_name$spacer1" >> result/config.yaml +echo " sample: '"$sample_name"'" >> result/config.yaml # Add fastq files to config -sudo echo " R1: '"$sample"'" >> result/config.yaml -sudo echo " R2: '"$sample2"'" >> result/config.yaml +echo " R1: '"$sample"'" >> result/config.yaml +echo " R2: '"$sample2"'" >> result/config.yaml done ################################# @@ -44,7 +44,7 @@ cores2=1 fi # Add default param to config -sudo echo " +echo " # Adapter removal ## Base pairs to trim from 5' end @@ -58,8 +58,8 @@ adapter2: AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGT ## Species the format 'Homo_sapiens.GRCh38' or 'Mus_musculus.GRCm39' genome: 'Homo_sapiens.GRCh38' -## Genome release number. Current as of 2022.04.14 -release: '106' +## Genome release number. Current as of 2025.06.05 +release: '114' # Alignment metrics ## Run Picard? @@ -71,17 +71,17 @@ threads: $cores2 # Setup directory structure -sudo mkdir -p -m 777 'result/qc/1_fastqc_raw' -sudo mkdir -p -m 777 'result/qc/2_fastqc_trim' -sudo mkdir -p -m 777 'result/qc/3_flagstat' -sudo mkdir -p -m 777 'result/qc/4_picard' +mkdir -p -m 777 'result/qc/1_fastqc_raw' +mkdir -p -m 777 'result/qc/2_fastqc_trim' +mkdir -p -m 777 'result/qc/3_flagstat' +mkdir -p -m 777 'result/qc/4_picard' -sudo mkdir -p -m 777 'result/1_trim' -sudo mkdir -p -m 777 'result/2_bam' -sudo mkdir -p -m 777 'result/3_bam_filter' -sudo mkdir -p -m 777 'result/4_count' -sudo mkdir -p -m 777 'result/5_combined' +mkdir -p -m 777 'result/1_trim' +mkdir -p -m 777 'result/2_bam' +mkdir -p -m 777 'result/3_bam_filter' +mkdir -p -m 777 'result/4_count' +mkdir -p -m 777 'result/5_combined' -sudo mkdir -p -m 777 'ref' -sudo mkdir -p -m 777 'log' +mkdir -p -m 777 'ref' +mkdir -p -m 777 'log' diff --git a/vignette/SEAsnake_vignette.Rmd b/vignette/SEAsnake_vignette.Rmd index f0ab049..fc593f8 100755 --- a/vignette/SEAsnake_vignette.Rmd +++ b/vignette/SEAsnake_vignette.Rmd @@ -19,7 +19,7 @@ urlcolor: blue [SEAsnake](https://github.com/BIGslu/SEAsnake) is a [snakemake](https://snakemake.readthedocs.io/en/stable/) pipeline to process bulk RNA-seq data from fastq sequences to gene counts. It includes the following steps. You can see an in-depth example of how to run these steps separately outside SEAsnake in our [first tutorial](https://github.com/BIGslu/tutorials/blob/main/RNAseq/1.Hawn_RNAseq_fastq.to.counts.pdf). 1. Quality assess sequences with [FastQC](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/) -2. Remove adapters and filter low quality sequences with [AdapterRemoval](https://adapterremoval.readthedocs.io/en/latest/) +2. Remove adapters and filter low quality sequences with [cutadapt](https://cutadapt.readthedocs.io/en/stable/index.html) - max N = 1 - min length = 15 - min quality = 30 @@ -60,7 +60,7 @@ SEAsnake step 1 completes 1 fastq in 30 to 45 min. Since this step is run in par SEAsnake step 2 completes 1 set of paired end fastq in roughly 10 hours. Using the same example, if you have 40 fastq from 20 paired end samples running on 14 cores, this is 20 / 14 * 10 = 14.3 hours. Non-paired end samples takes about half the time. -Step 2 will take less time if you do not have adapter contamination requiring alignment in `AdapterRemoval` (- 1 hour per single fastq or paired-end sample) or if you already have an indexed reference genome for `STAR` (- 1 hour from total time). +Step 2 will take less time if you do not have adapter contamination requiring alignment in `cutadapt` (- 1 hour per single fastq or paired-end sample) or if you already have an indexed reference genome for `STAR` (- 1 hour from total time). We are working on improving speed so please check for updates periodically. @@ -341,7 +341,7 @@ Your directory structure will look like library(DiagrammeR) grViz(diagram = "digraph flowchart { graph[rankdir = LR] - node [fontname = arial, shape = retangle, fontsize=10, height=0.1] + node [fontname = arial, shape = rectangle, fontsize=10, height=0.1] dir1 [label = '@@1'] dir2 [label = '@@2'] dir3 [label = '@@3'] @@ -545,7 +545,7 @@ STAR_align 2 3 3 STAR_index 1 3 3 STAR_load 1 3 3 STAR_remove 1 1 1 -adapterremoval 2 1 1 +cutadapt 2 1 1 align_filter 2 1 1 all 1 1 1 combine 1 1 1 diff --git a/vignette/SEAsnake_vignette.html b/vignette/SEAsnake_vignette.html index 70ddf98..9b4a9db 100755 --- a/vignette/SEAsnake_vignette.html +++ b/vignette/SEAsnake_vignette.html @@ -2721,7 +2721,7 @@

SEAsnake vignette

snakemake pipeline from fastq to counts

Kim Dill-McFarland, kadm@uw.edu

-

version February 10, 2025

+

version June 05, 2025

@@ -2735,7 +2735,7 @@

Introduction

tutorial.

  1. Quality assess sequences with FastQC
  2. -
  3. Remove adapters and filter low quality sequences with AdapterRemoval +
  4. Remove adapters and filter low quality sequences with cutadapt