From 2536bfe2407496642bb7ceb591c82df023267558 Mon Sep 17 00:00:00 2001 From: Kimberly Dill-McFarland Date: Mon, 14 Apr 2025 10:12:52 -0700 Subject: [PATCH 01/10] typo --- vignette/SEAsnake_vignette.Rmd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vignette/SEAsnake_vignette.Rmd b/vignette/SEAsnake_vignette.Rmd index f0ab049..7e6a7c1 100755 --- a/vignette/SEAsnake_vignette.Rmd +++ b/vignette/SEAsnake_vignette.Rmd @@ -341,7 +341,7 @@ Your directory structure will look like library(DiagrammeR) grViz(diagram = "digraph flowchart { graph[rankdir = LR] - node [fontname = arial, shape = retangle, fontsize=10, height=0.1] + node [fontname = arial, shape = rectangle, fontsize=10, height=0.1] dir1 [label = '@@1'] dir2 [label = '@@2'] dir3 [label = '@@3'] From e8c54a068d386d5137fa351d13b9af54323405c9 Mon Sep 17 00:00:00 2001 From: Kimberly Dill-McFarland Date: Thu, 5 Jun 2025 16:14:15 -0700 Subject: [PATCH 02/10] Update pipeline with cutadapt --- Snakefile_step2 | 9 +++------ Snakefile_step2_single | 8 +++----- environment/Hissss_env.yaml | 2 +- 3 files changed, 7 insertions(+), 12 deletions(-) diff --git a/Snakefile_step2 b/Snakefile_step2 index 7aeff73..033dba1 100644 --- a/Snakefile_step2 +++ b/Snakefile_step2 @@ -36,10 +36,7 @@ rule adapterremoval: R2 = lambda wildcards: config["SampleList"][wildcards.sample]["R2"] output: R1 = "result/1_trim/{sample}_R1_trim.fastq.gz", - R2 = "result/1_trim/{sample}_R2_trim.fastq.gz", - single = "result/1_trim/{sample}_singleton.truncated.gz", - discard = "result/1_trim/{sample}.settings", - set = "result/1_trim/{sample}_discarded.gz" + R2 = "result/1_trim/{sample}_R2_trim.fastq.gz" params: trim5p = config["trim5p"], trimAdapt = config["trimAdapt"], @@ -50,9 +47,9 @@ rule adapterremoval: run: if params.trimAdapt == True: - shell("AdapterRemoval --file1 {input.R1} --file2 {input.R2} --output1 {output.R1} --output2 {output.R2} --singleton {output.single} --discarded {output.discard} --settings {output.set} --threads {threads} --gzip --maxns 1 --minlength 15 --trimqualities --minquality 30 --trim5p {params.trim5p} --adapter1 {params.adapter1} --adapter2 {params.adapter2}") + shell("cutadapt -g {params.adapter1} -a {params.adapter2} --cut {params.trim5p} --quality-cutoff 30 --max-n 1 --minimum-length 15 -o {output.R1} -p {output.R2} --cores={threads} {input.R1} {input.R2}") if params.trimAdapt == False: - shell("AdapterRemoval --file1 {input.R1} --file2 {input.R2} --output1 {output.R1} --output2 {output.R2} --singleton {output.single} --discarded {output.discard} --settings {output.set} --threads {threads} --gzip --maxns 1 --minlength 15 --trimqualities --minquality 30 --trim5p {params.trim5p}") + shell("cutadapt --cut {params.trim5p} --quality-cutoff 30 --max-n 1 --minimum-length 15 -o {output.R1} -p {output.R2} --cores={threads} {input.R1} {input.R2} ") if params.trimAdapt != True and params.trimAdapt != False: print("Error: Config trimAdapt parameter must be set to True or False.") diff --git a/Snakefile_step2_single b/Snakefile_step2_single index 1b466e7..a77e7df 100644 --- a/Snakefile_step2_single +++ b/Snakefile_step2_single @@ -30,9 +30,7 @@ rule adapterremoval: input: R1 = lambda wildcards: config["SampleList"][wildcards.sample]["R1"] output: - R1 = "result/1_trim/{sample}_R1_trim.fastq.gz", - discard = "result/1_trim/{sample}.settings", - set = "result/1_trim/{sample}_discarded.gz" + R1 = "result/1_trim/{sample}_R1_trim.fastq.gz" params: trim5p = config["trim5p"], trimAdapt = config["trimAdapt"], @@ -42,9 +40,9 @@ rule adapterremoval: run: if params.trimAdapt == True: - shell("AdapterRemoval --file1 {input.R1} --output1 {output.R1} --discarded {output.discard} --settings {output.set} --threads {threads} --gzip --maxns 1 --minlength 15 --trimqualities --minquality 30 --trim5p {params.trim5p} --adapter1 {params.adapter1}") + shell("cutadapt -g {params.adapter1} --cut {params.trim5p} --quality-cutoff 30 --max-n 1 --minimum-length 15 -o {output.R1} --cores={threads} {input.R1}") if params.trimAdapt == False: - shell("AdapterRemoval --file1 {input.R1} --output1 {output.R1} --discarded {output.discard} --settings {output.set} --threads {threads} --gzip --maxns 1 --minlength 15 --trimqualities --minquality 30 --trim5p {params.trim5p}") + shell("cutadapt --quality-cutoff 30 --max-n 1 --minimum-length 15 -o {output.R1} --cores={threads} {input.R1}") if params.trimAdapt != True and params.trimAdapt != False: print("Error: Config trimAdapt parameter must be set to True or False.") diff --git a/environment/Hissss_env.yaml b/environment/Hissss_env.yaml index df60ab9..0ea9690 100644 --- a/environment/Hissss_env.yaml +++ b/environment/Hissss_env.yaml @@ -12,7 +12,7 @@ dependencies: - bwa >=0.7 - pysam >=0.15 - fastqc >=0.11.9 - - adapterremoval >=2.3.2 + - cutadapt >=5.1 - bedtools >=2.30.0 - picard >=2.27.1 - star >=2.7.9a From 97b349992bb23ea0f7bdfc4eb40389156d5cc0b2 Mon Sep 17 00:00:00 2001 From: Kimberly Dill-McFarland Date: Thu, 5 Jun 2025 16:15:39 -0700 Subject: [PATCH 03/10] update vignette --- vignette/SEAsnake_vignette.Rmd | 6 +++--- vignette/SEAsnake_vignette.html | 16 ++++++++-------- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/vignette/SEAsnake_vignette.Rmd b/vignette/SEAsnake_vignette.Rmd index 7e6a7c1..fc593f8 100755 --- a/vignette/SEAsnake_vignette.Rmd +++ b/vignette/SEAsnake_vignette.Rmd @@ -19,7 +19,7 @@ urlcolor: blue [SEAsnake](https://github.com/BIGslu/SEAsnake) is a [snakemake](https://snakemake.readthedocs.io/en/stable/) pipeline to process bulk RNA-seq data from fastq sequences to gene counts. It includes the following steps. You can see an in-depth example of how to run these steps separately outside SEAsnake in our [first tutorial](https://github.com/BIGslu/tutorials/blob/main/RNAseq/1.Hawn_RNAseq_fastq.to.counts.pdf). 1. Quality assess sequences with [FastQC](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/) -2. Remove adapters and filter low quality sequences with [AdapterRemoval](https://adapterremoval.readthedocs.io/en/latest/) +2. Remove adapters and filter low quality sequences with [cutadapt](https://cutadapt.readthedocs.io/en/stable/index.html) - max N = 1 - min length = 15 - min quality = 30 @@ -60,7 +60,7 @@ SEAsnake step 1 completes 1 fastq in 30 to 45 min. Since this step is run in par SEAsnake step 2 completes 1 set of paired end fastq in roughly 10 hours. Using the same example, if you have 40 fastq from 20 paired end samples running on 14 cores, this is 20 / 14 * 10 = 14.3 hours. Non-paired end samples takes about half the time. -Step 2 will take less time if you do not have adapter contamination requiring alignment in `AdapterRemoval` (- 1 hour per single fastq or paired-end sample) or if you already have an indexed reference genome for `STAR` (- 1 hour from total time). +Step 2 will take less time if you do not have adapter contamination requiring alignment in `cutadapt` (- 1 hour per single fastq or paired-end sample) or if you already have an indexed reference genome for `STAR` (- 1 hour from total time). We are working on improving speed so please check for updates periodically. @@ -545,7 +545,7 @@ STAR_align 2 3 3 STAR_index 1 3 3 STAR_load 1 3 3 STAR_remove 1 1 1 -adapterremoval 2 1 1 +cutadapt 2 1 1 align_filter 2 1 1 all 1 1 1 combine 1 1 1 diff --git a/vignette/SEAsnake_vignette.html b/vignette/SEAsnake_vignette.html index 70ddf98..9b4a9db 100755 --- a/vignette/SEAsnake_vignette.html +++ b/vignette/SEAsnake_vignette.html @@ -2721,7 +2721,7 @@

SEAsnake vignette

snakemake pipeline from fastq to counts

Kim Dill-McFarland,

-

version February 10, 2025

+

version June 05, 2025

@@ -2735,7 +2735,7 @@

Introduction

tutorial.

  1. Quality assess sequences with FastQC
  2. -
  3. Remove adapters and filter low quality sequences with AdapterRemoval +
  4. Remove adapters and filter low quality sequences with cutadapt
    • max N = 1
    • min length = 15
    • @@ -2817,9 +2817,9 @@

      Time

      samples running on 14 cores, this is 20 / 14 * 10 = 14.3 hours. Non-paired end samples takes about half the time.

      Step 2 will take less time if you do not have adapter contamination -requiring alignment in AdapterRemoval (- 1 hour per single -fastq or paired-end sample) or if you already have an indexed reference -genome for STAR (- 1 hour from total time).

      +requiring alignment in cutadapt (- 1 hour per single fastq +or paired-end sample) or if you already have an indexed reference genome +for STAR (- 1 hour from total time).

      We are working on improving speed so please check for updates periodically.

      @@ -3089,8 +3089,8 @@

      Step 1

      retains all messages and errors in the log file.

      nohup snakemake --snakefile Snakefile_step1 --cores $cores >> log/SEAsnake_step1.log 2>&1 &

      Your directory structure will look like

      -
      - +
      +

      Breaking down the snakemake command

      For those less familiar with bash scripting, here is a detailed break @@ -3275,7 +3275,7 @@

      Step 2

      STAR_index 1 3 3 STAR_load 1 3 3 STAR_remove 1 1 1 -adapterremoval 2 1 1 +cutadapt 2 1 1 align_filter 2 1 1 all 1 1 1 combine 1 1 1 From f3e5919afdf78308024b32affdc885d1f81670f1 Mon Sep 17 00:00:00 2001 From: Kimberly Dill-McFarland Date: Thu, 5 Jun 2025 16:24:41 -0700 Subject: [PATCH 04/10] typo --- Snakefile_step2 | 34 +++++++++++++++++----------------- Snakefile_step2_single | 26 +++++++++++++------------- 2 files changed, 30 insertions(+), 30 deletions(-) diff --git a/Snakefile_step2 b/Snakefile_step2 index 033dba1..448d4c8 100644 --- a/Snakefile_step2 +++ b/Snakefile_step2 @@ -3,7 +3,7 @@ configfile: 'result/config.yaml' ## INPUTS / OUTPUTS SUFFIX = ["R1_trim", "R2_trim"] SAMPLES = config["SampleList"] - + TRIM1 = expand('result/1_trim/{sample}_R1_trim.fastq.gz', sample=SAMPLES) TRIM2 = expand('result/1_trim/{sample}_R2_trim.fastq.gz', sample=SAMPLES) QC_trim_html = expand("result/qc/2_fastqc_trim/{sample}_{suf}_fastqc.html", sample=SAMPLES, suf=SUFFIX) @@ -24,8 +24,8 @@ rule all: *( ['result/5_combined/combined_picard.tsv'] if config["picard"] else [] ), 'log/benchmark/STAR_index.benchmark.txt', 'log/benchmark/STAR_load.benchmark.txt' - - + + ##----------------------------------------## ## 1. Adapter removal & quality filtering ## ##----------------------------------------## @@ -45,11 +45,11 @@ rule adapterremoval: threads: config["threads"] * 0.1 message: "Adapter removal & quality filtering" - run: + run: if params.trimAdapt == True: - shell("cutadapt -g {params.adapter1} -a {params.adapter2} --cut {params.trim5p} --quality-cutoff 30 --max-n 1 --minimum-length 15 -o {output.R1} -p {output.R2} --cores={threads} {input.R1} {input.R2}") + shell("cutadapt -g {params.adapter1} -a {params.adapter2} --cut {params.trim5p} --quality-cutoff 30 --max-n 1 --minimum-length 15 -o {output.R1} -p {output.R2} --cores {threads} {input.R1} {input.R2}") if params.trimAdapt == False: - shell("cutadapt --cut {params.trim5p} --quality-cutoff 30 --max-n 1 --minimum-length 15 -o {output.R1} -p {output.R2} --cores={threads} {input.R1} {input.R2} ") + shell("cutadapt --cut {params.trim5p} --quality-cutoff 30 --max-n 1 --minimum-length 15 -o {output.R1} -p {output.R2} --cores {threads} {input.R1} {input.R2} ") if params.trimAdapt != True and params.trimAdapt != False: print("Error: Config trimAdapt parameter must be set to True or False.") @@ -77,7 +77,7 @@ STAR_index_SA = 'ref/release' + config["release"] + '/STARindex/SA' if (not os.path.exists(STAR_index_SA)): rule STAR_index: input: expand('result/qc/2_fastqc_trim/{sample}_{suf}_fastqc.html', sample=SAMPLES, suf=SUFFIX) - output: + output: IDX_done = touch('log/benchmark/STAR_index.benchmark.txt') params: release = config["release"], @@ -89,14 +89,14 @@ if (not os.path.exists(STAR_index_SA)): else: rule skip_index: input: expand('result/qc/2_fastqc_trim/{sample}_{suf}_fastqc.html', sample=SAMPLES, suf=SUFFIX) - output: + output: IDX_done = touch('log/benchmark/STAR_index.benchmark.txt') message: "Genome index already present. Skipping indexing rule." rule STAR_load: input: IDX_done = 'log/benchmark/STAR_index.benchmark.txt' - output: + output: LOAD_done = touch('log/benchmark/STAR_load.benchmark.txt') params: release = config["release"] @@ -104,13 +104,13 @@ rule STAR_load: message: 'Loading genome index' run: shell("STAR --genomeDir 'ref/release{params.release}/STARindex' --runThreadN {threads} --runRNGseed 8756 --genomeLoad LoadAndExit") - + rule STAR_align: input: R1 = "result/1_trim/{sample}_R1_trim.fastq.gz", R2 = "result/1_trim/{sample}_R2_trim.fastq.gz", LOAD_done = 'log/benchmark/STAR_load.benchmark.txt' - output: + output: BAM = 'result/2_bam/{sample}_Aligned.sortedByCoord.out.bam' params: OUT = 'result/2_bam/{sample}_', @@ -119,12 +119,12 @@ rule STAR_align: message: 'Aligning reads' run: shell("STAR --genomeDir 'ref/release{params.release}/STARindex' --readFilesIn {input.R1} {input.R2} --readFilesCommand zcat --outFileNamePrefix {params.OUT} --outSAMtype BAM SortedByCoordinate --runThreadN {threads} --runRNGseed 8756 --genomeLoad LoadAndKeep --limitBAMsortRAM 20000000000") - + rule STAR_remove: input: LOAD_done = 'log/benchmark/STAR_load.benchmark.txt', BAM = expand('result/2_bam/{sample}_Aligned.sortedByCoord.out.bam', sample=SAMPLES) - output: + output: REMOVE_done = touch('log/benchmark/STAR_remove.benchmark.txt') params: release = config["release"] @@ -140,10 +140,10 @@ rule STAR_remove: ##----------------------------------------## rule align_filter: - input: + input: BAM = 'result/2_bam/{sample}_Aligned.sortedByCoord.out.bam', REMOVE_done = 'log/benchmark/STAR_remove.benchmark.txt' - output: + output: BAM2 = 'result/3_bam_filter/{sample}_Aligned.sortedByCoord.filter.bam', BAI = 'result/3_bam_filter/{sample}_Aligned.sortedByCoord.filter.bam.bai' threads: config["threads"] * 0.1 @@ -151,9 +151,9 @@ rule align_filter: shell: """ samtools view {input.BAM} -h -b -f 3 -F 1284 -q 30 -@ {threads} > {output.BAM2} - samtools index -@ {threads} {output.BAM2} > {output.BAI} + samtools index -@ {threads} {output.BAM2} > {output.BAI} """ - + ##----------------------------------------## ## 5. ALIGNMENT QC ## ##----------------------------------------## diff --git a/Snakefile_step2_single b/Snakefile_step2_single index a77e7df..3682716 100644 --- a/Snakefile_step2_single +++ b/Snakefile_step2_single @@ -2,7 +2,7 @@ configfile: 'result/config.yaml' ## INPUTS / OUTPUTS SAMPLES = config["SampleList"] - + TRIM1 = expand('result/1_trim/{sample}_R1_trim.fastq.gz', sample=SAMPLES) QC_trim_html = expand("result/qc/2_fastqc_trim/{sample}_R1_trim_fastqc.html", sample=SAMPLES) QC_trim_zip = expand("result/qc/2_fastqc_trim/{sample}_R1_trim_fastqc.zip", sample=SAMPLES) @@ -21,7 +21,7 @@ rule all: *( ['result/5_combined/combined_picard.tsv'] if config["picard"] else [] ), 'log/benchmark/STAR_index.benchmark.txt', 'log/benchmark/STAR_load.benchmark.txt' - + ##----------------------------------------## ## 1. Adapter removal & quality filtering ## ##----------------------------------------## @@ -38,11 +38,11 @@ rule adapterremoval: threads: config["threads"] * 0.1 message: "Adapter removal & quality filtering" - run: + run: if params.trimAdapt == True: - shell("cutadapt -g {params.adapter1} --cut {params.trim5p} --quality-cutoff 30 --max-n 1 --minimum-length 15 -o {output.R1} --cores={threads} {input.R1}") + shell("cutadapt -g {params.adapter1} --cut {params.trim5p} --quality-cutoff 30 --max-n 1 --minimum-length 15 -o {output.R1} --cores {threads} {input.R1}") if params.trimAdapt == False: - shell("cutadapt --quality-cutoff 30 --max-n 1 --minimum-length 15 -o {output.R1} --cores={threads} {input.R1}") + shell("cutadapt --quality-cutoff 30 --max-n 1 --minimum-length 15 -o {output.R1} --cores {threads} {input.R1}") if params.trimAdapt != True and params.trimAdapt != False: print("Error: Config trimAdapt parameter must be set to True or False.") @@ -70,7 +70,7 @@ STAR_index_SA = 'ref/release' + config["release"] + '/STARindex/SA' if (not os.path.exists(STAR_index_SA)): rule STAR_index: input: expand('result/qc/2_fastqc_trim/{sample}_R1_trim_fastqc.html', sample=SAMPLES) - output: + output: IDX_done = touch('log/benchmark/STAR_index.benchmark.txt') params: release = config["release"], @@ -82,14 +82,14 @@ if (not os.path.exists(STAR_index_SA)): else: rule skip_index: input: expand('result/qc/2_fastqc_trim/{sample}_R1_trim_fastqc.html', sample=SAMPLES) - output: + output: IDX_done = touch('log/benchmark/STAR_index.benchmark.txt') message: "Genome index already present. Skipping indexing rule." rule STAR_load: input: IDX_done = 'log/benchmark/STAR_index.benchmark.txt' - output: + output: LOAD_done = touch('log/benchmark/STAR_load.benchmark.txt') params: release = config["release"] @@ -97,12 +97,12 @@ rule STAR_load: message: 'Loading genome index' run: shell("STAR --genomeDir 'ref/release{params.release}/STARindex' --runThreadN {threads} --runRNGseed 8756 --genomeLoad LoadAndExit") - + rule STAR_align: input: R1 = "result/1_trim/{sample}_R1_trim.fastq.gz", LOAD_done = 'log/benchmark/STAR_load.benchmark.txt' - output: + output: BAM = 'result/2_bam/{sample}_Aligned.sortedByCoord.out.bam' params: OUT = 'result/2_bam/{sample}_', @@ -111,12 +111,12 @@ rule STAR_align: message: 'Aligning reads' run: shell("STAR --genomeDir 'ref/release{params.release}/STARindex' --readFilesIn {input.R1} --readFilesCommand zcat --outFileNamePrefix {params.OUT} --outSAMtype BAM SortedByCoordinate --runThreadN {threads} --runRNGseed 8756 --genomeLoad LoadAndKeep --limitBAMsortRAM 20000000000") - + rule STAR_remove: input: LOAD_done = 'log/benchmark/STAR_load.benchmark.txt', BAM = expand('result/2_bam/{sample}_Aligned.sortedByCoord.out.bam', sample=SAMPLES) - output: + output: REMOVE_done = touch('log/benchmark/STAR_remove.benchmark.txt') params: release = config["release"] @@ -132,7 +132,7 @@ rule STAR_remove: ##----------------------------------------## rule align_filter: - input: + input: BAM = 'result/2_bam/{sample}_Aligned.sortedByCoord.out.bam', REMOVE_done = 'log/benchmark/STAR_remove.benchmark.txt' output: 'result/3_bam_filter/{sample}_Aligned.sortedByCoord.filter.bam' From fdc2983a3180b55f871b99d44b070f65797c877b Mon Sep 17 00:00:00 2001 From: Kimberly Dill-McFarland Date: Thu, 11 Sep 2025 16:45:27 -0700 Subject: [PATCH 05/10] Add python dependencies --- environment/Hissss_env.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/environment/Hissss_env.yaml b/environment/Hissss_env.yaml index 0ea9690..3909331 100644 --- a/environment/Hissss_env.yaml +++ b/environment/Hissss_env.yaml @@ -17,3 +17,5 @@ dependencies: - picard >=2.27.1 - star >=2.7.9a - subread >=2.0.1 + - python >=3.11 + - pandas >=1.6 From c5782f6624fa55bb3748fc37ddf7353c74bcb404 Mon Sep 17 00:00:00 2001 From: Kimberly Dill-McFarland Date: Thu, 11 Sep 2025 16:45:40 -0700 Subject: [PATCH 06/10] rm sudo usage --- scripts/create_config.sh | 38 +++++++++++++++++++------------------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/scripts/create_config.sh b/scripts/create_config.sh index 5ff0431..efe713c 100755 --- a/scripts/create_config.sh +++ b/scripts/create_config.sh @@ -6,7 +6,7 @@ SampleList="data/*_R1*" -sudo echo "SampleList:" > result/config.yaml +echo "SampleList:" > result/config.yaml spacer1=": " @@ -25,11 +25,11 @@ sample2=`echo "${sample/R1/R2}"` sample_name=`echo "$(basename $sample)" | grep -o '^.*_L[0-9][0-9][0-9]' | sed 's/_L[0-9][0-9][0-9]$//'` # Add sample name to config -sudo echo " " "$sample_name$spacer1" >> result/config.yaml -sudo echo " sample: '"$sample_name"'" >> result/config.yaml +echo " " "$sample_name$spacer1" >> result/config.yaml +echo " sample: '"$sample_name"'" >> result/config.yaml # Add fastq files to config -sudo echo " R1: '"$sample"'" >> result/config.yaml -sudo echo " R2: '"$sample2"'" >> result/config.yaml +echo " R1: '"$sample"'" >> result/config.yaml +echo " R2: '"$sample2"'" >> result/config.yaml done ################################# @@ -44,7 +44,7 @@ cores2=1 fi # Add default param to config -sudo echo " +echo " # Adapter removal ## Base pairs to trim from 5' end @@ -58,8 +58,8 @@ adapter2: AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGT ## Species the format 'Homo_sapiens.GRCh38' or 'Mus_musculus.GRCm39' genome: 'Homo_sapiens.GRCh38' -## Genome release number. Current as of 2022.04.14 -release: '106' +## Genome release number. Current as of 2025.06.05 +release: '114' # Alignment metrics ## Run Picard? @@ -71,17 +71,17 @@ threads: $cores2 # Setup directory structure -sudo mkdir -p -m 777 'result/qc/1_fastqc_raw' -sudo mkdir -p -m 777 'result/qc/2_fastqc_trim' -sudo mkdir -p -m 777 'result/qc/3_flagstat' -sudo mkdir -p -m 777 'result/qc/4_picard' +mkdir -p -m 777 'result/qc/1_fastqc_raw' +mkdir -p -m 777 'result/qc/2_fastqc_trim' +mkdir -p -m 777 'result/qc/3_flagstat' +mkdir -p -m 777 'result/qc/4_picard' -sudo mkdir -p -m 777 'result/1_trim' -sudo mkdir -p -m 777 'result/2_bam' -sudo mkdir -p -m 777 'result/3_bam_filter' -sudo mkdir -p -m 777 'result/4_count' -sudo mkdir -p -m 777 'result/5_combined' +mkdir -p -m 777 'result/1_trim' +mkdir -p -m 777 'result/2_bam' +mkdir -p -m 777 'result/3_bam_filter' +mkdir -p -m 777 'result/4_count' +mkdir -p -m 777 'result/5_combined' -sudo mkdir -p -m 777 'ref' -sudo mkdir -p -m 777 'log' +mkdir -p -m 777 'ref' +mkdir -p -m 777 'log' From 5ce03661de7abe532be9829481e1e609431eb2c1 Mon Sep 17 00:00:00 2001 From: Kimberly Dill-McFarland Date: Thu, 11 Sep 2025 16:46:08 -0700 Subject: [PATCH 07/10] Make robust to small # of threads input --- Snakefile_step2 | 16 ++++++++-------- Snakefile_step2_single | 16 ++++++++-------- 2 files changed, 16 insertions(+), 16 deletions(-) diff --git a/Snakefile_step2 b/Snakefile_step2 index 448d4c8..dc3030e 100644 --- a/Snakefile_step2 +++ b/Snakefile_step2 @@ -42,7 +42,7 @@ rule adapterremoval: trimAdapt = config["trimAdapt"], adapter1 = config["adapter1"], adapter2 = config["adapter2"] - threads: config["threads"] * 0.1 + threads: max(1, int(config["threads"]*0.1)) message: "Adapter removal & quality filtering" run: @@ -62,7 +62,7 @@ rule fastqc_trim: output: HTML = 'result/qc/2_fastqc_trim/{sample}_{suf}_fastqc.html', ZIP = 'result/qc/2_fastqc_trim/{sample}_{suf}_fastqc.zip' - threads: config["threads"] * 0.1 + threads: max(1, int(config["threads"]*0.1)) message: "Quality assessment, FastQC" shell: """ @@ -82,7 +82,7 @@ if (not os.path.exists(STAR_index_SA)): params: release = config["release"], genome = config["genome"] - threads: config["threads"] * 0.5 + threads: max(1, int(config["threads"]*0.5)) message: "Generating genome index" run: shell("scripts/STAR_index.sh {params.release} {params.genome} {threads}") @@ -100,7 +100,7 @@ rule STAR_load: LOAD_done = touch('log/benchmark/STAR_load.benchmark.txt') params: release = config["release"] - threads: config["threads"] * 0.5 + threads: max(1, int(config["threads"]*0.5)) message: 'Loading genome index' run: shell("STAR --genomeDir 'ref/release{params.release}/STARindex' --runThreadN {threads} --runRNGseed 8756 --genomeLoad LoadAndExit") @@ -115,7 +115,7 @@ rule STAR_align: params: OUT = 'result/2_bam/{sample}_', release = config["release"] - threads: config["threads"] * 0.25 + threads: max(1, int(config["threads"]*0.25)) message: 'Aligning reads' run: shell("STAR --genomeDir 'ref/release{params.release}/STARindex' --readFilesIn {input.R1} {input.R2} --readFilesCommand zcat --outFileNamePrefix {params.OUT} --outSAMtype BAM SortedByCoordinate --runThreadN {threads} --runRNGseed 8756 --genomeLoad LoadAndKeep --limitBAMsortRAM 20000000000") @@ -146,7 +146,7 @@ rule align_filter: output: BAM2 = 'result/3_bam_filter/{sample}_Aligned.sortedByCoord.filter.bam', BAI = 'result/3_bam_filter/{sample}_Aligned.sortedByCoord.filter.bam.bai' - threads: config["threads"] * 0.1 + threads: max(1, int(config["threads"]*0.1)) message: "Filtering alignments" shell: """ @@ -161,7 +161,7 @@ rule align_filter: rule flagstat: input: 'result/3_bam_filter/{sample}_Aligned.sortedByCoord.filter.bam' output: 'result/qc/3_flagstat/{sample}_flagstat.tsv' - threads: config["threads"] * 0.1 + threads: max(1, int(config["threads"]*0.1)) message: "Quality control, flagstat" run: shell("samtools flagstat -@ {threads} {input} > {output}") @@ -194,7 +194,7 @@ rule fcount: params: release = config["release"], genome = config["genome"] - threads: config["threads"] * 0.1 + threads: max(1, int(config["threads"]*0.1)) message: "Counting features" run: shell("featureCounts -T {threads} -g gene_id -t exon -p -a 'ref/release{params.release}/STARref/{params.genome}.{params.release}.gtf' -o {output} {input.bam}") diff --git a/Snakefile_step2_single b/Snakefile_step2_single index 3682716..3149a2d 100644 --- a/Snakefile_step2_single +++ b/Snakefile_step2_single @@ -35,7 +35,7 @@ rule adapterremoval: trim5p = config["trim5p"], trimAdapt = config["trimAdapt"], adapter1 = config["adapter1"] - threads: config["threads"] * 0.1 + threads: max(1, int(config["threads"]*0.1)) message: "Adapter removal & quality filtering" run: @@ -55,7 +55,7 @@ rule fastqc_trim: output: HTML = 'result/qc/2_fastqc_trim/{sample}_R1_trim_fastqc.html', ZIP = 'result/qc/2_fastqc_trim/{sample}_R1_trim_fastqc.zip' - threads: config["threads"] * 0.1 + threads: max(1, int(config["threads"]*0.1)) message: "Quality assessment, FastQC" shell: """ @@ -75,7 +75,7 @@ if (not os.path.exists(STAR_index_SA)): params: release = config["release"], genome = config["genome"] - threads: config["threads"] * 0.5 + threads: max(1, int(config["threads"]*0.5)) message: "Generating genome index" run: shell("scripts/STAR_index.sh {params.release} {params.genome} {threads}") @@ -93,7 +93,7 @@ rule STAR_load: LOAD_done = touch('log/benchmark/STAR_load.benchmark.txt') params: release = config["release"] - threads: config["threads"] * 0.5 + threads: max(1, int(config["threads"]*0.5)) message: 'Loading genome index' run: shell("STAR --genomeDir 'ref/release{params.release}/STARindex' --runThreadN {threads} --runRNGseed 8756 --genomeLoad LoadAndExit") @@ -107,7 +107,7 @@ rule STAR_align: params: OUT = 'result/2_bam/{sample}_', release = config["release"] - threads: config["threads"] * 0.25 + threads: max(1, int(config["threads"]*0.25)) message: 'Aligning reads' run: shell("STAR --genomeDir 'ref/release{params.release}/STARindex' --readFilesIn {input.R1} --readFilesCommand zcat --outFileNamePrefix {params.OUT} --outSAMtype BAM SortedByCoordinate --runThreadN {threads} --runRNGseed 8756 --genomeLoad LoadAndKeep --limitBAMsortRAM 20000000000") @@ -136,7 +136,7 @@ rule align_filter: BAM = 'result/2_bam/{sample}_Aligned.sortedByCoord.out.bam', REMOVE_done = 'log/benchmark/STAR_remove.benchmark.txt' output: 'result/3_bam_filter/{sample}_Aligned.sortedByCoord.filter.bam' - threads: config["threads"] * 0.1 + threads: max(1, int(config["threads"]*0.1)) message: "Filtering alignments" run: shell("samtools view {input.BAM} -b -h -F 1284 -q 30 -@ {threads} > {output}") @@ -149,7 +149,7 @@ rule align_filter: rule flagstat: input: 'result/3_bam_filter/{sample}_Aligned.sortedByCoord.filter.bam' output: 'result/qc/3_flagstat/{sample}_flagstat.tsv' - threads: config["threads"] * 0.1 + threads: max(1, int(config["threads"]*0.1)) message: "Quality control, flagstat" run: shell("samtools flagstat -@ {threads} {input} > {output}") @@ -182,7 +182,7 @@ rule fcount: params: release = config["release"], genome = config["genome"] - threads: config["threads"] * 0.1 + threads: max(1, int(config["threads"]*0.1)) message: "Counting features" run: shell("featureCounts -T {threads} -g gene_id -t exon -p -a 'ref/release{params.release}/STARref/{params.genome}.{params.release}.gtf' -o {output} {input.bam}") From 935968109ffa9eb4a682a2e7b117fae32f446fad Mon Sep 17 00:00:00 2001 From: Kimberly Dill-McFarland Date: Thu, 11 Sep 2025 16:46:22 -0700 Subject: [PATCH 08/10] clean star command --- Snakefile_step2 | 15 +++++++++++++-- Snakefile_step2_single | 15 +++++++++++++-- 2 files changed, 26 insertions(+), 4 deletions(-) diff --git a/Snakefile_step2 b/Snakefile_step2 index dc3030e..42d89d3 100644 --- a/Snakefile_step2 +++ b/Snakefile_step2 @@ -117,8 +117,19 @@ rule STAR_align: release = config["release"] threads: max(1, int(config["threads"]*0.25)) message: 'Aligning reads' - run: - shell("STAR --genomeDir 'ref/release{params.release}/STARindex' --readFilesIn {input.R1} {input.R2} --readFilesCommand zcat --outFileNamePrefix {params.OUT} --outSAMtype BAM SortedByCoordinate --runThreadN {threads} --runRNGseed 8756 --genomeLoad LoadAndKeep --limitBAMsortRAM 20000000000") + shell: + """ + STAR \ + --genomeDir ref/release{params.release}/STARindex \ + --readFilesCommand zcat \ + --readFilesIn {input.R1} {input.R2} \ + --outFileNamePrefix {params.OUT} \ + --outSAMtype BAM SortedByCoordinate \ + --runThreadN {threads} \ + --runRNGseed 8756 \ + --genomeLoad LoadAndKeep \ + --limitBAMsortRAM 20000000000 + """ rule STAR_remove: input: diff --git a/Snakefile_step2_single b/Snakefile_step2_single index 3149a2d..2542196 100644 --- a/Snakefile_step2_single +++ b/Snakefile_step2_single @@ -109,8 +109,19 @@ rule STAR_align: release = config["release"] threads: max(1, int(config["threads"]*0.25)) message: 'Aligning reads' - run: - shell("STAR --genomeDir 'ref/release{params.release}/STARindex' --readFilesIn {input.R1} --readFilesCommand zcat --outFileNamePrefix {params.OUT} --outSAMtype BAM SortedByCoordinate --runThreadN {threads} --runRNGseed 8756 --genomeLoad LoadAndKeep --limitBAMsortRAM 20000000000") + shell: + """ + STAR \ + --genomeDir ref/release{params.release}/STARindex \ + --readFilesCommand zcat \ + --readFilesIn {input.R1} \ + --outFileNamePrefix {params.OUT} \ + --outSAMtype BAM SortedByCoordinate \ + --runThreadN {threads} \ + --runRNGseed 8756 \ + --genomeLoad LoadAndKeep \ + --limitBAMsortRAM 20000000000 + """ rule STAR_remove: input: From d859e721df6066844999bf180c101c80b930a3f9 Mon Sep 17 00:00:00 2001 From: Kimberly Dill-McFarland Date: Fri, 12 Sep 2025 16:17:33 -0700 Subject: [PATCH 09/10] fix cutadapt install issue on Mac --- environment/Hissss_env.yaml | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/environment/Hissss_env.yaml b/environment/Hissss_env.yaml index 3909331..98a6232 100644 --- a/environment/Hissss_env.yaml +++ b/environment/Hissss_env.yaml @@ -1,6 +1,7 @@ channels: - bioconda - conda-forge + - defaults dependencies: - snakemake-minimal >=5.24.1 - jinja2 >=2.11 @@ -12,10 +13,12 @@ dependencies: - bwa >=0.7 - pysam >=0.15 - fastqc >=0.11.9 - - cutadapt >=5.1 - bedtools >=2.30.0 - picard >=2.27.1 - star >=2.7.9a - subread >=2.0.1 - python >=3.11 - pandas >=1.6 + - pip + - pip: + - cutadapt>=5.1 From f821d539dd67925a5349cab2fae668e56132ba75 Mon Sep 17 00:00:00 2001 From: Kimberly Dill-McFarland Date: Fri, 12 Sep 2025 16:30:31 -0700 Subject: [PATCH 10/10] typo --- Snakefile_step2 | 1 - 1 file changed, 1 deletion(-) diff --git a/Snakefile_step2 b/Snakefile_step2 index 42d89d3..0aaa604 100644 --- a/Snakefile_step2 +++ b/Snakefile_step2 @@ -44,7 +44,6 @@ rule adapterremoval: adapter2 = config["adapter2"] threads: max(1, int(config["threads"]*0.1)) message: "Adapter removal & quality filtering" - run: if params.trimAdapt == True: shell("cutadapt -g {params.adapter1} -a {params.adapter2} --cut {params.trim5p} --quality-cutoff 30 --max-n 1 --minimum-length 15 -o {output.R1} -p {output.R2} --cores {threads} {input.R1} {input.R2}")