From 2536bfe2407496642bb7ceb591c82df023267558 Mon Sep 17 00:00:00 2001
From: Kimberly Dill-McFarland
Date: Mon, 14 Apr 2025 10:12:52 -0700
Subject: [PATCH 01/10] typo
---
vignette/SEAsnake_vignette.Rmd | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/vignette/SEAsnake_vignette.Rmd b/vignette/SEAsnake_vignette.Rmd
index f0ab049..7e6a7c1 100755
--- a/vignette/SEAsnake_vignette.Rmd
+++ b/vignette/SEAsnake_vignette.Rmd
@@ -341,7 +341,7 @@ Your directory structure will look like
library(DiagrammeR)
grViz(diagram = "digraph flowchart {
graph[rankdir = LR]
- node [fontname = arial, shape = retangle, fontsize=10, height=0.1]
+ node [fontname = arial, shape = rectangle, fontsize=10, height=0.1]
dir1 [label = '@@1']
dir2 [label = '@@2']
dir3 [label = '@@3']
From e8c54a068d386d5137fa351d13b9af54323405c9 Mon Sep 17 00:00:00 2001
From: Kimberly Dill-McFarland
Date: Thu, 5 Jun 2025 16:14:15 -0700
Subject: [PATCH 02/10] Update pipeline with cutadapt
---
Snakefile_step2 | 9 +++------
Snakefile_step2_single | 8 +++-----
environment/Hissss_env.yaml | 2 +-
3 files changed, 7 insertions(+), 12 deletions(-)
diff --git a/Snakefile_step2 b/Snakefile_step2
index 7aeff73..033dba1 100644
--- a/Snakefile_step2
+++ b/Snakefile_step2
@@ -36,10 +36,7 @@ rule adapterremoval:
R2 = lambda wildcards: config["SampleList"][wildcards.sample]["R2"]
output:
R1 = "result/1_trim/{sample}_R1_trim.fastq.gz",
- R2 = "result/1_trim/{sample}_R2_trim.fastq.gz",
- single = "result/1_trim/{sample}_singleton.truncated.gz",
- discard = "result/1_trim/{sample}.settings",
- set = "result/1_trim/{sample}_discarded.gz"
+ R2 = "result/1_trim/{sample}_R2_trim.fastq.gz"
params:
trim5p = config["trim5p"],
trimAdapt = config["trimAdapt"],
@@ -50,9 +47,9 @@ rule adapterremoval:
run:
if params.trimAdapt == True:
- shell("AdapterRemoval --file1 {input.R1} --file2 {input.R2} --output1 {output.R1} --output2 {output.R2} --singleton {output.single} --discarded {output.discard} --settings {output.set} --threads {threads} --gzip --maxns 1 --minlength 15 --trimqualities --minquality 30 --trim5p {params.trim5p} --adapter1 {params.adapter1} --adapter2 {params.adapter2}")
+ shell("cutadapt -g {params.adapter1} -a {params.adapter2} --cut {params.trim5p} --quality-cutoff 30 --max-n 1 --minimum-length 15 -o {output.R1} -p {output.R2} --cores={threads} {input.R1} {input.R2}")
if params.trimAdapt == False:
- shell("AdapterRemoval --file1 {input.R1} --file2 {input.R2} --output1 {output.R1} --output2 {output.R2} --singleton {output.single} --discarded {output.discard} --settings {output.set} --threads {threads} --gzip --maxns 1 --minlength 15 --trimqualities --minquality 30 --trim5p {params.trim5p}")
+ shell("cutadapt --cut {params.trim5p} --quality-cutoff 30 --max-n 1 --minimum-length 15 -o {output.R1} -p {output.R2} --cores={threads} {input.R1} {input.R2} ")
if params.trimAdapt != True and params.trimAdapt != False:
print("Error: Config trimAdapt parameter must be set to True or False.")
diff --git a/Snakefile_step2_single b/Snakefile_step2_single
index 1b466e7..a77e7df 100644
--- a/Snakefile_step2_single
+++ b/Snakefile_step2_single
@@ -30,9 +30,7 @@ rule adapterremoval:
input:
R1 = lambda wildcards: config["SampleList"][wildcards.sample]["R1"]
output:
- R1 = "result/1_trim/{sample}_R1_trim.fastq.gz",
- discard = "result/1_trim/{sample}.settings",
- set = "result/1_trim/{sample}_discarded.gz"
+ R1 = "result/1_trim/{sample}_R1_trim.fastq.gz"
params:
trim5p = config["trim5p"],
trimAdapt = config["trimAdapt"],
@@ -42,9 +40,9 @@ rule adapterremoval:
run:
if params.trimAdapt == True:
- shell("AdapterRemoval --file1 {input.R1} --output1 {output.R1} --discarded {output.discard} --settings {output.set} --threads {threads} --gzip --maxns 1 --minlength 15 --trimqualities --minquality 30 --trim5p {params.trim5p} --adapter1 {params.adapter1}")
+ shell("cutadapt -g {params.adapter1} --cut {params.trim5p} --quality-cutoff 30 --max-n 1 --minimum-length 15 -o {output.R1} --cores={threads} {input.R1}")
if params.trimAdapt == False:
- shell("AdapterRemoval --file1 {input.R1} --output1 {output.R1} --discarded {output.discard} --settings {output.set} --threads {threads} --gzip --maxns 1 --minlength 15 --trimqualities --minquality 30 --trim5p {params.trim5p}")
+ shell("cutadapt --quality-cutoff 30 --max-n 1 --minimum-length 15 -o {output.R1} --cores={threads} {input.R1}")
if params.trimAdapt != True and params.trimAdapt != False:
print("Error: Config trimAdapt parameter must be set to True or False.")
diff --git a/environment/Hissss_env.yaml b/environment/Hissss_env.yaml
index df60ab9..0ea9690 100644
--- a/environment/Hissss_env.yaml
+++ b/environment/Hissss_env.yaml
@@ -12,7 +12,7 @@ dependencies:
- bwa >=0.7
- pysam >=0.15
- fastqc >=0.11.9
- - adapterremoval >=2.3.2
+ - cutadapt >=5.1
- bedtools >=2.30.0
- picard >=2.27.1
- star >=2.7.9a
From 97b349992bb23ea0f7bdfc4eb40389156d5cc0b2 Mon Sep 17 00:00:00 2001
From: Kimberly Dill-McFarland
Date: Thu, 5 Jun 2025 16:15:39 -0700
Subject: [PATCH 03/10] update vignette
---
vignette/SEAsnake_vignette.Rmd | 6 +++---
vignette/SEAsnake_vignette.html | 16 ++++++++--------
2 files changed, 11 insertions(+), 11 deletions(-)
diff --git a/vignette/SEAsnake_vignette.Rmd b/vignette/SEAsnake_vignette.Rmd
index 7e6a7c1..fc593f8 100755
--- a/vignette/SEAsnake_vignette.Rmd
+++ b/vignette/SEAsnake_vignette.Rmd
@@ -19,7 +19,7 @@ urlcolor: blue
[SEAsnake](https://github.com/BIGslu/SEAsnake) is a [snakemake](https://snakemake.readthedocs.io/en/stable/) pipeline to process bulk RNA-seq data from fastq sequences to gene counts. It includes the following steps. You can see an in-depth example of how to run these steps separately outside SEAsnake in our [first tutorial](https://github.com/BIGslu/tutorials/blob/main/RNAseq/1.Hawn_RNAseq_fastq.to.counts.pdf).
1. Quality assess sequences with [FastQC](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/)
-2. Remove adapters and filter low quality sequences with [AdapterRemoval](https://adapterremoval.readthedocs.io/en/latest/)
+2. Remove adapters and filter low quality sequences with [cutadapt](https://cutadapt.readthedocs.io/en/stable/index.html)
- max N = 1
- min length = 15
- min quality = 30
@@ -60,7 +60,7 @@ SEAsnake step 1 completes 1 fastq in 30 to 45 min. Since this step is run in par
SEAsnake step 2 completes 1 set of paired end fastq in roughly 10 hours. Using the same example, if you have 40 fastq from 20 paired end samples running on 14 cores, this is 20 / 14 * 10 = 14.3 hours. Non-paired end samples takes about half the time.
-Step 2 will take less time if you do not have adapter contamination requiring alignment in `AdapterRemoval` (- 1 hour per single fastq or paired-end sample) or if you already have an indexed reference genome for `STAR` (- 1 hour from total time).
+Step 2 will take less time if you do not have adapter contamination requiring alignment in `cutadapt` (- 1 hour per single fastq or paired-end sample) or if you already have an indexed reference genome for `STAR` (- 1 hour from total time).
We are working on improving speed so please check for updates periodically.
@@ -545,7 +545,7 @@ STAR_align 2 3 3
STAR_index 1 3 3
STAR_load 1 3 3
STAR_remove 1 1 1
-adapterremoval 2 1 1
+cutadapt 2 1 1
align_filter 2 1 1
all 1 1 1
combine 1 1 1
diff --git a/vignette/SEAsnake_vignette.html b/vignette/SEAsnake_vignette.html
index 70ddf98..9b4a9db 100755
--- a/vignette/SEAsnake_vignette.html
+++ b/vignette/SEAsnake_vignette.html
@@ -2721,7 +2721,7 @@
SEAsnake vignette
snakemake pipeline from fastq to counts
-version February 10, 2025
+version June 05, 2025
@@ -2735,7 +2735,7 @@ Introduction
tutorial.
- Quality assess sequences with FastQC
-- Remove adapters and filter low quality sequences with AdapterRemoval
+
- Remove adapters and filter low quality sequences with cutadapt
- max N = 1
- min length = 15
@@ -2817,9 +2817,9 @@ Time
samples running on 14 cores, this is 20 / 14 * 10 = 14.3 hours.
Non-paired end samples takes about half the time.
Step 2 will take less time if you do not have adapter contamination
-requiring alignment in AdapterRemoval (- 1 hour per single
-fastq or paired-end sample) or if you already have an indexed reference
-genome for STAR (- 1 hour from total time).
+requiring alignment in cutadapt (- 1 hour per single fastq
+or paired-end sample) or if you already have an indexed reference genome
+for STAR (- 1 hour from total time).
We are working on improving speed so please check for updates
periodically.
@@ -3089,8 +3089,8 @@ Step 1
retains all messages and errors in the log file.
nohup snakemake --snakefile Snakefile_step1 --cores $cores >> log/SEAsnake_step1.log 2>&1 &
Your directory structure will look like
-
-
+
+
Breaking down the snakemake command
For those less familiar with bash scripting, here is a detailed break
@@ -3275,7 +3275,7 @@
Step 2
STAR_index 1 3 3
STAR_load 1 3 3
STAR_remove 1 1 1
-adapterremoval 2 1 1
+cutadapt 2 1 1
align_filter 2 1 1
all 1 1 1
combine 1 1 1
From f3e5919afdf78308024b32affdc885d1f81670f1 Mon Sep 17 00:00:00 2001
From: Kimberly Dill-McFarland
Date: Thu, 5 Jun 2025 16:24:41 -0700
Subject: [PATCH 04/10] typo
---
Snakefile_step2 | 34 +++++++++++++++++-----------------
Snakefile_step2_single | 26 +++++++++++++-------------
2 files changed, 30 insertions(+), 30 deletions(-)
diff --git a/Snakefile_step2 b/Snakefile_step2
index 033dba1..448d4c8 100644
--- a/Snakefile_step2
+++ b/Snakefile_step2
@@ -3,7 +3,7 @@ configfile: 'result/config.yaml'
## INPUTS / OUTPUTS
SUFFIX = ["R1_trim", "R2_trim"]
SAMPLES = config["SampleList"]
-
+
TRIM1 = expand('result/1_trim/{sample}_R1_trim.fastq.gz', sample=SAMPLES)
TRIM2 = expand('result/1_trim/{sample}_R2_trim.fastq.gz', sample=SAMPLES)
QC_trim_html = expand("result/qc/2_fastqc_trim/{sample}_{suf}_fastqc.html", sample=SAMPLES, suf=SUFFIX)
@@ -24,8 +24,8 @@ rule all:
*( ['result/5_combined/combined_picard.tsv'] if config["picard"] else [] ),
'log/benchmark/STAR_index.benchmark.txt',
'log/benchmark/STAR_load.benchmark.txt'
-
-
+
+
##----------------------------------------##
## 1. Adapter removal & quality filtering ##
##----------------------------------------##
@@ -45,11 +45,11 @@ rule adapterremoval:
threads: config["threads"] * 0.1
message: "Adapter removal & quality filtering"
- run:
+ run:
if params.trimAdapt == True:
- shell("cutadapt -g {params.adapter1} -a {params.adapter2} --cut {params.trim5p} --quality-cutoff 30 --max-n 1 --minimum-length 15 -o {output.R1} -p {output.R2} --cores={threads} {input.R1} {input.R2}")
+ shell("cutadapt -g {params.adapter1} -a {params.adapter2} --cut {params.trim5p} --quality-cutoff 30 --max-n 1 --minimum-length 15 -o {output.R1} -p {output.R2} --cores {threads} {input.R1} {input.R2}")
if params.trimAdapt == False:
- shell("cutadapt --cut {params.trim5p} --quality-cutoff 30 --max-n 1 --minimum-length 15 -o {output.R1} -p {output.R2} --cores={threads} {input.R1} {input.R2} ")
+ shell("cutadapt --cut {params.trim5p} --quality-cutoff 30 --max-n 1 --minimum-length 15 -o {output.R1} -p {output.R2} --cores {threads} {input.R1} {input.R2} ")
if params.trimAdapt != True and params.trimAdapt != False:
print("Error: Config trimAdapt parameter must be set to True or False.")
@@ -77,7 +77,7 @@ STAR_index_SA = 'ref/release' + config["release"] + '/STARindex/SA'
if (not os.path.exists(STAR_index_SA)):
rule STAR_index:
input: expand('result/qc/2_fastqc_trim/{sample}_{suf}_fastqc.html', sample=SAMPLES, suf=SUFFIX)
- output:
+ output:
IDX_done = touch('log/benchmark/STAR_index.benchmark.txt')
params:
release = config["release"],
@@ -89,14 +89,14 @@ if (not os.path.exists(STAR_index_SA)):
else:
rule skip_index:
input: expand('result/qc/2_fastqc_trim/{sample}_{suf}_fastqc.html', sample=SAMPLES, suf=SUFFIX)
- output:
+ output:
IDX_done = touch('log/benchmark/STAR_index.benchmark.txt')
message: "Genome index already present. Skipping indexing rule."
rule STAR_load:
input:
IDX_done = 'log/benchmark/STAR_index.benchmark.txt'
- output:
+ output:
LOAD_done = touch('log/benchmark/STAR_load.benchmark.txt')
params:
release = config["release"]
@@ -104,13 +104,13 @@ rule STAR_load:
message: 'Loading genome index'
run:
shell("STAR --genomeDir 'ref/release{params.release}/STARindex' --runThreadN {threads} --runRNGseed 8756 --genomeLoad LoadAndExit")
-
+
rule STAR_align:
input:
R1 = "result/1_trim/{sample}_R1_trim.fastq.gz",
R2 = "result/1_trim/{sample}_R2_trim.fastq.gz",
LOAD_done = 'log/benchmark/STAR_load.benchmark.txt'
- output:
+ output:
BAM = 'result/2_bam/{sample}_Aligned.sortedByCoord.out.bam'
params:
OUT = 'result/2_bam/{sample}_',
@@ -119,12 +119,12 @@ rule STAR_align:
message: 'Aligning reads'
run:
shell("STAR --genomeDir 'ref/release{params.release}/STARindex' --readFilesIn {input.R1} {input.R2} --readFilesCommand zcat --outFileNamePrefix {params.OUT} --outSAMtype BAM SortedByCoordinate --runThreadN {threads} --runRNGseed 8756 --genomeLoad LoadAndKeep --limitBAMsortRAM 20000000000")
-
+
rule STAR_remove:
input:
LOAD_done = 'log/benchmark/STAR_load.benchmark.txt',
BAM = expand('result/2_bam/{sample}_Aligned.sortedByCoord.out.bam', sample=SAMPLES)
- output:
+ output:
REMOVE_done = touch('log/benchmark/STAR_remove.benchmark.txt')
params:
release = config["release"]
@@ -140,10 +140,10 @@ rule STAR_remove:
##----------------------------------------##
rule align_filter:
- input:
+ input:
BAM = 'result/2_bam/{sample}_Aligned.sortedByCoord.out.bam',
REMOVE_done = 'log/benchmark/STAR_remove.benchmark.txt'
- output:
+ output:
BAM2 = 'result/3_bam_filter/{sample}_Aligned.sortedByCoord.filter.bam',
BAI = 'result/3_bam_filter/{sample}_Aligned.sortedByCoord.filter.bam.bai'
threads: config["threads"] * 0.1
@@ -151,9 +151,9 @@ rule align_filter:
shell:
"""
samtools view {input.BAM} -h -b -f 3 -F 1284 -q 30 -@ {threads} > {output.BAM2}
- samtools index -@ {threads} {output.BAM2} > {output.BAI}
+ samtools index -@ {threads} {output.BAM2} > {output.BAI}
"""
-
+
##----------------------------------------##
## 5. ALIGNMENT QC ##
##----------------------------------------##
diff --git a/Snakefile_step2_single b/Snakefile_step2_single
index a77e7df..3682716 100644
--- a/Snakefile_step2_single
+++ b/Snakefile_step2_single
@@ -2,7 +2,7 @@ configfile: 'result/config.yaml'
## INPUTS / OUTPUTS
SAMPLES = config["SampleList"]
-
+
TRIM1 = expand('result/1_trim/{sample}_R1_trim.fastq.gz', sample=SAMPLES)
QC_trim_html = expand("result/qc/2_fastqc_trim/{sample}_R1_trim_fastqc.html", sample=SAMPLES)
QC_trim_zip = expand("result/qc/2_fastqc_trim/{sample}_R1_trim_fastqc.zip", sample=SAMPLES)
@@ -21,7 +21,7 @@ rule all:
*( ['result/5_combined/combined_picard.tsv'] if config["picard"] else [] ),
'log/benchmark/STAR_index.benchmark.txt',
'log/benchmark/STAR_load.benchmark.txt'
-
+
##----------------------------------------##
## 1. Adapter removal & quality filtering ##
##----------------------------------------##
@@ -38,11 +38,11 @@ rule adapterremoval:
threads: config["threads"] * 0.1
message: "Adapter removal & quality filtering"
- run:
+ run:
if params.trimAdapt == True:
- shell("cutadapt -g {params.adapter1} --cut {params.trim5p} --quality-cutoff 30 --max-n 1 --minimum-length 15 -o {output.R1} --cores={threads} {input.R1}")
+ shell("cutadapt -g {params.adapter1} --cut {params.trim5p} --quality-cutoff 30 --max-n 1 --minimum-length 15 -o {output.R1} --cores {threads} {input.R1}")
if params.trimAdapt == False:
- shell("cutadapt --quality-cutoff 30 --max-n 1 --minimum-length 15 -o {output.R1} --cores={threads} {input.R1}")
+ shell("cutadapt --quality-cutoff 30 --max-n 1 --minimum-length 15 -o {output.R1} --cores {threads} {input.R1}")
if params.trimAdapt != True and params.trimAdapt != False:
print("Error: Config trimAdapt parameter must be set to True or False.")
@@ -70,7 +70,7 @@ STAR_index_SA = 'ref/release' + config["release"] + '/STARindex/SA'
if (not os.path.exists(STAR_index_SA)):
rule STAR_index:
input: expand('result/qc/2_fastqc_trim/{sample}_R1_trim_fastqc.html', sample=SAMPLES)
- output:
+ output:
IDX_done = touch('log/benchmark/STAR_index.benchmark.txt')
params:
release = config["release"],
@@ -82,14 +82,14 @@ if (not os.path.exists(STAR_index_SA)):
else:
rule skip_index:
input: expand('result/qc/2_fastqc_trim/{sample}_R1_trim_fastqc.html', sample=SAMPLES)
- output:
+ output:
IDX_done = touch('log/benchmark/STAR_index.benchmark.txt')
message: "Genome index already present. Skipping indexing rule."
rule STAR_load:
input:
IDX_done = 'log/benchmark/STAR_index.benchmark.txt'
- output:
+ output:
LOAD_done = touch('log/benchmark/STAR_load.benchmark.txt')
params:
release = config["release"]
@@ -97,12 +97,12 @@ rule STAR_load:
message: 'Loading genome index'
run:
shell("STAR --genomeDir 'ref/release{params.release}/STARindex' --runThreadN {threads} --runRNGseed 8756 --genomeLoad LoadAndExit")
-
+
rule STAR_align:
input:
R1 = "result/1_trim/{sample}_R1_trim.fastq.gz",
LOAD_done = 'log/benchmark/STAR_load.benchmark.txt'
- output:
+ output:
BAM = 'result/2_bam/{sample}_Aligned.sortedByCoord.out.bam'
params:
OUT = 'result/2_bam/{sample}_',
@@ -111,12 +111,12 @@ rule STAR_align:
message: 'Aligning reads'
run:
shell("STAR --genomeDir 'ref/release{params.release}/STARindex' --readFilesIn {input.R1} --readFilesCommand zcat --outFileNamePrefix {params.OUT} --outSAMtype BAM SortedByCoordinate --runThreadN {threads} --runRNGseed 8756 --genomeLoad LoadAndKeep --limitBAMsortRAM 20000000000")
-
+
rule STAR_remove:
input:
LOAD_done = 'log/benchmark/STAR_load.benchmark.txt',
BAM = expand('result/2_bam/{sample}_Aligned.sortedByCoord.out.bam', sample=SAMPLES)
- output:
+ output:
REMOVE_done = touch('log/benchmark/STAR_remove.benchmark.txt')
params:
release = config["release"]
@@ -132,7 +132,7 @@ rule STAR_remove:
##----------------------------------------##
rule align_filter:
- input:
+ input:
BAM = 'result/2_bam/{sample}_Aligned.sortedByCoord.out.bam',
REMOVE_done = 'log/benchmark/STAR_remove.benchmark.txt'
output: 'result/3_bam_filter/{sample}_Aligned.sortedByCoord.filter.bam'
From fdc2983a3180b55f871b99d44b070f65797c877b Mon Sep 17 00:00:00 2001
From: Kimberly Dill-McFarland
Date: Thu, 11 Sep 2025 16:45:27 -0700
Subject: [PATCH 05/10] Add python dependencies
---
environment/Hissss_env.yaml | 2 ++
1 file changed, 2 insertions(+)
diff --git a/environment/Hissss_env.yaml b/environment/Hissss_env.yaml
index 0ea9690..3909331 100644
--- a/environment/Hissss_env.yaml
+++ b/environment/Hissss_env.yaml
@@ -17,3 +17,5 @@ dependencies:
- picard >=2.27.1
- star >=2.7.9a
- subread >=2.0.1
+ - python >=3.11
+ - pandas >=1.6
From c5782f6624fa55bb3748fc37ddf7353c74bcb404 Mon Sep 17 00:00:00 2001
From: Kimberly Dill-McFarland
Date: Thu, 11 Sep 2025 16:45:40 -0700
Subject: [PATCH 06/10] rm sudo usage
---
scripts/create_config.sh | 38 +++++++++++++++++++-------------------
1 file changed, 19 insertions(+), 19 deletions(-)
diff --git a/scripts/create_config.sh b/scripts/create_config.sh
index 5ff0431..efe713c 100755
--- a/scripts/create_config.sh
+++ b/scripts/create_config.sh
@@ -6,7 +6,7 @@
SampleList="data/*_R1*"
-sudo echo "SampleList:" > result/config.yaml
+echo "SampleList:" > result/config.yaml
spacer1=": "
@@ -25,11 +25,11 @@ sample2=`echo "${sample/R1/R2}"`
sample_name=`echo "$(basename $sample)" | grep -o '^.*_L[0-9][0-9][0-9]' | sed 's/_L[0-9][0-9][0-9]$//'`
# Add sample name to config
-sudo echo " " "$sample_name$spacer1" >> result/config.yaml
-sudo echo " sample: '"$sample_name"'" >> result/config.yaml
+echo " " "$sample_name$spacer1" >> result/config.yaml
+echo " sample: '"$sample_name"'" >> result/config.yaml
# Add fastq files to config
-sudo echo " R1: '"$sample"'" >> result/config.yaml
-sudo echo " R2: '"$sample2"'" >> result/config.yaml
+echo " R1: '"$sample"'" >> result/config.yaml
+echo " R2: '"$sample2"'" >> result/config.yaml
done
#################################
@@ -44,7 +44,7 @@ cores2=1
fi
# Add default param to config
-sudo echo "
+echo "
# Adapter removal
## Base pairs to trim from 5' end
@@ -58,8 +58,8 @@ adapter2: AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGT
## Species the format 'Homo_sapiens.GRCh38' or 'Mus_musculus.GRCm39'
genome: 'Homo_sapiens.GRCh38'
-## Genome release number. Current as of 2022.04.14
-release: '106'
+## Genome release number. Current as of 2025.06.05
+release: '114'
# Alignment metrics
## Run Picard?
@@ -71,17 +71,17 @@ threads: $cores2
# Setup directory structure
-sudo mkdir -p -m 777 'result/qc/1_fastqc_raw'
-sudo mkdir -p -m 777 'result/qc/2_fastqc_trim'
-sudo mkdir -p -m 777 'result/qc/3_flagstat'
-sudo mkdir -p -m 777 'result/qc/4_picard'
+mkdir -p -m 777 'result/qc/1_fastqc_raw'
+mkdir -p -m 777 'result/qc/2_fastqc_trim'
+mkdir -p -m 777 'result/qc/3_flagstat'
+mkdir -p -m 777 'result/qc/4_picard'
-sudo mkdir -p -m 777 'result/1_trim'
-sudo mkdir -p -m 777 'result/2_bam'
-sudo mkdir -p -m 777 'result/3_bam_filter'
-sudo mkdir -p -m 777 'result/4_count'
-sudo mkdir -p -m 777 'result/5_combined'
+mkdir -p -m 777 'result/1_trim'
+mkdir -p -m 777 'result/2_bam'
+mkdir -p -m 777 'result/3_bam_filter'
+mkdir -p -m 777 'result/4_count'
+mkdir -p -m 777 'result/5_combined'
-sudo mkdir -p -m 777 'ref'
-sudo mkdir -p -m 777 'log'
+mkdir -p -m 777 'ref'
+mkdir -p -m 777 'log'
From 5ce03661de7abe532be9829481e1e609431eb2c1 Mon Sep 17 00:00:00 2001
From: Kimberly Dill-McFarland
Date: Thu, 11 Sep 2025 16:46:08 -0700
Subject: [PATCH 07/10] Make robust to small # of threads input
---
Snakefile_step2 | 16 ++++++++--------
Snakefile_step2_single | 16 ++++++++--------
2 files changed, 16 insertions(+), 16 deletions(-)
diff --git a/Snakefile_step2 b/Snakefile_step2
index 448d4c8..dc3030e 100644
--- a/Snakefile_step2
+++ b/Snakefile_step2
@@ -42,7 +42,7 @@ rule adapterremoval:
trimAdapt = config["trimAdapt"],
adapter1 = config["adapter1"],
adapter2 = config["adapter2"]
- threads: config["threads"] * 0.1
+ threads: max(1, int(config["threads"]*0.1))
message: "Adapter removal & quality filtering"
run:
@@ -62,7 +62,7 @@ rule fastqc_trim:
output:
HTML = 'result/qc/2_fastqc_trim/{sample}_{suf}_fastqc.html',
ZIP = 'result/qc/2_fastqc_trim/{sample}_{suf}_fastqc.zip'
- threads: config["threads"] * 0.1
+ threads: max(1, int(config["threads"]*0.1))
message: "Quality assessment, FastQC"
shell:
"""
@@ -82,7 +82,7 @@ if (not os.path.exists(STAR_index_SA)):
params:
release = config["release"],
genome = config["genome"]
- threads: config["threads"] * 0.5
+ threads: max(1, int(config["threads"]*0.5))
message: "Generating genome index"
run:
shell("scripts/STAR_index.sh {params.release} {params.genome} {threads}")
@@ -100,7 +100,7 @@ rule STAR_load:
LOAD_done = touch('log/benchmark/STAR_load.benchmark.txt')
params:
release = config["release"]
- threads: config["threads"] * 0.5
+ threads: max(1, int(config["threads"]*0.5))
message: 'Loading genome index'
run:
shell("STAR --genomeDir 'ref/release{params.release}/STARindex' --runThreadN {threads} --runRNGseed 8756 --genomeLoad LoadAndExit")
@@ -115,7 +115,7 @@ rule STAR_align:
params:
OUT = 'result/2_bam/{sample}_',
release = config["release"]
- threads: config["threads"] * 0.25
+ threads: max(1, int(config["threads"]*0.25))
message: 'Aligning reads'
run:
shell("STAR --genomeDir 'ref/release{params.release}/STARindex' --readFilesIn {input.R1} {input.R2} --readFilesCommand zcat --outFileNamePrefix {params.OUT} --outSAMtype BAM SortedByCoordinate --runThreadN {threads} --runRNGseed 8756 --genomeLoad LoadAndKeep --limitBAMsortRAM 20000000000")
@@ -146,7 +146,7 @@ rule align_filter:
output:
BAM2 = 'result/3_bam_filter/{sample}_Aligned.sortedByCoord.filter.bam',
BAI = 'result/3_bam_filter/{sample}_Aligned.sortedByCoord.filter.bam.bai'
- threads: config["threads"] * 0.1
+ threads: max(1, int(config["threads"]*0.1))
message: "Filtering alignments"
shell:
"""
@@ -161,7 +161,7 @@ rule align_filter:
rule flagstat:
input: 'result/3_bam_filter/{sample}_Aligned.sortedByCoord.filter.bam'
output: 'result/qc/3_flagstat/{sample}_flagstat.tsv'
- threads: config["threads"] * 0.1
+ threads: max(1, int(config["threads"]*0.1))
message: "Quality control, flagstat"
run:
shell("samtools flagstat -@ {threads} {input} > {output}")
@@ -194,7 +194,7 @@ rule fcount:
params:
release = config["release"],
genome = config["genome"]
- threads: config["threads"] * 0.1
+ threads: max(1, int(config["threads"]*0.1))
message: "Counting features"
run:
shell("featureCounts -T {threads} -g gene_id -t exon -p -a 'ref/release{params.release}/STARref/{params.genome}.{params.release}.gtf' -o {output} {input.bam}")
diff --git a/Snakefile_step2_single b/Snakefile_step2_single
index 3682716..3149a2d 100644
--- a/Snakefile_step2_single
+++ b/Snakefile_step2_single
@@ -35,7 +35,7 @@ rule adapterremoval:
trim5p = config["trim5p"],
trimAdapt = config["trimAdapt"],
adapter1 = config["adapter1"]
- threads: config["threads"] * 0.1
+ threads: max(1, int(config["threads"]*0.1))
message: "Adapter removal & quality filtering"
run:
@@ -55,7 +55,7 @@ rule fastqc_trim:
output:
HTML = 'result/qc/2_fastqc_trim/{sample}_R1_trim_fastqc.html',
ZIP = 'result/qc/2_fastqc_trim/{sample}_R1_trim_fastqc.zip'
- threads: config["threads"] * 0.1
+ threads: max(1, int(config["threads"]*0.1))
message: "Quality assessment, FastQC"
shell:
"""
@@ -75,7 +75,7 @@ if (not os.path.exists(STAR_index_SA)):
params:
release = config["release"],
genome = config["genome"]
- threads: config["threads"] * 0.5
+ threads: max(1, int(config["threads"]*0.5))
message: "Generating genome index"
run:
shell("scripts/STAR_index.sh {params.release} {params.genome} {threads}")
@@ -93,7 +93,7 @@ rule STAR_load:
LOAD_done = touch('log/benchmark/STAR_load.benchmark.txt')
params:
release = config["release"]
- threads: config["threads"] * 0.5
+ threads: max(1, int(config["threads"]*0.5))
message: 'Loading genome index'
run:
shell("STAR --genomeDir 'ref/release{params.release}/STARindex' --runThreadN {threads} --runRNGseed 8756 --genomeLoad LoadAndExit")
@@ -107,7 +107,7 @@ rule STAR_align:
params:
OUT = 'result/2_bam/{sample}_',
release = config["release"]
- threads: config["threads"] * 0.25
+ threads: max(1, int(config["threads"]*0.25))
message: 'Aligning reads'
run:
shell("STAR --genomeDir 'ref/release{params.release}/STARindex' --readFilesIn {input.R1} --readFilesCommand zcat --outFileNamePrefix {params.OUT} --outSAMtype BAM SortedByCoordinate --runThreadN {threads} --runRNGseed 8756 --genomeLoad LoadAndKeep --limitBAMsortRAM 20000000000")
@@ -136,7 +136,7 @@ rule align_filter:
BAM = 'result/2_bam/{sample}_Aligned.sortedByCoord.out.bam',
REMOVE_done = 'log/benchmark/STAR_remove.benchmark.txt'
output: 'result/3_bam_filter/{sample}_Aligned.sortedByCoord.filter.bam'
- threads: config["threads"] * 0.1
+ threads: max(1, int(config["threads"]*0.1))
message: "Filtering alignments"
run:
shell("samtools view {input.BAM} -b -h -F 1284 -q 30 -@ {threads} > {output}")
@@ -149,7 +149,7 @@ rule align_filter:
rule flagstat:
input: 'result/3_bam_filter/{sample}_Aligned.sortedByCoord.filter.bam'
output: 'result/qc/3_flagstat/{sample}_flagstat.tsv'
- threads: config["threads"] * 0.1
+ threads: max(1, int(config["threads"]*0.1))
message: "Quality control, flagstat"
run:
shell("samtools flagstat -@ {threads} {input} > {output}")
@@ -182,7 +182,7 @@ rule fcount:
params:
release = config["release"],
genome = config["genome"]
- threads: config["threads"] * 0.1
+ threads: max(1, int(config["threads"]*0.1))
message: "Counting features"
run:
shell("featureCounts -T {threads} -g gene_id -t exon -p -a 'ref/release{params.release}/STARref/{params.genome}.{params.release}.gtf' -o {output} {input.bam}")
From 935968109ffa9eb4a682a2e7b117fae32f446fad Mon Sep 17 00:00:00 2001
From: Kimberly Dill-McFarland
Date: Thu, 11 Sep 2025 16:46:22 -0700
Subject: [PATCH 08/10] clean star command
---
Snakefile_step2 | 15 +++++++++++++--
Snakefile_step2_single | 15 +++++++++++++--
2 files changed, 26 insertions(+), 4 deletions(-)
diff --git a/Snakefile_step2 b/Snakefile_step2
index dc3030e..42d89d3 100644
--- a/Snakefile_step2
+++ b/Snakefile_step2
@@ -117,8 +117,19 @@ rule STAR_align:
release = config["release"]
threads: max(1, int(config["threads"]*0.25))
message: 'Aligning reads'
- run:
- shell("STAR --genomeDir 'ref/release{params.release}/STARindex' --readFilesIn {input.R1} {input.R2} --readFilesCommand zcat --outFileNamePrefix {params.OUT} --outSAMtype BAM SortedByCoordinate --runThreadN {threads} --runRNGseed 8756 --genomeLoad LoadAndKeep --limitBAMsortRAM 20000000000")
+ shell:
+ """
+ STAR \
+ --genomeDir ref/release{params.release}/STARindex \
+ --readFilesCommand zcat \
+ --readFilesIn {input.R1} {input.R2} \
+ --outFileNamePrefix {params.OUT} \
+ --outSAMtype BAM SortedByCoordinate \
+ --runThreadN {threads} \
+ --runRNGseed 8756 \
+ --genomeLoad LoadAndKeep \
+ --limitBAMsortRAM 20000000000
+ """
rule STAR_remove:
input:
diff --git a/Snakefile_step2_single b/Snakefile_step2_single
index 3149a2d..2542196 100644
--- a/Snakefile_step2_single
+++ b/Snakefile_step2_single
@@ -109,8 +109,19 @@ rule STAR_align:
release = config["release"]
threads: max(1, int(config["threads"]*0.25))
message: 'Aligning reads'
- run:
- shell("STAR --genomeDir 'ref/release{params.release}/STARindex' --readFilesIn {input.R1} --readFilesCommand zcat --outFileNamePrefix {params.OUT} --outSAMtype BAM SortedByCoordinate --runThreadN {threads} --runRNGseed 8756 --genomeLoad LoadAndKeep --limitBAMsortRAM 20000000000")
+ shell:
+ """
+ STAR \
+ --genomeDir ref/release{params.release}/STARindex \
+ --readFilesCommand zcat \
+ --readFilesIn {input.R1} \
+ --outFileNamePrefix {params.OUT} \
+ --outSAMtype BAM SortedByCoordinate \
+ --runThreadN {threads} \
+ --runRNGseed 8756 \
+ --genomeLoad LoadAndKeep \
+ --limitBAMsortRAM 20000000000
+ """
rule STAR_remove:
input:
From d859e721df6066844999bf180c101c80b930a3f9 Mon Sep 17 00:00:00 2001
From: Kimberly Dill-McFarland
Date: Fri, 12 Sep 2025 16:17:33 -0700
Subject: [PATCH 09/10] fix cutadapt install issue on Mac
---
environment/Hissss_env.yaml | 5 ++++-
1 file changed, 4 insertions(+), 1 deletion(-)
diff --git a/environment/Hissss_env.yaml b/environment/Hissss_env.yaml
index 3909331..98a6232 100644
--- a/environment/Hissss_env.yaml
+++ b/environment/Hissss_env.yaml
@@ -1,6 +1,7 @@
channels:
- bioconda
- conda-forge
+ - defaults
dependencies:
- snakemake-minimal >=5.24.1
- jinja2 >=2.11
@@ -12,10 +13,12 @@ dependencies:
- bwa >=0.7
- pysam >=0.15
- fastqc >=0.11.9
- - cutadapt >=5.1
- bedtools >=2.30.0
- picard >=2.27.1
- star >=2.7.9a
- subread >=2.0.1
- python >=3.11
- pandas >=1.6
+ - pip
+ - pip:
+ - cutadapt>=5.1
From f821d539dd67925a5349cab2fae668e56132ba75 Mon Sep 17 00:00:00 2001
From: Kimberly Dill-McFarland
Date: Fri, 12 Sep 2025 16:30:31 -0700
Subject: [PATCH 10/10] typo
---
Snakefile_step2 | 1 -
1 file changed, 1 deletion(-)
diff --git a/Snakefile_step2 b/Snakefile_step2
index 42d89d3..0aaa604 100644
--- a/Snakefile_step2
+++ b/Snakefile_step2
@@ -44,7 +44,6 @@ rule adapterremoval:
adapter2 = config["adapter2"]
threads: max(1, int(config["threads"]*0.1))
message: "Adapter removal & quality filtering"
-
run:
if params.trimAdapt == True:
shell("cutadapt -g {params.adapter1} -a {params.adapter2} --cut {params.trim5p} --quality-cutoff 30 --max-n 1 --minimum-length 15 -o {output.R1} -p {output.R2} --cores {threads} {input.R1} {input.R2}")