From a93a6c306033dbf226525a7f4d656a747aeed325 Mon Sep 17 00:00:00 2001 From: Brandon Fuller Date: Thu, 10 Oct 2024 15:06:29 -0400 Subject: [PATCH 001/196] Fixed render_r1_r2 function(s) in Snakefiles - Removed the unused r1-only=False parameter in the render_r1_r2() function in both the rnaseq and chipseq Snakefiles\n- Changed the name of 'r1_only' function to 'render_r1_only' in both Snakefiles to make the name more intuitive and updated the rest of the files accordingly --- workflows/chipseq/Snakefile | 8 ++++---- workflows/rnaseq/Snakefile | 11 +++++------ 2 files changed, 9 insertions(+), 10 deletions(-) diff --git a/workflows/chipseq/Snakefile b/workflows/chipseq/Snakefile index f278b896..90c84d28 100644 --- a/workflows/chipseq/Snakefile +++ b/workflows/chipseq/Snakefile @@ -70,10 +70,10 @@ if config.get('merged_bigwigs', None): final_targets.extend(utils.flatten(c.targets['merged_bigwig'])) -def render_r1_r2(pattern, r1_only=False): +def render_r1_r2(pattern): return expand(pattern, sample='{sample}', n=c.n) -def r1_only(pattern): +def render_r1_only(pattern): return expand(pattern, sample='{sample}', n=1) rule targets: @@ -133,7 +133,7 @@ if 'Run' in c.sampletable.columns and sum(c.sampletable['Run'].str.startswith('S output: fastq=render_r1_r2(c.patterns['fastq']) log: - r1_only(c.patterns['fastq'])[0] + '.log' + render_r1_only(c.patterns['fastq'])[0] + '.log' params: is_paired=c.is_paired, sampletable=_st, @@ -337,7 +337,7 @@ rule fastq_screen: """ input: **fastq_screen_references(), - fastq=r1_only(rules.cutadapt.output.fastq), + fastq=render_r1_only(rules.cutadapt.output.fastq), output: txt=c.patterns['fastq_screen'] log: diff --git a/workflows/rnaseq/Snakefile b/workflows/rnaseq/Snakefile index f9a11c4d..d9e7f692 100644 --- a/workflows/rnaseq/Snakefile +++ b/workflows/rnaseq/Snakefile @@ -64,10 +64,9 @@ if config.get('merged_bigwigs', None): final_targets.extend(utils.flatten(c.targets['merged_bigwig'])) -def render_r1_r2(pattern, r1_only=False): +def 
render_r1_r2(pattern): return expand(pattern, sample='{sample}', n=c.n) - -def r1_only(pattern): +def render_r1_only(pattern): return expand(pattern, sample='{sample}', n=1) rule targets: @@ -126,7 +125,7 @@ if 'Run' in c.sampletable.columns and sum(c.sampletable['Run'].str.startswith('S output: fastq=render_r1_r2(c.patterns['fastq']) log: - r1_only(c.patterns['fastq'])[0] + '.log' + render_r1_only(c.patterns['fastq'])[0] + '.log' params: is_paired=c.is_paired, sampletable=_st, @@ -472,7 +471,7 @@ rule rRNA: Map reads with bowtie2 to the rRNA reference """ input: - fastq=r1_only(c.patterns['cutadapt']), + fastq=render_r1_only(c.patterns['cutadapt']), index=[c.refdict[c.organism][config['rrna']['tag']]['bowtie2']] output: bam=temporary(c.patterns['rrna']['bam']) @@ -569,7 +568,7 @@ rule fastq_screen: """ input: **fastq_screen_references(), - fastq=r1_only(rules.cutadapt.output.fastq), + fastq=render_r1_only(rules.cutadapt.output.fastq), output: txt=c.patterns['fastq_screen'] log: From d0a0300ede9d14f8dda6bc114fc88ea6df5275cd Mon Sep 17 00:00:00 2001 From: Ryan Dale Date: Sun, 13 Oct 2024 10:23:19 -0400 Subject: [PATCH 002/196] add newline back in --- workflows/rnaseq/Snakefile | 1 + 1 file changed, 1 insertion(+) diff --git a/workflows/rnaseq/Snakefile b/workflows/rnaseq/Snakefile index d9e7f692..1cde537a 100644 --- a/workflows/rnaseq/Snakefile +++ b/workflows/rnaseq/Snakefile @@ -66,6 +66,7 @@ if config.get('merged_bigwigs', None): def render_r1_r2(pattern): return expand(pattern, sample='{sample}', n=c.n) + def render_r1_only(pattern): return expand(pattern, sample='{sample}', n=1) From 7d92555d67f8a4808670d866be0b239819ca79d6 Mon Sep 17 00:00:00 2001 From: Brandon Fuller Date: Wed, 16 Oct 2024 10:59:43 -0400 Subject: [PATCH 003/196] Make strand_arg a param Move `strand_arg` assignment from the `run` block to the `params` block so that `--rerun-trigger` will detect changes to strandedness configuration and re-run those rules --- workflows/rnaseq/Snakefile | 66 
++++++++++++++++++-------------------- 1 file changed, 32 insertions(+), 34 deletions(-) diff --git a/workflows/rnaseq/Snakefile b/workflows/rnaseq/Snakefile index 1cde537a..f4245e30 100644 --- a/workflows/rnaseq/Snakefile +++ b/workflows/rnaseq/Snakefile @@ -598,12 +598,7 @@ rule featurecounts: resources: mem_mb=gb(16), runtime=autobump(hours=2) - run: - # NOTE: By default, we use -p for paired-end - p_arg = '' - if c.is_paired: - p_arg = '-p --countReadPairs ' - + params: strand_arg = helpers.strand_arg_lookup( c, { 'unstranded': '-s0 ', @@ -611,10 +606,14 @@ rule featurecounts: 'fr-secondstrand': '-s1 ', } ) - + run: + # NOTE: By default, we use -p for paired-end + p_arg = '' + if c.is_paired: + p_arg = '-p --countReadPairs ' shell( 'featureCounts ' - '{strand_arg} ' + '{params.strand_arg} ' '{p_arg} ' '-T {threads} ' '-a {input.annotation} ' @@ -769,15 +768,8 @@ rule collectrnaseqmetrics: # NOTE: Be careful with the memory here; make sure you have enough # and/or it matches the resources you're requesting in the cluster # config. 
- java_args='-Xmx20g' - # java_args='-Xmx2g' # [TEST SETTINGS -1] - log: - c.patterns['collectrnaseqmetrics']['metrics'] + '.log' - threads: 1 - resources: - mem_mb=gb(32), - runtime=autobump(hours=2) - run: + java_args='-Xmx20g', + # java_args='-Xmx2g', # [TEST SETTINGS -1] strand_arg = helpers.strand_arg_lookup( c, { 'unstranded': 'STRAND=NONE ', @@ -785,11 +777,18 @@ rule collectrnaseqmetrics: 'fr-secondstrand': 'STRAND=FIRST_READ_TRANSCRIPTION_STRAND ', } ) + log: + c.patterns['collectrnaseqmetrics']['metrics'] + '.log' + threads: 1 + resources: + mem_mb=gb(32), + runtime=autobump(hours=2) + run: shell( 'picard ' '{params.java_args} ' 'CollectRnaSeqMetrics ' - '{strand_arg} ' + '{params.strand_arg} ' 'VALIDATION_STRINGENCY=LENIENT ' 'REF_FLAT={input.refflat} ' 'INPUT={input.bam} ' @@ -870,7 +869,14 @@ rule kallisto: c.patterns['kallisto'] params: index_dir=os.path.dirname(c.refdict[c.organism][config['kallisto']['tag']]['kallisto']), - outdir=os.path.dirname(c.patterns['kallisto']) + outdir=os.path.dirname(c.patterns['kallisto']), + strand_arg = helpers.strand_arg_lookup( + c, { + 'unstranded': '', + 'fr-firststrand': '--rf-stranded', + 'fr-secondstrand': '--fr-stranded', + } + ) log: c.patterns['kallisto'] + '.log' threads: @@ -887,15 +893,6 @@ rule kallisto: # and standard deviation here se_args = '--single --fragment-length 300 --sd 20 ' assert len(input.fastq) == 1 - - strand_arg = helpers.strand_arg_lookup( - c, { - 'unstranded': '', - 'fr-firststrand': '--rf-stranded', - 'fr-secondstrand': '--fr-stranded', - } - ) - shell( 'kallisto quant ' '--index {input.index} ' @@ -905,7 +902,7 @@ rule kallisto: '--bias ' '--threads {threads} ' '{se_args} ' - '{strand_arg} ' + '{params.strand_arg} ' '{input.fastq} ' '&> {log}' ) @@ -987,7 +984,7 @@ rule bigwig_neg: runtime=autobump(hours=2) log: c.patterns['bigwig']['neg'] + '.log' - run: + params: strand_arg = helpers.strand_arg_lookup( c, { 'unstranded': '', @@ -995,13 +992,14 @@ rule bigwig_neg: 'fr-secondstrand': 
'--filterRNAstrand forward ', } ) + run: shell( 'bamCoverage ' '--bam {input.bam} ' '-o {output} ' '-p {threads} ' '{BAMCOVERAGE_ARGS} ' - '{strand_arg} ' + '{params.strand_arg} ' '&> {log}' ) @@ -1020,8 +1018,7 @@ rule bigwig_pos: runtime=autobump(hours=2) log: c.patterns['bigwig']['pos'] + '.log' - - run: + params: strand_arg = helpers.strand_arg_lookup( c, { 'unstranded': '', @@ -1029,13 +1026,14 @@ rule bigwig_pos: 'fr-secondstrand': '--filterRNAstrand reverse ', } ) + run: shell( 'bamCoverage ' '--bam {input.bam} ' '-o {output} ' '-p {threads} ' '{BAMCOVERAGE_ARGS} ' - '{strand_arg} ' + '{params.strand_arg} ' '&> {log}' ) From b808d10c046b3f8722f0f6d29695ea73e8b60d24 Mon Sep 17 00:00:00 2001 From: Ryan Dale Date: Thu, 24 Oct 2024 13:49:19 -0400 Subject: [PATCH 004/196] add Plodia interpunctella reference config (#417) Add lib.postprocess.utils.extract_from_zip function, used for extracting -- and then immediately gzipping -- a file from within a downloaded zip. Include reference config for Plodia interpunctella --- .../Plodia_interpunctella.yaml | 41 ++++++++++++++++++ lib/postprocess/utils.py | 43 +++++++++++++++++-- 2 files changed, 81 insertions(+), 3 deletions(-) create mode 100644 include/reference_configs/Plodia_interpunctella.yaml diff --git a/include/reference_configs/Plodia_interpunctella.yaml b/include/reference_configs/Plodia_interpunctella.yaml new file mode 100644 index 00000000..214e907f --- /dev/null +++ b/include/reference_configs/Plodia_interpunctella.yaml @@ -0,0 +1,41 @@ +references: + plodia: + ilPloInte3.2: + genome: + url: 'https://api.ncbi.nlm.nih.gov/datasets/v2/genome/accession/GCF_027563975.2/download?include_annotation_type=GENOME_FASTA' + postprocess: + function: 'lib.postprocess.utils.extract_from_zip' + kwargs: + path_in_zip: 'ncbi_dataset/data/GCF_027563975.2/GCF_027563975.2_ilPloInte3.2_genomic.fna' + indexes: + - 'hisat2' + - 'bowtie2' + - 'star' + + annotation: + url: 
"https://api.ncbi.nlm.nih.gov/datasets/v2/genome/accession/GCF_027563975.2/download?include_annotation_type=GENOME_GTF" + postprocess: + function: 'lib.postprocess.utils.extract_from_zip' + kwargs: + path_in_zip: "ncbi_dataset/data/GCF_027563975.2/genomic.gtf" + conversions: + - 'refflat' + - 'bed12' + + transcriptome: + indexes: + - 'salmon' + - 'kallisto' + + rRNA: + genome: + url: + - 'https://www.arb-silva.de/fileadmin/silva_databases/release_128/Exports/SILVA_128_LSURef_tax_silva_trunc.fasta.gz' + - 'https://www.arb-silva.de/fileadmin/silva_databases/release_128/Exports/SILVA_128_SSURef_Nr99_tax_silva_trunc.fasta.gz' + indexes: + - 'hisat2' + - 'bowtie2' + - 'star' + postprocess: + function: 'lib.common.filter_fastas' + args: 'Plodia interpunctella' diff --git a/lib/postprocess/utils.py b/lib/postprocess/utils.py index abb87288..16010e14 100644 --- a/lib/postprocess/utils.py +++ b/lib/postprocess/utils.py @@ -1,12 +1,49 @@ import sys import os -import pandas as pd -import gzip import re +import gzip +import zipfile +import shutil +import tempfile +import pandas as pd + here = os.path.dirname(os.path.abspath(__file__)) -sys.path.insert(0, os.path.join(here, '../../lib')) +sys.path.insert(0, os.path.join(here, "../../lib")) from common import openfile + + +def extract_from_zip(tmpfiles, outfile, path_in_zip): + """ + Parameters + ---------- + + tmpfiles : list + One-item list containing zip file + + outfile : str + gzipped output file to create + + path_in_zip : str + Path within zipfile to extract. You can identify the path using unzip + -l x.zip from bash. 
+ """ + assert len(tmpfiles) == 1, f"expected single zip file, got {tmpfiles}" + + extraction_dir = tempfile.mkdtemp() + + with zipfile.ZipFile(tmpfiles[0], "r") as z: + z.extract(path_in_zip, path=extraction_dir) + + full_path_to_extracted = os.path.join(extraction_dir, path_in_zip) + + with open(full_path_to_extracted, "rb") as fin: + with gzip.open(outfile, "wb") as fout: + shutil.copyfileobj(fin, fout) + + shutil.rmtree(extraction_dir) + + def match_gtf_9th(tmpfiles, outfile, strmatch, optstrand = "None"): """ Matches string to the 9th field of GTF and an optional strand that defaults to None; From 47b379ad126ee63e5259f74c4dc7c06454e4dc8c Mon Sep 17 00:00:00 2001 From: Nicholas Johnson Date: Wed, 4 Dec 2024 18:09:26 -0500 Subject: [PATCH 005/196] Update plotting.R (#423) Just a small mistake Co-authored-by: Ryan Dale --- lib/lcdbwf/R/plotting.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/lcdbwf/R/plotting.R b/lib/lcdbwf/R/plotting.R index 9e7bc8e5..f4aa9c41 100644 --- a/lib/lcdbwf/R/plotting.R +++ b/lib/lcdbwf/R/plotting.R @@ -268,7 +268,7 @@ vargenes_heatmap <- function(rld, cols_for_grouping, n=50){ mat <- mat - rowMeans(mat) df <- as.data.frame(colData(rld)[, cols_for_grouping]) rownames(df) <- colnames(rld) - colnames(df) <- cols.for.grouping + colnames(df) <- cols_for_grouping pheatmap(mat, annotation_col=df, cluster_cols=TRUE) } From aeca4c2dfb5ee76221e1f7280986e94cba6488b3 Mon Sep 17 00:00:00 2001 From: Brandon Fuller Date: Fri, 13 Dec 2024 22:05:16 -0500 Subject: [PATCH 006/196] Change SRA fastq directory (#418) * Change SRA fastq directory Change the directory where SRA fastq files are downloaded and add the 'orig_filename' column to the config object for each sample so that the rest of the workflow works correctly * Make code more elegant Change a nested for-loop implementation in patters_targets.py to a more elegant one-line solution and clean up some code in Snakefile * improve helper.fill_patterns add check when combining 
by `zip` to ensure values are all same length add more doctests --------- Co-authored-by: Ryan Dale --- lib/helpers.py | 34 ++++++++++---- lib/patterns_targets.py | 14 +++++- workflows/rnaseq/Snakefile | 49 ++++++++++---------- workflows/rnaseq/config/rnaseq_patterns.yaml | 1 + 4 files changed, 62 insertions(+), 36 deletions(-) diff --git a/lib/helpers.py b/lib/helpers.py index 053bca2b..4723286c 100644 --- a/lib/helpers.py +++ b/lib/helpers.py @@ -34,22 +34,31 @@ def detect_layout(sampletable): def fill_patterns(patterns, fill, combination=product): """ - Fills in a dictionary of patterns with the dictionary or DataFrame `fill`. + Fills in a dictionary of patterns with the dictionary `fill`. >>> patterns = dict(a='{sample}_R{N}.fastq') - >>> fill = dict(sample=['one', 'two'], N=[1, 2]) + >>> fill = dict(sample=['one', 'two', 'three'], N=[1, 2]) >>> sorted(fill_patterns(patterns, fill)['a']) - ['one_R1.fastq', 'one_R2.fastq', 'two_R1.fastq', 'two_R2.fastq'] + ['one_R1.fastq', 'one_R2.fastq', 'three_R1.fastq', 'three_R2.fastq', 'two_R1.fastq', 'two_R2.fastq'] + + If using `zip` as a combination, checks to ensure all values in `fill` are + the same length to avoid truncated output. + + This fails: >>> patterns = dict(a='{sample}_R{N}.fastq') - >>> fill = dict(sample=['one', 'two'], N=[1, 2]) - >>> sorted(fill_patterns(patterns, fill, zip)['a']) - ['one_R1.fastq', 'two_R2.fastq'] + >>> fill = dict(sample=['one', 'two', 'three'], N=[1, 2]) + >>> sorted(fill_patterns(patterns, fill, zip)['a']) # doctest: +IGNORE_EXCEPTION_DETAIL + Traceback (most recent call last): + ... 
+ ValueError: {'sample': ['one', 'two', 'three'], 'N': [1, 2]} does not have the same number of entries for each key + + But this works: >>> patterns = dict(a='{sample}_R{N}.fastq') - >>> fill = pd.DataFrame({'sample': ['one', 'two'], 'N': [1, 2]}) - >>> sorted(fill_patterns(patterns, fill)['a']) - ['one_R1.fastq', 'two_R2.fastq'] + >>> fill = dict(sample=['one', 'one', 'two', 'two', 'three', 'three'], N=[1, 2, 1, 2, 1, 2]) + >>> sorted(fill_patterns(patterns, fill, zip)['a']) + ['one_R1.fastq', 'one_R2.fastq', 'three_R1.fastq', 'three_R2.fastq', 'two_R1.fastq', 'two_R2.fastq'] """ # In recent Snakemake versions (e.g., this happens in 5.4.5) file patterns @@ -64,12 +73,17 @@ def fill_patterns(patterns, fill, combination=product): # # expand('x', zip, d=[1,2,3]) == ['x', 'x', 'x'] + if combination == zip: + lengths = set([len(v) for v in fill.values()]) + if len(lengths) != 1: + raise ValueError(f"{fill} does not have the same number of entries for each key") + def update(d, u, c): for k, v in u.items(): if isinstance(v, collections.abc.Mapping): r = update(d.get(k, {}), v, c) d[k] = r - else: + else: # not a dictionary, so we're at a leaf if isinstance(fill, pd.DataFrame): d[k] = list(set(expand(u[k], zip, **fill.to_dict("list")))) else: diff --git a/lib/patterns_targets.py b/lib/patterns_targets.py index 542d4116..ec62d513 100644 --- a/lib/patterns_targets.py +++ b/lib/patterns_targets.py @@ -9,6 +9,7 @@ from . import common from . import chipseq from . 
import helpers +from snakemake.io import expand HERE = os.path.abspath(os.path.dirname(__file__)) @@ -80,6 +81,10 @@ def __init__(self, config, patterns, workdir=None): self.n = [1, 2] else: self.n = [1] + if 'Run' in self.sampletable.columns and sum(self.sampletable['Run'].str.startswith('SRR')) > 0: + self.is_sra = True + else: + self.is_sra = False helpers.preflight(self.config) @@ -107,7 +112,14 @@ def __init__(self, config, patterns, workdir=None): self.fill = dict(sample=self.samples, n=self.n) self.patterns_by_aggregation = self.patterns.pop('patterns_by_aggregate', None) - self.targets = helpers.fill_patterns(self.patterns, self.fill, zip) + self.targets = helpers.fill_patterns(self.patterns, self.fill) + + # If the sampletable is from an sra metadata table, then we need to set the value of + # 'orig_filename' for each of the samples to where the fastq was downloaded + if self.is_sra: + self.sampletable['orig_filename'] = expand(self.patterns["sra_fastq"], sample=self.samples, n=1) + if self.is_paired: + self.sampletable['orig_filename_R2'] = expand(self.patterns["sra_fastq"], sample=self.samples, n=2) # Then the aggregation if self.patterns_by_aggregation is not None and 'merged_bigwigs' in self.config: diff --git a/workflows/rnaseq/Snakefile b/workflows/rnaseq/Snakefile index f4245e30..e979cfdc 100644 --- a/workflows/rnaseq/Snakefile +++ b/workflows/rnaseq/Snakefile @@ -76,6 +76,30 @@ rule targets: """ input: final_targets +if c.is_sra: + + # Convert the sampletable to be indexed by the first column, for + # convenience in generating the input/output filenames. 
+ _st = c.sampletable.set_index(c.sampletable.columns[0]) + + rule fastq_dump: + output: + fastq=render_r1_r2(c.patterns['sra_fastq']) + log: + render_r1_only(c.patterns['sra_fastq'])[0] + '.log' + params: + is_paired=c.is_paired, + sampletable=_st, + # limit = 100000, # [TEST SETTINGS] + resources: + mem_mb=gb(1), + disk_mb=autobump(gb=1), + runtime=autobump(hours=2) + conda: + '../../wrappers/wrappers/fastq-dump/environment.yaml' + script: + wrapper_for('fastq-dump/wrapper.py') + if 'orig_filename' in c.sampletable.columns: localrules: symlinks, symlink_targets @@ -115,31 +139,6 @@ if 'orig_filename' in c.sampletable.columns: rule symlink_targets: input: c.targets['fastq'] - -if 'Run' in c.sampletable.columns and sum(c.sampletable['Run'].str.startswith('SRR')) > 0: - - # Convert the sampletable to be indexed by the first column, for - # convenience in generating the input/output filenames. - _st = c.sampletable.set_index(c.sampletable.columns[0]) - - rule fastq_dump: - output: - fastq=render_r1_r2(c.patterns['fastq']) - log: - render_r1_only(c.patterns['fastq'])[0] + '.log' - params: - is_paired=c.is_paired, - sampletable=_st, - # limit = 100000, # [TEST SETTINGS] - resources: - mem_mb=gb(1), - disk_mb=autobump(gb=1), - runtime=autobump(hours=2) - conda: - '../../wrappers/wrappers/fastq-dump/environment.yaml' - script: - wrapper_for('fastq-dump/wrapper.py') - # This can be set at the command line with --config strand_check_reads=1000 config.setdefault('strand_check_reads', 1e5) diff --git a/workflows/rnaseq/config/rnaseq_patterns.yaml b/workflows/rnaseq/config/rnaseq_patterns.yaml index 5379d0dc..92b2a534 100644 --- a/workflows/rnaseq/config/rnaseq_patterns.yaml +++ b/workflows/rnaseq/config/rnaseq_patterns.yaml @@ -3,6 +3,7 @@ strand_check: bam: 'strand_check/{sample}/{sample}.strandedness.bam' tsv: 'strand_check/{sample}/{sample}.strandedness' fastq: 'data/rnaseq_samples/{sample}/{sample}_R{n}.fastq.gz' +sra_fastq: 
'original_data/sra_samples/{sample}/{sample}_R{n}.fastq.gz' cutadapt: 'data/rnaseq_samples/{sample}/{sample}_R{n}.cutadapt.fastq.gz' bam: 'data/rnaseq_samples/{sample}/{sample}.cutadapt.bam' fastqc: From a2e5448d017063308052fb865d239755bf36a410 Mon Sep 17 00:00:00 2001 From: Ryan Dale Date: Tue, 31 Dec 2024 21:19:11 -0500 Subject: [PATCH 007/196] mambaforge -> miniforge --- .circleci/config.yml | 58 ++++++++++++++++++-------------------------- 1 file changed, 23 insertions(+), 35 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index da38e059..4219875a 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -28,7 +28,7 @@ variables: save_cache: key: v5-{{ checksum "env.yml" }}-{{ checksum "env-r.yml" }} paths: - - /opt/mambaforge + - /opt/miniforge # this file is created by sra-tools upon installation by conda, and so # needs to be included in the cache otherwise fastq-dump thinks it's @@ -73,7 +73,7 @@ variables: # Note that if we don't escape \$PATH, we'll be stuck with the exact # PATH defined here, which will break anything needing conda envs. - echo "export PATH=\$PATH:/opt/mambaforge/bin" >> $BASH_ENV + echo "export PATH=\$PATH:/opt/miniforge/bin" >> $BASH_ENV source $BASH_ENV @@ -85,28 +85,16 @@ variables: command: | source $BASH_ENV echo $PATH - # /opt/mambaforge will only exist if there was a cache restore; otherwise we'll make it here. + # /opt/miniforge will only exist if there was a cache restore; otherwise we'll make it here. # - # Use mambaforge which comes with mamba. - if [ ! -e /opt/mambaforge ]; then - curl -L https://github.com/conda-forge/miniforge/releases/latest/download/Mambaforge-Linux-x86_64.sh > mambaforge.sh - bash mambaforge.sh -b -p /opt/mambaforge - source "/opt/mambaforge/etc/profile.d/conda.sh" - source "/opt/mambaforge/etc/profile.d/mamba.sh" + if [ ! 
-e /opt/miniforge ]; then + curl -L -O "https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-$(uname)-$(uname -m).sh" + bash Miniforge3-$(uname)-$(uname -m).sh -b -p /opt/miniforge + source "/opt/miniforge/etc/profile.d/conda.sh" conda activate which conda - which mamba - mamba --version - - # Note that mambaforge doesn't come with the defaults channel, but - # we're adding it here at the beginning to simulate what most users - # probably have locally (and following the bioconda docs). Using - # strict channel priority means we should [theoretically] never - # pull packages from defaults because they all exist on - # conda-forge. - conda config --system --add channels defaults - + conda --version conda config --system --add channels bioconda conda config --system --add channels conda-forge conda config --system --set channel_priority strict @@ -115,10 +103,10 @@ variables: # https://docs.conda.io/projects/conda-build/en/latest/resources/link-scripts.html, # post-link scripts should not depend on any installed or # to-be-installed conda packages...but they do. 
- mamba install -n base r-base yq + conda install -n base r-base yq - time mamba env create -n $LCDBWF_ENV --file env.yml - time mamba env create -n $LCDBWF_ENV_R --file env-r.yml + time conda env create -n $LCDBWF_ENV --file env.yml + time conda env create -n $LCDBWF_ENV_R --file env-r.yml fi # -------------------------------------------------------------------------- @@ -127,7 +115,7 @@ variables: run: name: Download example data command: | - source /opt/mambaforge/etc/profile.d/conda.sh + source /opt/miniforge/etc/profile.d/conda.sh conda activate $LCDBWF_ENV conda info --envs conda config --show @@ -172,7 +160,7 @@ variables: run: name: Run pytest suite and testthat suite command: | - source /opt/mambaforge/etc/profile.d/conda.sh + source /opt/miniforge/etc/profile.d/conda.sh conda activate $LCDBWF_ENV # run unit tests and doctests for the modules in lib test/lcdb-wf-test unit_tests --pytest @@ -194,7 +182,7 @@ variables: name: chipseq workflow command: | cd $DEPLOY/workflows/chipseq - source /opt/mambaforge/etc/profile.d/conda.sh + source /opt/miniforge/etc/profile.d/conda.sh conda activate $LCDBWF_ENV $DEPLOY/test/lcdb-wf-test chipseq --run-workflow --use-conda -j2 -k -p -r $DEPLOY/test/lcdb-wf-test chipseq --trackhub @@ -208,7 +196,7 @@ variables: name: chipseq misc command: | cd $DEPLOY/workflows/chipseq - source /opt/mambaforge/etc/profile.d/conda.sh + source /opt/miniforge/etc/profile.d/conda.sh conda activate $LCDBWF_ENV ./run_test.sh --use-conda -j2 -k -p -r \ @@ -237,7 +225,7 @@ variables: run: name: references workflow command: | - source /opt/mambaforge/etc/profile.d/conda.sh + source /opt/miniforge/etc/profile.d/conda.sh conda activate $LCDBWF_ENV $DEPLOY/test/lcdb-wf-test references --run-workflow --configfile=config/config.yaml -j2 -p -r -k --orig $ORIG @@ -248,7 +236,7 @@ variables: name: rnaseq workflow command: | cd $DEPLOY - source /opt/mambaforge/etc/profile.d/conda.sh + source /opt/miniforge/etc/profile.d/conda.sh conda activate $LCDBWF_ENV 
$DEPLOY/test/lcdb-wf-test rnaseq --run-workflow -n $DEPLOY/test/lcdb-wf-test rnaseq --run-workflow --use-conda -j2 -k -p -r --orig $ORIG @@ -276,7 +264,7 @@ variables: command: | ORIG=$(pwd) cd $DEPLOY - source /opt/mambaforge/etc/profile.d/conda.sh + source /opt/miniforge/etc/profile.d/conda.sh conda activate $LCDBWF_ENV # Check the help for test/lcdb-wf-test to see what args these @@ -299,7 +287,7 @@ variables: name: colocalization workflow command: | cd $DEPLOY/workflows/colocalization - source /opt/mambaforge/etc/profile.d/conda.sh + source /opt/miniforge/etc/profile.d/conda.sh conda activate $LCDBWF_ENV $DEPLOY/test/lcdb-wf-test colocalization --run-workflow -k -r -p -j2 --use-conda --orig $ORIG @@ -438,9 +426,9 @@ jobs: - run: name: Install sphinx command: | - source /opt/mambaforge/etc/profile.d/conda.sh + source /opt/miniforge/etc/profile.d/conda.sh conda activate lcdb-wf-test - mamba install -y sphinx make yaml + conda install -y sphinx make yaml - run: name: OK for unknown github host command: mkdir -p ~/.ssh/ && echo -e "Host github.com\n\tStrictHostKeyChecking no\n" > ~/.ssh/config @@ -450,7 +438,7 @@ jobs: - run: name: Build and upload docs command: | - source /opt/mambaforge/etc/profile.d/conda.sh + source /opt/miniforge/etc/profile.d/conda.sh conda activate lcdb-wf-test ci/build-docs.sh - store_artifacts: @@ -466,7 +454,7 @@ jobs: - run: name: Report environment command: | - source /opt/mambaforge/etc/profile.d/conda.sh + source /opt/miniforge/etc/profile.d/conda.sh conda env export -n lcdb-wf-test > /tmp/env.yaml conda env export -n lcdb-wf-test-r > /tmp/env-r.yaml - store_artifacts: From 4487d9067924f6aec484898be9717d343ad6f28b Mon Sep 17 00:00:00 2001 From: Ryan Dale Date: Tue, 31 Dec 2024 22:00:54 -0500 Subject: [PATCH 008/196] latest ubuntu for testing --- .circleci/config.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 4219875a..582a9842 100644 --- a/.circleci/config.yml 
+++ b/.circleci/config.yml @@ -5,7 +5,7 @@ variables: # default settings for all steps defaults: &defaults docker: - - image: ubuntu:20.04 + - image: ubuntu:latest # -------------------------------------------------------------------------- # The caching dramatically speeds up testing time, because we can do the From 836fff09239408a7496b177cfc8eef25f11777f6 Mon Sep 17 00:00:00 2001 From: Ryan Dale Date: Tue, 31 Dec 2024 22:02:03 -0500 Subject: [PATCH 009/196] https for downloading chainfile --- workflows/external/Snakefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/external/Snakefile b/workflows/external/Snakefile index 9f8308c9..c6dd34b0 100644 --- a/workflows/external/Snakefile +++ b/workflows/external/Snakefile @@ -24,7 +24,7 @@ rule download_chainfile: output: 'data/dm3ToDm6.over.chain.gz' shell: 'wget -O- ' - 'http://hgdownload.cse.ucsc.edu/goldenPath/dm3/liftOver/dm3ToDm6.over.chain.gz ' + 'https://hgdownload.cse.ucsc.edu/goldenPath/dm3/liftOver/dm3ToDm6.over.chain.gz ' '> {output}' From 09dedd783973a07fd850552baf02c8878c2a440f Mon Sep 17 00:00:00 2001 From: Ryan Dale Date: Tue, 31 Dec 2024 22:16:53 -0500 Subject: [PATCH 010/196] noninteractive apt install --- .circleci/config.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 582a9842..df941a5a 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -48,7 +48,7 @@ variables: name: Set path command: | # x11-utils required to avoid R::png() segfaulting - apt update && apt install -y \ + DEBIAN_FRONTEND=noninteractive apt update && apt install -y \ curl \ git \ locales \ From b6c663a2daea5e336e655590c1d192ab791ac39a Mon Sep 17 00:00:00 2001 From: Ryan Dale Date: Tue, 31 Dec 2024 23:02:28 -0500 Subject: [PATCH 011/196] noninteractive apt install --- .circleci/config.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index df941a5a..d351ba7e 
100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -48,7 +48,8 @@ variables: name: Set path command: | # x11-utils required to avoid R::png() segfaulting - DEBIAN_FRONTEND=noninteractive apt update && apt install -y \ + export DEBIAN_FRONTEND=noninteractive + apt update && apt install -y \ curl \ git \ locales \ From 2fc5d71455a2da0da7212449ff1981c72ea4ba8a Mon Sep 17 00:00:00 2001 From: Ryan Dale Date: Wed, 1 Jan 2025 10:51:25 -0500 Subject: [PATCH 012/196] debug url --- workflows/external/Snakefile | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/workflows/external/Snakefile b/workflows/external/Snakefile index c6dd34b0..48ddf73b 100644 --- a/workflows/external/Snakefile +++ b/workflows/external/Snakefile @@ -23,10 +23,7 @@ rule download_chainfile: """ output: 'data/dm3ToDm6.over.chain.gz' shell: - 'wget -O- ' - 'https://hgdownload.cse.ucsc.edu/goldenPath/dm3/liftOver/dm3ToDm6.over.chain.gz ' - '> {output}' - + 'curl -L -v https://hgdownload.cse.ucsc.edu/goldenPath/dm3/liftOver/dm3ToDm6.over.chain.gz -o {output}' rule beds: """ From a4703987ecd08e384d98033f77f72c6dcc5ad5cb Mon Sep 17 00:00:00 2001 From: Ryan Dale Date: Wed, 1 Jan 2025 11:25:12 -0500 Subject: [PATCH 013/196] for test "external" data, do not do liftover ucsc might be blocking circle-ci given the licenseing requirements of chainfiles --- workflows/external/Snakefile | 29 ++++------------------------- 1 file changed, 4 insertions(+), 25 deletions(-) diff --git a/workflows/external/Snakefile b/workflows/external/Snakefile index 48ddf73b..79c3d1e2 100644 --- a/workflows/external/Snakefile +++ b/workflows/external/Snakefile @@ -16,35 +16,14 @@ rule targets: input: list(modencode.keys()), - -rule download_chainfile: - """ - Download the chainfile we need for liftover - """ - output: 'data/dm3ToDm6.over.chain.gz' - shell: - 'curl -L -v https://hgdownload.cse.ucsc.edu/goldenPath/dm3/liftOver/dm3ToDm6.over.chain.gz -o {output}' - rule beds: """ - Download URLs, get rid of 
"track" lines, and then prepare them for liftover + Download URLs, get rid of "track" lines. """ - output: temporary('data/{factor}_{celltype}.bed.dm3') + output: 'data/{factor}_{celltype}.bed' run: - key = str(output[0]).replace('.dm3', '') + key = str(output[0]) url = modencode[key] - shell( - 'wget -O - "{url}" | grep -v "track" > {output}') - -rule liftover: - """ - Perform the liftover - """ - input: - bed='{prefix}.dm3', - chainfile=rules.download_chainfile.output - output: '{prefix}' - shell: - 'liftOver {input.bed} {input.chainfile} {output} {output}.unmapped' + shell('wget -O - "{url}" | grep -v "track" > {output}') # vim: ft=python From ed9161d20ae05fa8e47c2d0d7e8daee3db9ef819 Mon Sep 17 00:00:00 2001 From: Ryan Dale Date: Wed, 1 Jan 2025 11:26:07 -0500 Subject: [PATCH 014/196] remove support for GAT this tool was last updated in 2017, and has incompatibilites with recent numpy. --- workflows/colocalization/Snakefile | 36 +----------------------------- 1 file changed, 1 insertion(+), 35 deletions(-) diff --git a/workflows/colocalization/Snakefile b/workflows/colocalization/Snakefile index ac0b0413..cb5a7991 100644 --- a/workflows/colocalization/Snakefile +++ b/workflows/colocalization/Snakefile @@ -64,29 +64,22 @@ if ADD_CHIPSEQ_PEAKS: config['beds'][key] = fn -# Number of shufflings for GAT -# N = 100 [TEST_SETTINGS +1] -N = 10000 - targets = expand( '{outdir}/{algorithm}/{domain}/{query}/{query}_vs_{reference}.txt', outdir=config['output'], domain=config['domains'].keys(), query=config['beds'].keys(), reference=config['beds'].keys(), - algorithm=['IntervalStats', 'GAT', 'jaccard', 'fisher'], + algorithm=['IntervalStats', 'jaccard', 'fisher'], ) # Currently-supported options {algorithm: (possible values)} # IntervalStats: (f_05, f_01, f_001) -# GAT: (l2fold, fractions) # jaccard: (jaccard) # fisher: (pval) pattern = '{outdir}/{algorithm}/{domain}/{value}_heatmap.pdf' targets += expand(pattern, outdir=config['output'], domain=config['domains'], 
algorithm='IntervalStats', value=['f_01']) -targets += expand(pattern, outdir=config['output'], domain=config['domains'], - algorithm='GAT', value=['l2fold']) targets += expand(pattern, outdir=config['output'], domain=config['domains'], algorithm='jaccard', value=['jaccard']) targets += expand(pattern, outdir=config['output'], domain=config['domains'], @@ -216,33 +209,6 @@ rule intervalstats: df.to_csv(str(output[0]), sep='\t', index=False) -rule gat: - input: - domain=lambda wc: config['domains'][getattr(wc, 'domain')], - query=lambda wc: config['beds'][getattr(wc, 'query')], - reference=lambda wc: config['beds'][getattr(wc, 'reference')], - output: '{outdir}/GAT/{domain}/{query}/{query}_vs_{reference}.txt' - run: - shell('cut -f1,2,3 {input.query} > {output}.query.tmp') - shell('cut -f1,2,3 {input.reference} > {output}.reference.tmp') - if os.stat(output[0] + '.query.tmp').st_size == 0: - shell('touch {output}') - else: - shell( - 'gat-run.py ' - '--ignore-segment-tracks ' - '--annotations {output}.reference.tmp ' - '--segments {output}.query.tmp ' - '--workspace {input.domain} ' - '--counter nucleotide-overlap ' - '--num-samples {N} ' - '--output-counts-pattern {output}.%s.counts ' - '--log {output}.log ' - '--stdout {output} ' - ) - shell('rm {output}.query.tmp {output}.reference.tmp') - - rule heatmap: input: expand( From becdf2168daf748a43f04a649ced0d9a7cb8e415 Mon Sep 17 00:00:00 2001 From: Ryan Dale Date: Thu, 2 Jan 2025 21:23:37 +0000 Subject: [PATCH 015/196] GAT no longer used, remove from requirements --- include/requirements.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/include/requirements.txt b/include/requirements.txt index 6001f6d5..dc7d4d23 100644 --- a/include/requirements.txt +++ b/include/requirements.txt @@ -7,7 +7,6 @@ deeptools fastq-screen fastqc font-ttf-dejavu-sans-mono -gat gffread gffutils hisat2 From dfaec3e9defaec1dce6cba38981080e072dfc20e Mon Sep 17 00:00:00 2001 From: Ryan Dale Date: Thu, 2 Jan 2025 21:23:50 +0000 Subject: 
[PATCH 016/196] don't pin python --- include/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/requirements.txt b/include/requirements.txt index dc7d4d23..4e2b155e 100644 --- a/include/requirements.txt +++ b/include/requirements.txt @@ -26,7 +26,7 @@ pyfaidx pysam pytest pytest-xdist -python>=3.10 +python rseqc # earlier versions of salmon can segfault on Slurm From b65f4cd61ebd411db108ef2deae67f9c8e92c3fa Mon Sep 17 00:00:00 2001 From: Ryan Dale Date: Thu, 2 Jan 2025 21:23:58 +0000 Subject: [PATCH 017/196] pin snakemake >8 --- include/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/requirements.txt b/include/requirements.txt index 4e2b155e..fd8df8be 100644 --- a/include/requirements.txt +++ b/include/requirements.txt @@ -34,7 +34,7 @@ salmon>=1.10.1 samtools seaborn -snakemake-minimal +snakemake>8 sra-tools star subread From 0f81f076a05fff5eb0169a574016cf415f5ca775 Mon Sep 17 00:00:00 2001 From: Ryan Dale Date: Thu, 2 Jan 2025 21:24:50 +0000 Subject: [PATCH 018/196] update env.yml --- env.yml | 612 +++++++++++++++++++++++++++----------------------------- 1 file changed, 296 insertions(+), 316 deletions(-) diff --git a/env.yml b/env.yml index 5b656720..02f0f695 100644 --- a/env.yml +++ b/env.yml @@ -5,380 +5,360 @@ dependencies: - _libgcc_mutex=0.1 - _openmp_mutex=4.5 - _r-mutex=1.0.1 - - alsa-lib=1.2.3.2 - - amply=0.1.5 + - alabaster=1.0.0 + - alsa-lib=1.2.13 + - amply=0.1.6 + - annotated-types=0.7.0 - appdirs=1.4.4 - - argcomplete=3.0.8 - - argh=0.27.2 - - asttokens=2.2.1 - - attr=2.5.1 - - attrs=23.1.0 - - backcall=0.2.0 - - backports=1.0 - - backports.functools_lru_cache=1.6.4 - - bedtools=2.31.0 - - binutils_impl_linux-64=2.39 - - binutils_linux-64=2.39 - - biopython=1.81 - - boost-cpp=1.74.0 + - argcomplete=3.5.2 + - argh=0.31.3 + - argparse-dataclass=2.0.0 + - asttokens=3.0.0 + - attrs=24.3.0 + - babel=2.16.0 + - beautifulsoup4=4.12.3 + - bedtools=2.31.1 + - 
binutils_impl_linux-64=2.43 + - biopython=1.84 + - boost-cpp=1.85.0 - bowtie=1.3.1 - - bowtie2=2.5.1 - - brotli=1.0.9 - - brotli-bin=1.0.9 - - brotlipy=0.7.0 - - bwidget=1.9.14 - - bx-python=0.9.0 + - bowtie2=2.5.4 + - brotli=1.1.0 + - brotli-bin=1.1.0 + - brotli-python=1.1.0 + - bwidget=1.10.1 + - bx-python=0.13.0 - bzip2=1.0.8 - - c-ares=1.18.1 - - ca-certificates=2023.5.7 - - cairo=1.16.0 - - certifi=2023.5.7 - - cffi=1.15.1 - - charset-normalizer=3.1.0 - - click=8.1.3 - - coin-or-cbc=2.10.10 - - coin-or-cgl=0.60.7 - - coin-or-clp=1.17.8 - - coin-or-osi=0.108.8 - - coin-or-utils=2.11.9 - - coincbc=2.10.10 + - c-ares=1.34.4 + - ca-certificates=2024.12.14 + - cairo=1.18.2 + - certifi=2024.12.14 + - cffi=1.17.1 + - charset-normalizer=3.4.0 + - click=8.1.8 + - coin-or-cbc=2.10.12 + - coin-or-cgl=0.60.9 + - coin-or-clp=1.17.10 + - coin-or-osi=0.108.11 + - coin-or-utils=2.11.12 + - coincbc=2.10.12 - colorama=0.4.6 - coloredlogs=15.0.1 - colormath=3.0.0 - - configargparse=1.5.3 + - conda-inject=1.3.2 + - configargparse=1.7 - connection_pool=0.0.3 - - contourpy=1.0.7 - - cryptography=39.0.0 - - curl=7.86.0 - - cutadapt=4.4 - - cycler=0.11.0 + - contourpy=1.3.1 + - curl=8.11.1 + - cutadapt=5.0 + - cycler=0.12.1 - datrie=0.8.2 - - dbus=1.13.6 - decorator=5.1.1 - - deeptools=3.5.2 + - deeptools=3.5.5 - deeptoolsintervals=0.1.9 - - dnaio=0.10.0 - - docutils=0.20.1 - - dpath=2.1.5 - - epic2=0.0.52 - - exceptiongroup=1.1.1 - - execnet=1.9.0 - - executing=1.2.0 - - expat=2.5.0 - - fastq-screen=0.15.3 + - dnaio=1.2.2 + - docutils=0.21.2 + - dpath=2.2.0 + - eido=0.2.4 + - et_xmlfile=2.0.0 + - exceptiongroup=1.2.2 + - execnet=2.1.1 + - executing=2.1.0 + - expat=2.6.4 + - fastq-screen=0.16.0 - fastqc=0.12.1 - - fftw=3.3.10 - - filelock=3.12.0 - font-ttf-dejavu-sans-mono=2.37 - font-ttf-inconsolata=3.000 - font-ttf-source-code-pro=2.038 - font-ttf-ubuntu=0.83 - - fontconfig=2.14.2 + - fontconfig=2.15.0 - fonts-conda-ecosystem=1 - fonts-conda-forge=1 - - fonttools=4.39.4 + - 
fonttools=4.55.3 - freetype=2.12.1 - fribidi=1.0.10 - - future=0.18.3 - - gat=1.3.6 - - gcc_impl_linux-64=10.4.0 - - gcc_linux-64=10.4.0 - - gettext=0.21.1 + - gcc_impl_linux-64=14.2.0 - gffread=0.12.7 - - gffutils=0.11.1 - - gfortran_impl_linux-64=10.4.0 - - gfortran_linux-64=10.4.0 - - giflib=5.2.1 - - gitdb=4.0.10 - - gitpython=3.1.31 - - glib=2.74.1 - - glib-tools=2.74.1 - - gmp=6.2.1 + - gffutils=0.13 + - gfortran_impl_linux-64=14.2.0 + - giflib=5.2.2 + - gitdb=4.0.11 + - gitpython=3.1.43 - graphite2=1.3.13 - - gsl=2.7 - - gst-plugins-base=1.18.5 - - gstreamer=1.20.3 - - gxx_impl_linux-64=10.4.0 - - gxx_linux-64=10.4.0 - - harfbuzz=4.2.0 - - hdf5=1.12.1 + - gsl=1.16 + - gxx_impl_linux-64=14.2.0 + - h2=4.1.0 + - harfbuzz=10.1.0 + - hdf5=1.14.3 - hisat2=2.2.1 - - htslib=1.16 + - hpack=4.0.0 + - html5lib=1.1 + - htslib=1.21 - humanfriendly=10.0 - - icu=69.1 - - idna=3.4 - - importlib-metadata=6.6.0 - - importlib_resources=5.12.0 + - humanize=4.11.0 + - hyperframe=6.0.1 + - icu=75.1 + - idna=3.10 + - imagesize=1.4.1 + - immutables=0.21 + - importlib-metadata=8.5.0 + - importlib_resources=6.4.5 - iniconfig=2.0.0 - intervalstats=1.01 - - ipython=8.13.2 - - isa-l=2.30.0 - - jack=1.9.18 - - jedi=0.18.2 - - jinja2=3.1.2 - - jpeg=9e - - jsonschema=4.17.3 - - jupyter_core=5.3.0 - - kallisto=0.48.0 - - kernel-headers_linux-64=2.6.32 + - ipython=8.31.0 + - isa-l=2.31.0 + - jedi=0.19.2 + - jinja2=3.1.5 + - jsonschema=4.23.0 + - jsonschema-specifications=2024.10.1 + - jupyter_core=5.7.2 + - kaleido-core=0.2.1 + - kallisto=0.51.1 + - kernel-headers_linux-64=3.10.0 - keyutils=1.6.1 - - kiwisolver=1.4.4 - - krb5=1.19.3 - - lcms2=2.14 - - ld_impl_linux-64=2.39 + - kiwisolver=1.4.7 + - krb5=1.21.3 + - lcms2=2.16 + - ld_impl_linux-64=2.43 - lerc=4.0.0 + - libaec=1.1.3 - libblas=3.9.0 - - libbrotlicommon=1.0.9 - - libbrotlidec=1.0.9 - - libbrotlienc=1.0.9 - - libcap=2.64 + - libboost=1.85.0 + - libboost-devel=1.85.0 + - libboost-headers=1.85.0 + - libbrotlicommon=1.1.0 + - 
libbrotlidec=1.1.0 + - libbrotlienc=1.1.0 - libcblas=3.9.0 - - libclang=13.0.1 - libcups=2.3.3 - - libcurl=7.86.0 - - libdb=6.2.32 - - libdeflate=1.13 + - libcurl=8.11.1 + - libdeflate=1.23 - libedit=3.1.20191231 - libev=4.33 - - libevent=2.1.10 - - libexpat=2.5.0 + - libexpat=2.6.4 - libffi=3.4.2 - - libflac=1.3.4 - - libgcc-devel_linux-64=10.4.0 - - libgcc-ng=12.2.0 + - libgcc=14.2.0 + - libgcc-devel_linux-64=14.2.0 + - libgcc-ng=14.2.0 - libgd=2.3.3 - - libgfortran-ng=12.2.0 - - libgfortran5=12.2.0 - - libglib=2.74.1 - - libgomp=12.2.0 - - libhwloc=2.8.0 + - libgfortran=14.2.0 + - libgfortran-ng=14.2.0 + - libgfortran5=14.2.0 + - libglib=2.82.2 + - libgomp=14.2.0 + - libhwloc=2.11.2 - libiconv=1.17 - libjemalloc=5.3.0 + - libjpeg-turbo=3.0.0 - liblapack=3.9.0 - liblapacke=3.9.0 - - libllvm13=13.0.1 - - libnghttp2=1.51.0 - - libnsl=2.0.0 - - libogg=1.3.4 - - libopenblas=0.3.21 - - libopus=1.3.1 - - libpng=1.6.39 - - libpq=14.5 - - libsanitizer=10.4.0 - - libsndfile=1.0.31 - - libsqlite=3.41.2 - - libssh2=1.10.0 - - libstdcxx-devel_linux-64=10.4.0 - - libstdcxx-ng=12.2.0 - - libtiff=4.4.0 - - libtool=2.4.7 - - libudev1=253 + - liblzma=5.6.3 + - liblzma-devel=5.6.3 + - libnghttp2=1.64.0 + - libnsl=2.0.1 + - libopenblas=0.3.28 + - libopenssl-static=3.4.0 + - libpng=1.6.44 + - libsanitizer=14.2.0 + - libsqlite=3.47.2 + - libssh2=1.11.1 + - libstdcxx=14.2.0 + - libstdcxx-devel_linux-64=14.2.0 + - libstdcxx-ng=14.2.0 + - libtiff=4.7.0 - libuuid=2.38.1 - - libvorbis=1.3.7 - - libwebp=1.2.4 - - libwebp-base=1.2.4 - - libxcb=1.13 - - libxkbcommon=1.0.3 - - libxml2=2.9.14 - - libzlib=1.2.13 - - lzo=2.10 - - lzstring=1.0.4 - - make=4.3 - - markdown=3.4.3 - - markdown-it-py=2.2.0 - - markupsafe=2.1.2 - - matplotlib=3.7.1 - - matplotlib-base=3.7.1 - - matplotlib-inline=0.1.6 - - mdurl=0.1.0 - - multiqc=1.14 + - libwebp-base=1.5.0 + - libxcb=1.17.0 + - libxcrypt=4.4.36 + - libxml2=2.13.5 + - libzlib=1.3.1 + - logmuse=0.2.8 + - logomaker=0.8 + - make=4.4.1 + - markdown=3.6 + - 
markdown-it-py=3.0.0 + - markupsafe=3.0.2 + - mathjax=2.7.7 + - matplotlib-base=3.10.0 + - matplotlib-inline=0.1.7 + - mdurl=0.1.2 + - multiqc=1.26 - munkres=1.1.4 - - mysql-common=8.0.32 - mysql-connector-c=6.1.11 - - mysql-libs=8.0.32 - natsort=8.4.0 - - nbformat=5.8.0 - - ncbi-vdb=3.0.2 - - ncurses=6.3 - - networkx=3.1 - - nspr=4.35 - - nss=3.89 - - numpy=1.23.5 - - openjdk=11.0.1 - - openjpeg=2.5.0 - - openssl=1.1.1t - - ossuuid=1.6.2 - - packaging=23.1 - - pandas=2.0.1 - - pandoc=3.1.2 - - pango=1.50.7 - - parso=0.8.3 - - patsy=0.5.3 + - nbformat=5.10.4 + - ncurses=6.5 + - networkx=3.4.2 + - nspr=4.36 + - nss=3.107 + - numpy=2.2.1 + - numpydoc=1.8.0 + - openjdk=23.0.1 + - openjpeg=2.5.3 + - openpyxl=3.1.5 + - openssl=3.4.0 + - packaging=24.2 + - pandas=2.2.3 + - pandoc=3.6.1 + - pango=1.54.0 + - parso=0.8.4 + - patsy=1.0.1 - pbzip2=1.1.13 - - pcre2=10.37 + - pcre2=10.44 + - pephubclient=0.4.4 + - peppy=0.40.7 - perl=5.32.1 - - perl-alien-build=2.48 - - perl-alien-libxml2=0.17 - - perl-business-isbn=3.007 - - perl-business-isbn-data=20210112.006 - - perl-capture-tiny=0.48 - - perl-carp=1.50 - - perl-constant=1.33 - - perl-data-dumper=2.183 - - perl-encode=3.19 - - perl-exporter=5.74 - - perl-extutils-makemaker=7.70 - - perl-ffi-checklib=0.28 - - perl-file-chdir=0.1011 - - perl-file-path=2.18 - - perl-file-temp=0.2304 - - perl-file-which=1.24 - - perl-gd=2.76 + - perl-gd=2.56 - perl-gdgraph=1.54 - perl-gdtextutil=0.86 - - perl-importer=0.026 - - perl-mime-base64=3.16 - - perl-parent=0.241 - - perl-path-tiny=0.124 - - perl-pathtools=3.75 - - perl-scope-guard=0.21 - - perl-storable=3.15 - - perl-sub-info=0.002 - - perl-term-table=0.016 - - perl-test-fatal=0.016 - - perl-test-warnings=0.031 - - perl-test2-suite=0.000145 - - perl-try-tiny=0.31 - - perl-uri=5.17 - - perl-xml-libxml=2.0207 - - perl-xml-namespacesupport=1.12 - - perl-xml-sax=1.02 - - perl-xml-sax-base=1.09 - - pexpect=4.8.0 + - pexpect=4.9.0 - picard=2.27.5 - pickleshare=0.7.5 - - pigz=2.6 - - 
pillow=9.2.0 - - pip=23.1.2 - - pixman=0.40.0 + - pigz=2.8 + - pillow=11.0.0 + - pip=24.3.1 + - pixman=0.44.2 - pkgutil-resolve-name=1.3.10 - - plac=1.3.5 - - platformdirs=3.5.1 - - plotly=5.14.1 - - pluggy=1.0.0 - - pooch=1.7.0 - - preseq=3.2.0 - - prompt-toolkit=3.0.38 - - prompt_toolkit=3.0.38 - - psutil=5.9.5 + - plac=1.4.3 + - platformdirs=4.3.6 + - plotly=5.24.1 + - pluggy=1.5.0 + - preseq=2.0.2 + - prompt-toolkit=3.0.48 + - psutil=6.1.1 - pthread-stubs=0.4 - ptyprocess=0.7.0 - - pulp=2.7.0 - - pulseaudio=14.0 - - pure_eval=0.2.2 + - pulp=2.8.0 + - pure_eval=0.2.3 - py2bit=0.3.0 - - pybedtools=0.9.0 - - pybigwig=0.3.18 - - pycparser=2.21 - - pyfaidx=0.7.2.1 - - pygments=2.15.1 - - pyopenssl=23.1.1 - - pyparsing=3.0.9 - - pyqt=5.15.4 - - pyqt5-sip=12.9.0 - - pyrsistent=0.19.3 - - pysam=0.20.0 + - pyaml-env=1.2.1 + - pybedtools=0.11.0 + - pybigwig=0.3.23 + - pycparser=2.22 + - pydantic=2.10.4 + - pydantic-core=2.27.2 + - pyfaidx=0.8.1.3 + - pygments=2.18.0 + - pyparsing=3.2.1 + - pysam=0.22.1 - pysocks=1.7.1 - - pytest=7.3.1 - - pytest-xdist=3.2.1 - - python=3.10.8 - - python-dateutil=2.8.2 - - python-fastjsonschema=2.16.3 - - python-isal=1.1.0 - - python-lzo=1.14 - - python-tzdata=2023.3 - - python_abi=3.10 - - pytz=2023.3 + - pytest=8.3.4 + - pytest-xdist=3.6.1 + - python=3.12.8 + - python-dateutil=2.9.0.post0 + - python-fastjsonschema=2.21.1 + - python-isal=1.7.1 + - python-kaleido=0.2.1 + - python-tzdata=2024.2 + - python-zlib-ng=0.5.1 + - python_abi=3.12 + - pytz=2024.1 - pyvcf3=1.0.3 - - pyyaml=6.0 - - qt-main=5.15.2 - - r-base=4.1.3 + - pyyaml=6.0.2 + - qhull=2020.2 + - r-base=4.2.3 - readline=8.2 - - requests=2.29.0 + - referencing=0.35.1 + - requests=2.32.3 - reretry=0.11.8 - - rich=13.3.5 - - rich-click=1.6.1 - - rseqc=5.0.1 - - salmon=1.10.1 - - samtools=1.16.1 - - scipy=1.10.1 - - seaborn=0.12.2 - - seaborn-base=0.12.2 + - rich=13.9.4 + - rich-click=1.8.5 + - rpds-py=0.22.3 + - rseqc=5.0.4 + - salmon=1.10.3 + - samtools=1.21 + - scipy=1.14.1 + - 
seaborn=0.13.2 + - seaborn-base=0.13.2 - sed=4.8 - - setuptools=67.7.2 - - simplejson=3.19.1 - - sip=6.5.1 - - six=1.16.0 - - smart_open=6.3.0 - - smmap=3.0.5 - - snakemake-minimal=7.25.3 + - setuptools=75.6.0 + - shellingham=1.5.4 + - simplejson=3.19.3 + - six=1.17.0 + - slack-sdk=3.34.0 + - slack_sdk=3.34.0 + - smart_open=7.1.0 + - smmap=5.0.0 + - snakemake=8.26.0 + - snakemake-interface-common=1.17.4 + - snakemake-interface-executor-plugins=9.3.3 + - snakemake-interface-report-plugins=1.1.0 + - snakemake-interface-storage-plugins=3.3.0 + - snakemake-minimal=8.26.0 + - snowballstemmer=2.2.0 + - soupsieve=2.5 - spectra=0.0.11 - - sqlite=3.41.2 - - sra-tools=3.0.3 - - stack_data=0.6.2 - - star=2.7.10b - - statsmodels=0.14.0 - - stopit=1.1.2 - - subread=2.0.3 - - sysroot_linux-64=2.12 + - sphinx=8.1.3 + - sphinxcontrib-applehelp=2.0.0 + - sphinxcontrib-devhelp=2.0.0 + - sphinxcontrib-htmlhelp=2.1.0 + - sphinxcontrib-jsmath=1.0.1 + - sphinxcontrib-qthelp=2.0.0 + - sphinxcontrib-serializinghtml=1.1.10 + - sqlite=3.47.2 + - sra-tools=2.9.6 + - stack_data=0.6.3 + - star=2.7.11b + - statsmodels=0.14.4 + - subread=2.0.8 + - sysroot_linux-64=2.17 - tabulate=0.9.0 - - tbb=2021.7.0 - - tenacity=8.2.2 - - throttler=1.2.1 - - tk=8.6.12 + - tbb=2022.0.0 + - tenacity=9.0.0 + - throttler=1.2.2 + - tk=8.6.13 - tktable=2.10 - - toml=0.10.2 - - tomli=2.0.1 - - toposort=1.10 - - tornado=6.3.2 - - trackhub=0.2.4 - - traitlets=5.9.0 - - typing-extensions=4.5.0 - - typing_extensions=4.5.0 - - tzdata=2023c - - ucsc-bedgraphtobigwig=377 - - ucsc-bedsort=377 - - ucsc-bedtobigbed=377 - - ucsc-bigwigmerge=377 - - ucsc-fetchchromsizes=377 - - ucsc-genepredtobed=377 - - ucsc-gtftogenepred=377 - - ucsc-liftover=377 - - ucsc-oligomatch=377 - - ucsc-twobittofa=377 - - ucsc-wigtobigwig=377 - - unicodedata2=15.0.0 - - urllib3=1.26.15 - - wcwidth=0.2.6 - - wheel=0.40.0 - - wrapt=1.15.0 - - xopen=1.7.0 - - xorg-kbproto=1.0.7 - - xorg-libice=1.0.10 - - xorg-libsm=1.2.3 - - xorg-libx11=1.8.4 - - 
xorg-libxau=1.0.9 - - xorg-libxdmcp=1.1.3 - - xorg-libxext=1.3.4 - - xorg-libxrender=0.9.10 - - xorg-libxt=1.2.1 - - xorg-renderproto=0.11.1 - - xorg-xextproto=7.3.0 - - xorg-xproto=7.0.31 - - xz=5.2.6 + - tomli=2.2.1 + - tqdm=4.67.1 + - trackhub=1.0 + - traitlets=5.14.3 + - typeguard=4.4.1 + - typer=0.15.1 + - typer-slim=0.15.1 + - typer-slim-standard=0.15.1 + - typing-extensions=4.12.2 + - typing_extensions=4.12.2 + - tzdata=2024b + - ubiquerg=0.8.0 + - ucsc-bedgraphtobigwig=472 + - ucsc-bedsort=469 + - ucsc-bedtobigbed=473 + - ucsc-bigwigmerge=469 + - ucsc-fetchchromsizes=469 + - ucsc-genepredtobed=469 + - ucsc-gtftogenepred=469 + - ucsc-liftover=469 + - ucsc-oligomatch=469 + - ucsc-stringify=472 + - ucsc-twobittofa=472 + - ucsc-wigtobigwig=472 + - unicodedata2=15.1.0 + - urllib3=2.3.0 + - veracitools=0.1.3 + - wcwidth=0.2.13 + - webencodings=0.5.1 + - wheel=0.45.1 + - wrapt=1.17.0 + - xopen=2.0.2 + - xorg-libice=1.1.2 + - xorg-libsm=1.2.5 + - xorg-libx11=1.8.10 + - xorg-libxau=1.0.12 + - xorg-libxdmcp=1.1.5 + - xorg-libxext=1.3.6 + - xorg-libxfixes=6.0.1 + - xorg-libxi=1.8.2 + - xorg-libxrandr=1.5.4 + - xorg-libxrender=0.9.12 + - xorg-libxt=1.3.1 + - xorg-libxtst=1.2.5 + - xz=5.6.3 + - xz-gpl-tools=5.6.3 + - xz-tools=5.6.3 - yaml=0.2.5 - - yte=1.5.1 - - zipp=3.15.0 - - zlib=1.2.13 - - zstandard=0.19.0 - - zstd=1.5.2 + - yte=1.5.5 + - zipp=3.21.0 + - zlib=1.3.1 + - zlib-ng=2.2.3 + - zstandard=0.23.0 + - zstd=1.5.6 From f039b64128c26035540d55cc2dc898999ee34152 Mon Sep 17 00:00:00 2001 From: Ryan Dale Date: Thu, 2 Jan 2025 21:25:20 +0000 Subject: [PATCH 019/196] update snakefiles and lib to reflect changes in snakemake 8 --- lib/helpers.py | 4 ++-- workflows/chipseq/Snakefile | 4 +++- workflows/references/Snakefile | 4 +++- workflows/rnaseq/Snakefile | 8 +++++--- 4 files changed, 13 insertions(+), 7 deletions(-) diff --git a/lib/helpers.py b/lib/helpers.py index 4723286c..9e0d6323 100644 --- a/lib/helpers.py +++ b/lib/helpers.py @@ -3,7 +3,7 @@ from itertools 
import product import pandas as pd from snakemake.shell import shell -from snakemake.io import expand, regex +from snakemake.io import expand, regex_from_filepattern from lib import common @@ -118,7 +118,7 @@ def extract_wildcards(pattern, target): >>> assert extract_wildcards(pattern, target) == expected >>> assert extract_wildcards(pattern, 'asdf') is None """ - m = re.compile(regex(pattern)).match(target) + m = re.compile(regex_from_filepattern(pattern)).match(target) if m: return m.groupdict() diff --git a/workflows/chipseq/Snakefile b/workflows/chipseq/Snakefile index 90c84d28..2b5fc485 100644 --- a/workflows/chipseq/Snakefile +++ b/workflows/chipseq/Snakefile @@ -1,5 +1,4 @@ import sys -sys.path.insert(0, srcdir('../..')) import os from textwrap import dedent import yaml @@ -7,6 +6,9 @@ import tempfile import pandas as pd import numpy as np import pybedtools + +HERE = str(Path(workflow.snakefile).parent) +sys.path.insert(0, HERE + "/../..") from lib import common, utils, helpers, aligners, chipseq from lib.patterns_targets import ChIPSeqConfig from lib.utils import autobump, gb, hours diff --git a/workflows/references/Snakefile b/workflows/references/Snakefile index d6bc9d0f..815d00c6 100644 --- a/workflows/references/Snakefile +++ b/workflows/references/Snakefile @@ -1,12 +1,14 @@ import os import sys -sys.path.insert(0, srcdir('../..')) import gzip import yaml import importlib import tempfile import pandas from snakemake.utils import makedirs + +HERE = str(Path(workflow.snakefile).parent) +sys.path.insert(0, HERE + "/../..") from lib.imports import resolve_name from lib import utils from lib.utils import autobump, gb, hours diff --git a/workflows/rnaseq/Snakefile b/workflows/rnaseq/Snakefile index e979cfdc..0ac150b8 100644 --- a/workflows/rnaseq/Snakefile +++ b/workflows/rnaseq/Snakefile @@ -1,11 +1,13 @@ -import sys - -sys.path.insert(0, srcdir('../..')) import os +import sys +from pathlib import Path from textwrap import dedent import yaml import tempfile 
import pandas as pd + +HERE = str(Path(workflow.snakefile).parent) +sys.path.insert(0, HERE + "/../..") from lib import common, utils, helpers, aligners from lib.utils import autobump, gb, hours from lib.patterns_targets import RNASeqConfig From bec163d1b3855cb396362614bd3b00dd3c7434d4 Mon Sep 17 00:00:00 2001 From: Ryan Dale Date: Thu, 2 Jan 2025 21:25:40 +0000 Subject: [PATCH 020/196] rm --bias for kallisto, which was causing segfaults --- workflows/rnaseq/Snakefile | 1 - 1 file changed, 1 deletion(-) diff --git a/workflows/rnaseq/Snakefile b/workflows/rnaseq/Snakefile index 0ac150b8..47c2a324 100644 --- a/workflows/rnaseq/Snakefile +++ b/workflows/rnaseq/Snakefile @@ -900,7 +900,6 @@ rule kallisto: '--output-dir {params.outdir} ' '--threads {threads} ' '--bootstrap-samples 100 ' - '--bias ' '--threads {threads} ' '{se_args} ' '{params.strand_arg} ' From cc310fb83e4d681ac550ea5f5d8543ca8da50911 Mon Sep 17 00:00:00 2001 From: Ryan Dale Date: Thu, 2 Jan 2025 21:49:19 +0000 Subject: [PATCH 021/196] update test args -r --> --reason for snakemake 8 --- .circleci/config.yml | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index d351ba7e..af992587 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -185,7 +185,7 @@ variables: cd $DEPLOY/workflows/chipseq source /opt/miniforge/etc/profile.d/conda.sh conda activate $LCDBWF_ENV - $DEPLOY/test/lcdb-wf-test chipseq --run-workflow --use-conda -j2 -k -p -r + $DEPLOY/test/lcdb-wf-test chipseq --run-workflow --use-conda -j2 -k -p --reason $DEPLOY/test/lcdb-wf-test chipseq --trackhub # -------------------------------------------------------------------------- @@ -200,7 +200,7 @@ variables: source /opt/miniforge/etc/profile.d/conda.sh conda activate $LCDBWF_ENV - ./run_test.sh --use-conda -j2 -k -p -r \ + ./run_test.sh --use-conda -j2 -k -p --reason \ --configfile $ORIG/test/test_configs/test_chipseq_regression.yaml \ --config 
sampletable=$ORIG/test/test_configs/chipseq_one_run.tsv \ merged_bigwigs="{}" \ @@ -228,7 +228,7 @@ variables: command: | source /opt/miniforge/etc/profile.d/conda.sh conda activate $LCDBWF_ENV - $DEPLOY/test/lcdb-wf-test references --run-workflow --configfile=config/config.yaml -j2 -p -r -k --orig $ORIG + $DEPLOY/test/lcdb-wf-test references --run-workflow --configfile=config/config.yaml -j2 -p --reason -k --orig $ORIG # -------------------------------------------------------------------------- # Standard RNA-seq workflow @@ -240,7 +240,7 @@ variables: source /opt/miniforge/etc/profile.d/conda.sh conda activate $LCDBWF_ENV $DEPLOY/test/lcdb-wf-test rnaseq --run-workflow -n - $DEPLOY/test/lcdb-wf-test rnaseq --run-workflow --use-conda -j2 -k -p -r --orig $ORIG + $DEPLOY/test/lcdb-wf-test rnaseq --run-workflow --use-conda -j2 -k -p --reason --orig $ORIG $DEPLOY/test/lcdb-wf-test rnaseq --trackhub --orig $ORIG @@ -272,12 +272,12 @@ variables: # provide; some of them use the --until argument to restrict the # rules that are run. Note the use of --orig $ORIG to use the test # configs from the original clone rather than the deployed directory. 
- $DEPLOY/test/lcdb-wf-test rnaseq --run-workflow --sra-pe -k -r -p -j2 --use-conda --orig $ORIG - $DEPLOY/test/lcdb-wf-test rnaseq --run-workflow --sra-se -k -r -p -j2 --use-conda --orig $ORIG - $DEPLOY/test/lcdb-wf-test rnaseq --run-workflow --strandedness-pe -k -r -p -j2 --use-conda --orig $ORIG - $DEPLOY/test/lcdb-wf-test rnaseq --run-workflow --star-2pass -k -r -p -j2 --use-conda --orig $ORIG - $DEPLOY/test/lcdb-wf-test rnaseq --run-workflow --star-1pass -k -r -p -j2 --use-conda --orig $ORIG - $DEPLOY/test/lcdb-wf-test rnaseq --run-workflow --pe -k -r -p -j2 --use-conda --orig $ORIG + $DEPLOY/test/lcdb-wf-test rnaseq --run-workflow --sra-pe -k --reason -p -j2 --use-conda --orig $ORIG + $DEPLOY/test/lcdb-wf-test rnaseq --run-workflow --sra-se -k --reason -p -j2 --use-conda --orig $ORIG + $DEPLOY/test/lcdb-wf-test rnaseq --run-workflow --strandedness-pe -k --reason -p -j2 --use-conda --orig $ORIG + $DEPLOY/test/lcdb-wf-test rnaseq --run-workflow --star-2pass -k --reason -p -j2 --use-conda --orig $ORIG + $DEPLOY/test/lcdb-wf-test rnaseq --run-workflow --star-1pass -k --reason -p -j2 --use-conda --orig $ORIG + $DEPLOY/test/lcdb-wf-test rnaseq --run-workflow --pe -k --reason -p -j2 --use-conda --orig $ORIG @@ -290,7 +290,7 @@ variables: cd $DEPLOY/workflows/colocalization source /opt/miniforge/etc/profile.d/conda.sh conda activate $LCDBWF_ENV - $DEPLOY/test/lcdb-wf-test colocalization --run-workflow -k -r -p -j2 --use-conda --orig $ORIG + $DEPLOY/test/lcdb-wf-test colocalization --run-workflow -k --reason -p -j2 --use-conda --orig $ORIG # -------------------------------------------------------------------------- # Syntax note: All of the steps above, with their "&step-name" labels, can be From 54514e9642448c5b907582acb20f604c4acf3974 Mon Sep 17 00:00:00 2001 From: Ryan Dale Date: Thu, 2 Jan 2025 22:31:39 +0000 Subject: [PATCH 022/196] rm --reason for snakemake 8 --- .circleci/config.yml | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) 
diff --git a/.circleci/config.yml b/.circleci/config.yml index af992587..50b1051a 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -185,7 +185,7 @@ variables: cd $DEPLOY/workflows/chipseq source /opt/miniforge/etc/profile.d/conda.sh conda activate $LCDBWF_ENV - $DEPLOY/test/lcdb-wf-test chipseq --run-workflow --use-conda -j2 -k -p --reason + $DEPLOY/test/lcdb-wf-test chipseq --run-workflow --use-conda -j2 -k -p $DEPLOY/test/lcdb-wf-test chipseq --trackhub # -------------------------------------------------------------------------- @@ -200,7 +200,7 @@ variables: source /opt/miniforge/etc/profile.d/conda.sh conda activate $LCDBWF_ENV - ./run_test.sh --use-conda -j2 -k -p --reason \ + ./run_test.sh --use-conda -j2 -k -p \ --configfile $ORIG/test/test_configs/test_chipseq_regression.yaml \ --config sampletable=$ORIG/test/test_configs/chipseq_one_run.tsv \ merged_bigwigs="{}" \ @@ -228,7 +228,7 @@ variables: command: | source /opt/miniforge/etc/profile.d/conda.sh conda activate $LCDBWF_ENV - $DEPLOY/test/lcdb-wf-test references --run-workflow --configfile=config/config.yaml -j2 -p --reason -k --orig $ORIG + $DEPLOY/test/lcdb-wf-test references --run-workflow --configfile=config/config.yaml -j2 -p -k --orig $ORIG # -------------------------------------------------------------------------- # Standard RNA-seq workflow @@ -240,7 +240,7 @@ variables: source /opt/miniforge/etc/profile.d/conda.sh conda activate $LCDBWF_ENV $DEPLOY/test/lcdb-wf-test rnaseq --run-workflow -n - $DEPLOY/test/lcdb-wf-test rnaseq --run-workflow --use-conda -j2 -k -p --reason --orig $ORIG + $DEPLOY/test/lcdb-wf-test rnaseq --run-workflow --use-conda -j2 -k -p --orig $ORIG $DEPLOY/test/lcdb-wf-test rnaseq --trackhub --orig $ORIG @@ -272,12 +272,12 @@ variables: # provide; some of them use the --until argument to restrict the # rules that are run. Note the use of --orig $ORIG to use the test # configs from the original clone rather than the deployed directory. 
- $DEPLOY/test/lcdb-wf-test rnaseq --run-workflow --sra-pe -k --reason -p -j2 --use-conda --orig $ORIG - $DEPLOY/test/lcdb-wf-test rnaseq --run-workflow --sra-se -k --reason -p -j2 --use-conda --orig $ORIG - $DEPLOY/test/lcdb-wf-test rnaseq --run-workflow --strandedness-pe -k --reason -p -j2 --use-conda --orig $ORIG - $DEPLOY/test/lcdb-wf-test rnaseq --run-workflow --star-2pass -k --reason -p -j2 --use-conda --orig $ORIG - $DEPLOY/test/lcdb-wf-test rnaseq --run-workflow --star-1pass -k --reason -p -j2 --use-conda --orig $ORIG - $DEPLOY/test/lcdb-wf-test rnaseq --run-workflow --pe -k --reason -p -j2 --use-conda --orig $ORIG + $DEPLOY/test/lcdb-wf-test rnaseq --run-workflow --sra-pe -k -p -j2 --use-conda --orig $ORIG + $DEPLOY/test/lcdb-wf-test rnaseq --run-workflow --sra-se -k -p -j2 --use-conda --orig $ORIG + $DEPLOY/test/lcdb-wf-test rnaseq --run-workflow --strandedness-pe -k -p -j2 --use-conda --orig $ORIG + $DEPLOY/test/lcdb-wf-test rnaseq --run-workflow --star-2pass -k -p -j2 --use-conda --orig $ORIG + $DEPLOY/test/lcdb-wf-test rnaseq --run-workflow --star-1pass -k -p -j2 --use-conda --orig $ORIG + $DEPLOY/test/lcdb-wf-test rnaseq --run-workflow --pe -k -p -j2 --use-conda --orig $ORIG @@ -290,7 +290,7 @@ variables: cd $DEPLOY/workflows/colocalization source /opt/miniforge/etc/profile.d/conda.sh conda activate $LCDBWF_ENV - $DEPLOY/test/lcdb-wf-test colocalization --run-workflow -k --reason -p -j2 --use-conda --orig $ORIG + $DEPLOY/test/lcdb-wf-test colocalization --run-workflow -k -p -j2 --use-conda --orig $ORIG # -------------------------------------------------------------------------- # Syntax note: All of the steps above, with their "&step-name" labels, can be From 06c147b97e2df018114a2ff852b4e1850f1d5b73 Mon Sep 17 00:00:00 2001 From: Ryan Dale Date: Fri, 3 Jan 2025 02:38:38 +0000 Subject: [PATCH 023/196] disable colocalization workflow --- .circleci/config.yml | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git 
a/.circleci/config.yml b/.circleci/config.yml index 50b1051a..02b27915 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -140,7 +140,7 @@ variables: cp $ORIG/workflows/rnaseq/run_test.sh $DEPLOY/workflows/rnaseq/run_test.sh cp $ORIG/workflows/rnaseq/run_downstream_test.sh $DEPLOY/workflows/rnaseq/run_downstream_test.sh cp $ORIG/workflows/references/run_test.sh $DEPLOY/workflows/references/run_test.sh - cp $ORIG/workflows/colocalization/run_test.sh $DEPLOY/workflows/colocalization/run_test.sh + # cp $ORIG/workflows/colocalization/run_test.sh $DEPLOY/workflows/colocalization/run_test.sh mkdir $DEPLOY/ci mkdir $DEPLOY/test @@ -399,14 +399,14 @@ jobs: - *get-data - *rnaseq-misc-step - colocalization: - <<: *defaults - steps: - - checkout - - *restore_cache - - *set-path - - *get-data - - *colocalization-step + # colocalization: + # <<: *defaults + # steps: + # - checkout + # - *restore_cache + # - *set-path + # - *get-data + # - *colocalization-step references: <<: *defaults @@ -493,10 +493,10 @@ workflows: requires: - initial-setup - pytest - - colocalization: - requires: - - initial-setup - - pytest + # - colocalization: + # requires: + # - initial-setup + # - pytest - build-docs: requires: - initial-setup @@ -507,4 +507,4 @@ workflows: - chipseq - chipseq-misc - references - - colocalization + # - colocalization From bea0910394da3e82bfc3cb8a0b76a57943cd4c62 Mon Sep 17 00:00:00 2001 From: Ryan Dale Date: Fri, 3 Jan 2025 19:22:22 +0000 Subject: [PATCH 024/196] delete lots of stuff --- lib/aligners.py | 85 --- lib/common.py | 914 ------------------------ lib/helpers.py | 219 ------ lib/imports.py | 22 - lib/postprocess/adapters.py | 6 - lib/postprocess/dicty.py | 18 - lib/postprocess/hg19.py | 3 - lib/postprocess/hg38.py | 14 - workflows/references/Snakefile | 369 ---------- workflows/references/config/config.yaml | 6 - workflows/references/run_test.sh | 3 - 11 files changed, 1659 deletions(-) delete mode 100644 lib/aligners.py delete mode 100644 
lib/common.py delete mode 100644 lib/helpers.py delete mode 100644 lib/imports.py delete mode 100644 lib/postprocess/adapters.py delete mode 100644 lib/postprocess/dicty.py delete mode 100644 lib/postprocess/hg19.py delete mode 100644 lib/postprocess/hg38.py delete mode 100644 workflows/references/Snakefile delete mode 100644 workflows/references/config/config.yaml delete mode 100755 workflows/references/run_test.sh diff --git a/lib/aligners.py b/lib/aligners.py deleted file mode 100644 index 62fe58a5..00000000 --- a/lib/aligners.py +++ /dev/null @@ -1,85 +0,0 @@ -""" -Helper functions for working with aligners within Snakefiles -""" - - -def hisat2_index_from_prefix(prefix): - """ - Given a prefix, return a list of the corresponding hisat2 index files. - """ - return ['{prefix}.{n}.ht2'.format(prefix=prefix, n=n) for n in range(1, 9)] - - -def prefix_from_hisat2_index(index_files): - """ - Given a list of index files for hisat2, return the corresponding prefix. - """ - if isinstance(index_files, str): - return '.'.join(index_files.split('.')[:-2]) - else: - prefixes = list( - set( - map( - lambda x: '.'.join(x.split('.')[:-2]), index_files) - ) - ) - if len(prefixes) != 1: - raise ValueError( - "More than one prefix detected from '{0}'".format(prefixes) - ) - return prefixes[0] - - -def bowtie2_index_from_prefix(prefix): - """ - Given a prefix, return a list of the corresponding bowtie2 index files. - """ - return ( - [ - '{prefix}.{n}.bt2'.format(prefix=prefix, n=n) - for n in range(1, 5) - ] + [ - '{prefix}.rev.{n}.bt2'.format(prefix=prefix, n=n) - for n in range(1, 3) - ] - ) - - -def prefix_from_bowtie2_index(index_files): - """ - Given a list of index files for bowtie2, return the corresponding prefix. 
- """ - if isinstance(index_files, str): - return '.'.join(index_files.replace('.rev', '').split('.')[:-2]) - else: - prefixes = list( - set( - map( - lambda x: '.'.join(x.replace('.rev', '').split('.')[:-2]), - index_files) - ) - ) - if len(prefixes) != 1: - raise ValueError( - "More than one prefix detected from '{0}'".format(prefixes) - ) - return prefixes[0] - -def fastq_arg_from_input(fastqs): - """ - Prepares the correct input FASTQ arguments for bowtie2 and HISAT2 based on - whether or not the sample is paired-end. - - Parameters - ---------- - fastqs : list-like - List or snakemake.input object containing fastq filenames. - """ - - if isinstance(fastqs, str) or len(fastqs) == 1: - fastqs = '-U {0} '.format(fastqs) - else: - assert len(fastqs) == 2 - fastqs = '-1 {0} -2 {1} '.format(*fastqs) - return fastqs - diff --git a/lib/common.py b/lib/common.py deleted file mode 100644 index 829cc129..00000000 --- a/lib/common.py +++ /dev/null @@ -1,914 +0,0 @@ -import glob -import subprocess -import time -import os -import warnings -import urllib.request as request -import contextlib -import yaml -import pandas -from Bio import SeqIO -import gzip -import binascii -from lib.imports import resolve_name -from lib import aligners -from lib import utils -from snakemake.shell import shell -from snakemake.io import expand - -# List of possible keys in config that are to be interpreted as paths -PATH_KEYS = [ - 'references_dir', - 'sampletable', - 'sample_dir', - 'aggregation_dir', - 'merged_dir', - 'peaks_dir', - 'hub_config', -] - - -def _is_gzipped(fn): - """ - Filename-independent method of checking if a file is gzipped or not. Uses - the magic number. - - xref https://stackoverflow.com/a/47080739 - """ - with open(fn, 'rb') as f: - return binascii.hexlify(f.read(2)) == b'1f8b' - - -def openfile(tmp, mode): - """ - Returns an open file handle; auto-detects gzipped files. 
- """ - if _is_gzipped(tmp): - return gzip.open(tmp, mode) - else: - return open(tmp, mode) - - -def resolve_config(config, workdir=None): - """ - Finds the config file. - - Parameters - ---------- - config : str, dict - If str, assume it's a YAML file and parse it; otherwise pass through - - workdir : str - Optional location to specify relative location of all paths in `config` - """ - if isinstance(config, str): - config = yaml.load(open(config), Loader=yaml.FullLoader) - - def rel(pth): - if workdir is None or os.path.isabs(pth): - return pth - return os.path.join(workdir, pth) - for key in PATH_KEYS: - if key in config: - config[key] = rel(config[key]) - return config - - -def gzipped(tmpfiles, outfile): - """ - Cat-and-gzip a list of uncompressed files into a compressed output file. - """ - with gzip.open(outfile, 'wt') as fout: - for f in tmpfiles: - with open(f) as infile: - for line in infile: - fout.write(line) - - -def cat(tmpfiles, outfile): - """ - Simple concatenation of files. - - Note that gzipped files can be concatenated as-is without un- and re- - compressing. - """ - shell('cat {tmpfiles} > {outfile}') - - -def filter_fastas(tmpfiles, outfile, pattern): - """ - Extract records from fasta file(s) given a search pattern. - - Given input gzipped FASTAs, create a new gzipped fasta containing only - records whose description matches `pattern`. 
- - Parameters - ---------- - tmpfiles : list - gzipped fasta files to look through - - outfile : str - gzipped output fastq file - - pattern : str - Look for this string in each record's description - - """ - def gen(): - for tmp in tmpfiles: - handle = gzip.open(tmp, 'rt') - parser = SeqIO.parse(handle, 'fasta') - for rec in parser: - if pattern not in rec.description: - continue - rec.seq = rec.seq.back_transcribe() - rec.description = rec.name - yield rec - - with gzip.open(outfile, 'wt') as fout: - SeqIO.write(gen(), fout, 'fasta') - - -def twobit_to_fasta(tmpfiles, outfile): - """ - Converts .2bit files to fasta. - - Parameters - ---------- - tmpfiles : list - 2bit files to convert - - outfile : str - gzipped output fastq file - """ - # Note that twoBitToFa doesn't support multiple input files, but we want to - # support them with this function - lookup = {i: i + '.fa' for i in tmpfiles} - for i in tmpfiles: - fn = lookup[i] - shell('twoBitToFa {i} {fn}') - - # Make sure we retain the order of the originally-provided files from the - # config when concatenating. - fastas = [lookup[i] for i in tmpfiles] - shell('cat {fastas} | gzip -c > {outfile}') - shell('rm {fastas}') - - -def download_and_postprocess(outfile, config, organism, tag, type_): - """ - Given an output file, figure out what to do based on the config. - - See notes below for details. - - Parameters - ---------- - outfile : str - - config : dict - - organism : str - Which organism to use. Must be a key in the "references" section of the - config. - - tag : str - Which tag for the organism to use. Must be a tag for the organism in - the config - - type_ : str - A supported references type (gtf, fasta) to use. 
- - Notes - ----- - - This function: - - - uses `organism`, `tag`, `type_` as a key into the config dict to - figure out: - - - what postprocessing function (if any) was specified along with - its optional args - - the URL[s] to download - - - resolves the name of the postprocessing function (if provided) and - imports it - - downloads the URL[s] to tempfile[s] - - calls the imported postprocessing function using the tempfile[s] and - outfile plus any additional specified arguments. - - - The postprocessing function must have one of the following signatures, - where `infiles` contains the list of temporary files downloaded from the - URL or URLs specified, and `outfile` is a gzipped file expected to be - created by the function:: - - def func(infiles, outfile): - pass - - or:: - - def func(infiles, outfile, *args): - pass - - or:: - - def func(infiles, outfile, *args, **kwargs): - pass - - - The function is specified as a string that resolves to an importable - function, e.g., `postprocess: lib.postprocess.dm6.fix` will call a function - called `fix` in the file `lib/postprocess/dm6.py`. - - If the contents of `postprocess:` is a dict, it must have at least the key - `function`, and optionally `args` and/or `kwargs` keys. The `function` key - indicates the importable path to the function. `args` can be a string - or list of arguments that will be provided as additional args to a function - with the second kind of signature above. If `kwargs` is provided, it is - a dict that is passed to the function with the third kind of signature - above. For example:: - - postprocess: - function: lib.postprocess.dm6.fix - args: - - True - - 3 - - or:: - - postprocess: - function: lib.postprocess.dm6.fix - args: - - True - - 3 - kwargs: - skip: exon - - """ - - def default_postprocess(origfn, newfn): - """ - If no other postprocess function is defined, then simply move the - original to the new. 
- """ - shell("mv {origfn} {newfn}") - - block = config['references'][organism][tag][type_] - - # postprocess can be missing, in which case we use the default above - post_process = block.get('postprocess', None) - - if not isinstance(post_process, list): - post_process = [post_process] - - funcs = [] - func_tmpfiles = [] - for i, post_process_block in enumerate(post_process): - if post_process_block is None: - func = default_postprocess - args = () - kwargs = {} - name = None - - # postprocess can have a single string value (indicating the function) or - # it can be a dict with keys "function" and optionally "args". The value of - # "args" can be a string or a list. - else: - if isinstance(post_process_block, dict): - name = post_process_block.get('function', post_process) - args = post_process_block.get('args', ()) - kwargs = post_process_block.get('kwargs', {}) - if isinstance(args, str): - args = (args,) - elif isinstance(post_process_block, str): - name = post_process_block - args = () - kwargs = {} - - # In the special case where there is kwarg beginning and ending - # with "__", this can be a dotted function name so it will be - # resolved here as well and passed along to the postprocessing - # function. - # - # This makes it possible to do things like add ERCC annotations on - # the end of other annotations that themselves need to be - # post-processed. 
- for kw in kwargs: - if kw.startswith('__') and kw.endswith('__'): - kwargs[kw] = resolve_name(kwargs[kw]) - - # import the function - func = resolve_name(name) - - tmp_outfile = f'{outfile}.{i}.{name}.tmp' - func_tmpfiles.append(tmp_outfile) - funcs.append([func, args, kwargs, tmp_outfile]) - - # The last func's outfile should be the final outfile - funcs[-1][-1] = outfile - - # as described in the docstring above, functions are to assume a list of - # urls - urls = block['url'] - if isinstance(urls, str): - urls = [urls] - - # Download tempfiles into reasonably-named filenames - tmpfiles = ['{0}.{1}.tmp'.format(outfile, i) for i in range(len(urls))] - tmpinputfiles = tmpfiles - try: - for url, tmpfile in zip(urls, tmpfiles): - if url.startswith('file:'): - url = url.replace('file://', '') - shell('cp {url} {tmpfile} 2> {outfile}.log') - else: - shell("wget {url} -O- > {tmpfile} 2> {outfile}.log") - - for func, args, kwargs, outfile in funcs: - func(tmpinputfiles, outfile, *args, **kwargs) - tmpinputfiles = [outfile] - - except Exception as e: - raise e - finally: - for i in tmpfiles + func_tmpfiles: - if os.path.exists(i): - shell('rm {i}') - - -def references_dict(config): - """ - Transforms the references section of the config file. - - The references section of the config file is designed to be human-editable, - and to only need the URL(s). User-specified indexes, conversions, and - post-processing functions can also be added. - - For example, the config might say:: - - human: - gencode: - fasta: - indexes: - - hisat2 - - In this function, we need to convert that "indexes: [hisat2]" into the full - path of the hisat2 index that can be used as input for a Snakemake rule. In - this example, in the dictionary returned below we can then get that path - with `d['human']['gencode']['hisat2']`, or more generally, - `d[organism][tag][type]`. 
- - Parameters - ---------- - config : dict - - Notes - ----- - - The config file is designed to be easy to edit and use from the user's - standpoint. But it's not so great for practical usage. Here we convert the - config file which has the format:: - - ... references_dir: "/data" - ... references: - ... dm6: - ... r6-11: - ... metadata: - ... reference_genome_build: 'dm6' - ... reference_effective_genome_count: 1.2e7 - ... reference_effective_genome_proportion: 0.97 - ... genome: - ... url: "" - ... indexes: - ... - bowtie2 - ... - hisat2 - ... annotation: - ... url: "" - ... conversions: - ... - refflat - ... transcriptome: - ... indexes: - ... - salmon - - To this format:: - - ... 'dm6': { - ... 'r6-11': { - ... 'annotation': '/data/dm6/r6-11/annotation/dm6_r6-11.gtf', - ... 'bowtie2': '/data/dm6/r6-11/genome/bowtie2/dm6_r6-11.1.bt2', - ... 'bowtie2_fasta': '/data/dm6/r6-11/genome/bowtie2/dm6_r6-11.fasta', - ... 'chromsizes': '/data/dm6/r6-11/genome/dm6_r6-11.chromsizes', - ... 'genome': '/data/dm6/r6-11/genome/dm6_r6-11.fasta', - ... 'hisat2': '/data/dm6/r6-11/genome/hisat2/dm6_r6-11.1.ht2', - ... 'hisat2_fasta': '/data/dm6/r6-11/genome/hisat2/dm6_r6-11.fasta', - ... 'refflat': '/data/dm6/r6-11/annotation/dm6_r6-11.refflat', - ... 'salmon': '/data/dm6/r6-11/transcriptome/salmon/dm6_r6-11/versionInfo.json', - ... 'salmon_fasta': '/data/dm6/r6-11/transcriptome/salmon/dm6_r6-11.fasta', - ... 'transcriptome': '/data/dm6/r6-11/transcriptome/dm6_r6-11.fasta', - ... }, - ... } - - """ - if isinstance(config, str): - config = yaml.load(open(config), Loader=yaml.FullLoader) - - references_dir = get_references_dir(config) - - # Map "indexes" value to a pattern specific to each index. 
- index_extensions = { - 'bowtie2': aligners.bowtie2_index_from_prefix('')[0], - 'hisat2': aligners.hisat2_index_from_prefix('')[0], - 'star': '/Genome', - - # Notes on salmon indexing: - # - pre-1.0 versions had hash.bin - # - post-1.0 versions do not have hash.bin but do have several other - # different .bin files - # - both appear to have versionInfo.json - # - # In order to support both, we use a filename found in common between - # the version. - 'salmon': '/versionInfo.json', - 'kallisto': '/transcripts.idx', - } - - conversion_extensions = { - - 'intergenic': '.intergenic.gtf', - 'refflat': '.refflat', - 'gffutils': '.gtf.db', - 'bed12': '.bed12', - 'genelist': '.genelist', - 'annotation_hub': '.{keytype}.csv', - 'mappings': '.mapping.tsv.gz', - } - - d = {} - conversion_kwargs = {} - - merged_references = config['references'] - - type_extensions = { - 'genome': 'fasta', - 'annotation': 'gtf', - 'transcriptome': 'fasta' - } - - for organism in merged_references.keys(): - d[organism] = {} - for tag in merged_references[organism].keys(): - e = {} - for type_, block in merged_references[organism][tag].items(): - if type_ == 'metadata': - continue - try: - type_extension = type_extensions[type_] - - except KeyError: - raise ValueError( - - "KeyError: " + type_ + "\n" - "\nConfig file format has changed:\n" - " - 'fasta:' -> 'genome:'\n" - " - 'gtf:' -> 'annotation:'\n" - " - new 'transcriptome:' section\n" - "\nSee docs for details\n\n" - - ) - e[type_] = ( - '{references_dir}/' - '{organism}/' - '{tag}/' - '{type_}/' - '{organism}_{tag}.{type_extension}'.format(**locals()) - ) - - # Add conversions if specified. - if type_ == 'annotation': - conversions = block.get('conversions', []) - for conversion in conversions: - kwargs = {} - if isinstance(conversion, dict): - # if conversion is specified as dict, we assume - # that there is only one key, and that key is the - # actual name of the conversion; the corresponding - # value will be kwargs. This is used e.g. 
for - # gffutils conversion which often need some - # tweaking of args depending on the gtf format. - assert len(list(conversion.keys())) == 1 - kwargs = list(conversion.values())[0] - conversion = list(conversion.keys())[0] - - # While the full set of columns for annotation hub are - # not known in advance, we can assume at least the - # keytype provided will be an output file. Fill that in - # here. - if conversion == 'annotation_hub': - keytype = kwargs['keytype'] - ext = conversion_extensions[conversion].format(keytype=keytype) - else: - ext = conversion_extensions[conversion] - output = ( - '{references_dir}/' - '{organism}/' - '{tag}/' - '{type_}/' - '{organism}_{tag}{ext}'.format(**locals()) - ) - e[conversion] = output - - conversion_kwargs[output] = kwargs - - if type_ in ['genome', 'transcriptome']: - # Add indexes if specified - indexes = block.get('indexes', []) - for index in indexes: - ext = index_extensions[index] - - e[index] = ( - '{references_dir}/{organism}/{tag}/{type_}/{index}/{organism}_{tag}{ext}' - .format(**locals()) - ) - - # Each index will get the original fasta symlinked over - # to its directory - e[index + '_fasta'] = ( - '{references_dir}/{organism}/{tag}/{type_}/{index}/{organism}_{tag}.fasta' - .format(**locals()) - ) - - # Only makes sense to have chromsizes for genome fasta, not transcriptome. - if type_ == 'genome': - e['chromsizes'] = ( - '{references_dir}/' - '{organism}/' - '{tag}/' - '{type_}/' - '{organism}_{tag}.chromsizes'.format(**locals()) - ) - d[organism][tag] = e - return d, conversion_kwargs - - -def get_references_dir(config): - """ - Identify the references directory based on config and env vars. - - Returns the references dir, preferring the value of an existing environment - variable `REFERENCES_DIR` over the config entry "references_dir". Raise an - error if either can't be found. 
- - Parameters - ---------- - config : dict - """ - config = resolve_config(config) - references_dir = os.environ.get( - 'REFERENCES_DIR', config.get('references_dir', None)) - if references_dir is None: - raise ValueError('No references dir specified') - return references_dir - - -def get_sampletable(config): - """ - Return samples and pandas.DataFrame of parsed sampletable. - - Returns the sample IDs and the parsed sampletable from the file specified - in the config. - - The sample IDs are assumed to be the first column of the sampletable. - - Parameters - ---------- - config : dict - """ - config = resolve_config(config) - sampletable = pandas.read_csv(config['sampletable'], comment="#", sep='\t') - samples = sampletable.iloc[:, 0] - return samples, sampletable - - -def get_techreps(sampletable, label): - """ - Return all sample IDs for which the "label" column is `label`. - """ - # since we're not requiring a name but we want to use `loc` - first_col = sampletable.columns[0] - result = list(sampletable.loc[sampletable['label'] == label, first_col]) - - # If we're using a ChIP-seq-like sampletable we can provide a more - # informative error message. - - is_chipseq = 'antibody' in sampletable.columns - if is_chipseq: - err = (""" - No technical replicates found for label '{}'. Check the ChIP-seq config - file to ensure the peak-calling section only specifies values from the - sampletable's "label" column.""".format(label) - ) - else: - err = "No technical replicates found for label '{}'.".format(label) - - if len(result) == 0: - raise ValueError(err) - - return result - - -def load_config(config, missing_references_ok=False): - """ - Loads the config. - - Resolves any included references directories/files and runs the deprecation - handler. - """ - if isinstance(config, str): - config = yaml.load(open(config), Loader=yaml.FullLoader) - - # Here we populate a list of reference sections. 
Items later on the list - # will have higher priority - includes = config.get('include_references', []) - for i in includes: - if not os.path.exists(i): - raise ValueError("include_references: '{}' does not exist".format(i)) - reference_sections = [] - - # First the directories. Directories that come earlier lose to those that - # come later. - for dirname in filter(os.path.isdir, includes): - # Note we're looking recursively for .yaml and .yml, so very large - # reference directories are possible - for fn in glob.glob(os.path.join(dirname, '**/*.y?ml'), - recursive=True): - refs = yaml.load(open(fn), Loader=yaml.FullLoader).get('references', None) - if refs is None: - if not missing_references_ok: - raise ValueError("No 'references:' section in {0}".format(fn)) - else: - reference_sections.append(refs) - - # Now the files - for fn in filter(os.path.isfile, includes): - refs = yaml.load(open(fn), Loader=yaml.FullLoader).get('references', None) - if refs is None: - if not missing_references_ok: - raise ValueError("No 'references:' section in {0}".format(fn)) - else: - reference_sections.append(refs) - - # The last thing we include is the references section as written in the - # config, which wins over all. - reference_sections.append(config.get('references', {})) - - merged_references = {} - for ref in reference_sections: - for organism in ref.keys(): - org_dict = merged_references.get(organism, {}) - for tag in ref[organism].keys(): - org_dict[tag] = ref[organism][tag] - merged_references[organism] = org_dict - config['references'] = merged_references - - # Run the deprecation handler on the final config - config = deprecation_handler(config) - - return config - - -def deprecation_handler(config): - """ - Checks the config to see if anything has been deprecated. - - Also makes any fixes that can be done automatically. 
- """ - if 'assembly' in config: - config['organism'] = config['assembly'] - warnings.warn( - "'assembly' should be replaced with 'organism' in config files. " - "As a temporary measure, a new 'organism' key has been added with " - "the value of 'assembly'", - DeprecationWarning) - - for org, block1 in config.get('references', {}).items(): - for tag, block2 in block1.items(): - gtf_conversions = block2.get('gtf', {}).get('conversions', []) - for c in gtf_conversions: - if isinstance(c, dict) and 'annotation_hub' in c: - warnings.warn( - "You may want to try the 'mappings' conversion rather " - "than 'annotation_hub' since it works directly off " - "the GTF file rather than assuming concordance between " - "GTF and AnnoationHub instances", - DeprecationWarning) - - return config - - -def is_paired_end(sampletable, sample): - """ - Inspects the sampletable to see if the sample is paired-end or not - - Parameters - ---------- - sampletable : pandas.DataFrame - Contains a "layout" or "LibraryLayout" column (but not both). If the - lowercase value is "pe" or "paired", consider the sample paired-end. - Otherwise consider single-end. - - sample : str - Assumed to be found in the first column of `sampletable` - """ - # We can't fall back to detecting PE based on two fastq files provided for - # each sample when it's an SRA sampletable (which only has SRR accessions). - # - # So detect first detect if SRA sampletable based on presence of "Run" - # column and all values of that column starting with "SRR", and then raise - # an error if the Layout column does not exist. - - if "Run" in sampletable.columns: - if all(sampletable["Run"].str.startswith("SRR")): - if "Layout" not in sampletable.columns and "layout" not in sampletable.columns: - raise ValueError( - "Sampletable appears to be SRA, but no 'Layout' column " - "found. 
This is required to specify single- or paired-end " - "libraries.") - - row = sampletable.set_index(sampletable.columns[0]).loc[sample] - if 'orig_filename_R2' in row: - return True - if 'layout' in row and 'LibraryLayout' in row: - raise ValueError("Expecting column 'layout' or 'LibraryLayout', " - "not both") - try: - return row['layout'].lower() in ['pe', 'paired'] - except KeyError: - pass - try: - return row['LibraryLayout'].lower() in ['pe', 'paired'] - except KeyError: - pass - return False - - -def fill_r1_r2(sampletable, pattern, r1_only=False): - """ - Returns a function intended to be used as a rule's input function. - - The returned function, when provided with wildcards, will return one or two - rendered versions of a pattern depending on SE or PE respectively. - Specifically, given a pattern (which is expected to contain a placeholder - for "{sample}" and "{n}"), look up in the sampletable whether or not it is - paired-end. - - Parameters - ---------- - - sampletable : pandas.DataFrame - Contains a "layout" column with either "SE" or "PE", or "LibraryLayout" - column with "SINGLE" or "PAIRED". If column does not exist, assume SE. - - pattern : str - Must contain at least a "{sample}" placeholder. - - r1_only : bool - If True, then only return the file for R1 even if PE is configured. - """ - def func(wc): - try: - wc.sample - except AttributeError: - raise ValueError( - 'Need "{{sample}}" in pattern ' - '"{pattern}"'.format(pattern=pattern)) - n = [1] - if is_paired_end(sampletable, wc.sample) and not r1_only: - n = [1, 2] - res = expand(pattern, sample=wc.sample, n=n) - return res - return func - - -def pluck(obj, kv): - """ - For a given dict or list that somewhere contains keys `kv`, return the - values of those keys. 
- - Named after the dplyr::pluck, and implemented based on - https://stackoverflow.com/a/1987195 - """ - if isinstance(obj, list): - for i in obj: - for x in pluck(i, kv): - yield x - elif isinstance(obj, dict): - if kv in obj: - yield obj[kv] - for j in obj.values(): - for x in pluck(j, kv): - yield x - - -def check_url(url, verbose=False): - """ - Try to open -- and then immediately close -- a URL. - - Any exceptions can be handled upstream. - - """ - - # Some notes here: - # - # - A pure python implementation isn't great because urlopen seems to - # cache or hold sessions open or something. EBI servers reject responses - # because too many clients are connected. This doesn't happen using curl. - # - # - Using the requests module doesn't help, because urls can be ftp:// and - # requests doesn't support that. - # - # - Similarly, using asyncio and aiohttp works great for https, but not - # ftp (I couldn't get aioftp to work properly). - # - # - Not all servers support --head. An example of this is - # https://www-s.nist.gov/srmors/certificates/documents/SRM2374_Sequence_v1.FASTA. - # - # - Piping curl to head using the -c arg to use bytes seems to work. - # However, we need to set pipefail (otherwise because head exits 0 the - # whole thing exits 0). And in that case, we expect curl to exit every - # time with exit code 23, which is "failed to write output", because of - # the broken pipe. This is handled below. 
- # - if verbose: - print(f'Checking {url}') - - # Notes on curl args: - # - # --max-time to allow the server some seconds to respond - # --retry to allow multiple tries if transient errors (4xx for FTP, 5xx for HTTP) are found - # --silent to not print anything - # --fail to return non-zero exit codes for 404 (default is exit 0 on hitting 404) - # - # Need to run through bash explicitly to get the pipefail option, which in - # turn means running with shell=True - proc = subprocess.run(f'/bin/bash -o pipefail -c "curl --retry 3 --max-time 10 --silent --fail {url} | head -c 10 > /dev/null"', shell=True) - return proc - - -def check_urls(config, verbose=False): - """ - Given a config filename or existing object, extract the URLs and check - them. - - Parameters - ---------- - - config : str or dict - Config object to inspect - - verbose : bool - Print which URL is being checked - - wait : int - Number of seconds to wait in between checking URLs, to avoid - too-many-connection issues - """ - config = load_config(config, missing_references_ok=True) - failures = [] - urls = list(set(utils.flatten(pluck(config, 'url')))) - for url in urls: - if url.startswith('file://'): - continue - - res = check_url(url, verbose=verbose) - - # we expect exit code 23 because we're triggering SIGPIPE with the - # "|head -c" above. - if res.returncode and res.returncode != 23: - failures.append(f'FAIL with exit code {res.returncode}. Command was: {res.args}') - if failures: - output = '\n '.join(failures) - raise ValueError(f'Found problematic URLs. See https://ec.haxx.se/usingcurl/usingcurl-returns for explanation of exit codes.\n {output}') - - -def check_all_urls_found(verbose=True): - """ - Recursively loads all references that can be included and checks them. - Reports out if there are any failures. 
- """ - check_urls({'include_references': [ - 'include/reference_configs', - 'test/test_configs', - 'workflows/rnaseq/config', - 'workflows/chipseq/config', - 'workflows/references/config', - ]}, verbose=verbose) - - -def gff2gtf(gff, gtf): - """ - Converts a gff file to a gtf format using the gffread function from Cufflinks - """ - if _is_gzipped(gff[0]): - shell('gzip -d -S .gz.0.tmp {gff} -c | gffread - -T -o- | gzip -c > {gtf}') - else: - shell('gffread {gff} -T -o- | gzip -c > {gtf}') diff --git a/lib/helpers.py b/lib/helpers.py deleted file mode 100644 index 9e0d6323..00000000 --- a/lib/helpers.py +++ /dev/null @@ -1,219 +0,0 @@ -import collections -import re -from itertools import product -import pandas as pd -from snakemake.shell import shell -from snakemake.io import expand, regex_from_filepattern -from lib import common - - -class ConfigurationError(Exception): - pass - - -def detect_layout(sampletable): - """ - Identifies whether a sampletable represents single-end or paired-end reads. - - Raises NotImplementedError if there's a mixture. - """ - is_pe = [common.is_paired_end(sampletable, s) for s in sampletable.iloc[:, 0]] - if all(is_pe): - return "PE" - elif not any(is_pe): - return "SE" - else: - p = sampletable.iloc[is_pe, 0].to_list() - s = sampletable.iloc[[not i for i in is_pe], 0].to_list() - if len(p) > len(s): - report = f"SE samples: {s}" - else: - report = f"PE samples: {p}" - raise ValueError(f"Only a single layout (SE or PE) is supported. {report}") - - -def fill_patterns(patterns, fill, combination=product): - """ - Fills in a dictionary of patterns with the dictionary `fill`. 
- - >>> patterns = dict(a='{sample}_R{N}.fastq') - >>> fill = dict(sample=['one', 'two', 'three'], N=[1, 2]) - >>> sorted(fill_patterns(patterns, fill)['a']) - ['one_R1.fastq', 'one_R2.fastq', 'three_R1.fastq', 'three_R2.fastq', 'two_R1.fastq', 'two_R2.fastq'] - - If using `zip` as a combination, checks to ensure all values in `fill` are - the same length to avoid truncated output. - - This fails: - - >>> patterns = dict(a='{sample}_R{N}.fastq') - >>> fill = dict(sample=['one', 'two', 'three'], N=[1, 2]) - >>> sorted(fill_patterns(patterns, fill, zip)['a']) # doctest: +IGNORE_EXCEPTION_DETAIL - Traceback (most recent call last): - ... - ValueError: {'sample': ['one', 'two', 'three'], 'N': [1, 2]} does not have the same number of entries for each key - - But this works: - - >>> patterns = dict(a='{sample}_R{N}.fastq') - >>> fill = dict(sample=['one', 'one', 'two', 'two', 'three', 'three'], N=[1, 2, 1, 2, 1, 2]) - >>> sorted(fill_patterns(patterns, fill, zip)['a']) - ['one_R1.fastq', 'one_R2.fastq', 'three_R1.fastq', 'three_R2.fastq', 'two_R1.fastq', 'two_R2.fastq'] - - """ - # In recent Snakemake versions (e.g., this happens in 5.4.5) file patterns - # with no wildcards in them are removed from expand when `zip` is used as - # the combination function. 
- # - # For example, in 5.4.5: - # - # expand('x', zip, d=[1,2,3]) == [] - # - # But in 4.4.0: - # - # expand('x', zip, d=[1,2,3]) == ['x', 'x', 'x'] - - if combination == zip: - lengths = set([len(v) for v in fill.values()]) - if len(lengths) != 1: - raise ValueError(f"{fill} does not have the same number of entries for each key") - - def update(d, u, c): - for k, v in u.items(): - if isinstance(v, collections.abc.Mapping): - r = update(d.get(k, {}), v, c) - d[k] = r - else: # not a dictionary, so we're at a leaf - if isinstance(fill, pd.DataFrame): - d[k] = list(set(expand(u[k], zip, **fill.to_dict("list")))) - else: - d[k] = list(set(expand(u[k], c, **fill))) - if not d[k]: - d[k] = [u[k]] - return d - - d = {} - return update(d, patterns, combination) - - -def extract_wildcards(pattern, target): - """ - Return a dictionary of wildcards and values identified from `target`. - - Returns None if the regex match failed. - - Parameters - ---------- - pattern : str - Snakemake-style filename pattern, e.g. ``{output}/{sample}.bam``. - - target : str - Filename from which to extract wildcards, e.g., ``data/a.bam``. - - Examples - -------- - >>> pattern = '{output}/{sample}.bam' - >>> target = 'data/a.bam' - >>> expected = {'output': 'data', 'sample': 'a'} - >>> assert extract_wildcards(pattern, target) == expected - >>> assert extract_wildcards(pattern, 'asdf') is None - """ - m = re.compile(regex_from_filepattern(pattern)).match(target) - if m: - return m.groupdict() - - -def rscript(string, scriptname, log=None): - """ - Saves the string as `scriptname` and then runs it - - Parameters - ---------- - string : str - Filled-in template to be written as R script - - scriptname : str - File to save script to - - log : str - File to redirect stdout and stderr to. If None, no redirection occurs. 
- """ - with open(scriptname, "w") as fout: - fout.write(string) - if log: - _log = "> {0} 2>&1".format(log) - else: - _log = "" - shell("Rscript {scriptname} {_log}") - - -def check_unique_fn(df): - """ - Raises an error if the fastq filenames are not unique - """ - fns = df["orig_filename"] - if "orig_filename_R2" in df.columns: - fns = pd.concat([fns, df["orig_filename_R2"]]) - if len(fns.unique()) < len(fns): - raise ValueError("Fastq filenames non unique, check the sampletable\n") - - -def check_unique_samplename(df): - """ - Raises an error if the samplenames are not unique - """ - ns = df.index - if len(ns.unique()) < len(ns): - raise ConfigurationError("Samplenames non unique, check the sampletable\n") - - -def preflight(config): - """ - Performs verifications on config and sampletable files - - Parameters - ---------- - config: yaml config object - """ - sampletable = pd.read_table(config["sampletable"], index_col=0, comment="#") - check_unique_samplename(sampletable) - if "orig_filename" in sampletable.columns: - check_unique_fn(sampletable) - - -def rnaseq_preflight(c): - if "kallisto" not in c.config: - raise ConfigurationError( - """ - Starting in v1.8, an additional 'kallisto' argument is expected - in the config file. Note that in the future this may be - automatically included, but for now please add the following to the - config, where 'tagname' is the tag for the reference of interest: - - kallisto: - tag: "tagname" - """ - ) - - -def chipseq_preflight(c): - pass - - -def strand_arg_lookup(config, lookup): - """ - Given a config object and lookup dictionary, confirm that the config has - correctly specified strandedness and then return the value for that key. - """ - if not config.stranded: - raise ConfigurationError( - "Starting in v1.8, 'stranded' is required in the config file. " - "Values can be 'unstranded', 'fr-firststrand' (R1 aligns antisense to original transcript), " - "or 'fr-secondstrand' (R1 aligns sense to original transcript). 
If you are not sure, " - "run the workflow with only the 'strand_check' rule, like " - "'snakemake -j 5 strand_check'." - ) - if config.stranded not in lookup: - keys = list(lookup.keys()) - raise KeyError(f"'{config.stranded}' not one of {keys}") - return lookup[config.stranded] diff --git a/lib/imports.py b/lib/imports.py deleted file mode 100644 index f790ef6f..00000000 --- a/lib/imports.py +++ /dev/null @@ -1,22 +0,0 @@ -def resolve_name(name): - """ - Imports a specific object from a dotted path and returns just that object. - - From nose.utils.resolve_name (with the logging parts taken out) which in - turn is from unittest.TestLoader.loadTestByName - """ - parts = name.split('.') - parts_copy = parts[:] - while parts_copy: - try: - module = __import__('.'.join(parts_copy)) - break - except ImportError: - del parts_copy[-1] - if not parts_copy: - raise - parts = parts[1:] - obj = module - for part in parts: - obj = getattr(obj, part) - return obj diff --git a/lib/postprocess/adapters.py b/lib/postprocess/adapters.py deleted file mode 100644 index 1d8ab7ab..00000000 --- a/lib/postprocess/adapters.py +++ /dev/null @@ -1,6 +0,0 @@ -from snakemake.shell import shell - -def fasta_postprocess(origfn, newfn): - shell( - "gzip -c {origfn} > {newfn} " - "&& rm {origfn}") diff --git a/lib/postprocess/dicty.py b/lib/postprocess/dicty.py deleted file mode 100644 index 237cbbdd..00000000 --- a/lib/postprocess/dicty.py +++ /dev/null @@ -1,18 +0,0 @@ -from Bio import SeqIO -import gzip -from snakemake.shell import shell - -def rrna_postprocess(tmpfiles, outfile): - def gen(): - for tmp in tmpfiles: - handle = gzip.open(tmp, 'rt') - parser = SeqIO.parse(handle, 'fasta') - for rec in parser: - if 'Dictyostelium discoideum' not in rec.description: - continue - rec.seq = rec.seq.back_transcribe() - rec.description = rec.name - yield rec - - with gzip.open(outfile, 'wt') as fout: - SeqIO.write(gen(), fout, 'fasta') diff --git a/lib/postprocess/hg19.py b/lib/postprocess/hg19.py 
deleted file mode 100644 index 8d042432..00000000 --- a/lib/postprocess/hg19.py +++ /dev/null @@ -1,3 +0,0 @@ -from snakemake.shell import shell -def plus_lncrna_fasta_postprocess(tmpfiles, outfile): - shell('cat {tmpfiles} > {outfile}') diff --git a/lib/postprocess/hg38.py b/lib/postprocess/hg38.py deleted file mode 100644 index d21f54ad..00000000 --- a/lib/postprocess/hg38.py +++ /dev/null @@ -1,14 +0,0 @@ -import pybedtools -import gzip -from snakemake.shell import shell -import os - - -def strip_ensembl_version(infiles, outfile): - def transform(f): - f.attrs['gene_id'] = f.attrs['gene_id'].split('.')[0] - return f - with gzip.open(outfile, 'wt') as fout: - for infile in infiles: - for feature in pybedtools.BedTool(infile): - fout.write(str(transform(feature))) diff --git a/workflows/references/Snakefile b/workflows/references/Snakefile deleted file mode 100644 index 815d00c6..00000000 --- a/workflows/references/Snakefile +++ /dev/null @@ -1,369 +0,0 @@ -import os -import sys -import gzip -import yaml -import importlib -import tempfile -import pandas -from snakemake.utils import makedirs - -HERE = str(Path(workflow.snakefile).parent) -sys.path.insert(0, HERE + "/../..") -from lib.imports import resolve_name -from lib import utils -from lib.utils import autobump, gb, hours -from lib import aligners, helpers -from lib import common - -# Note: when running this workflow on its own (say, to generate all references -# ahead of time) you wil need to provide a config file from the command line. -# -# Otherwise, this file is expected to be `include:`ed into other workflows, -# which will have their own config files. 
- -config = common.load_config(config) - -references_dir = common.get_references_dir(config) -refdict, conversion_kwargs = common.references_dict(config) - -makedirs([references_dir, os.path.join(references_dir, 'logs')]) - -localrules: symlink_fasta_to_index_dir - -wildcard_constraints: - _type="genome|transcriptome|annotation", - _ext="fasta|gtf" - - -rule all_references: - input: utils.flatten(refdict) - - -rule download_and_process: - """Downloads the configured URL, applies any configured post-processing, and - saves the resulting gzipped file to *.fasta.gz or *.gtf.gz. - """ - output: - temporary('{references_dir}/{organism}/{tag}/{_type}/{organism}_{tag}.{_ext}.gz') - run: - common.download_and_postprocess(output[0], config, wildcards.organism, wildcards.tag, wildcards._type) - - -rule unzip: - """Generic rule to unzip files as needed, for example when building - indexes. - """ - input: - rules.download_and_process.output - output: - protected('{references_dir}/{organism}/{tag}/{_type}/{organism}_{tag}.{_ext}') - wildcard_constraints: - _type="genome|annotation" - log: - '{references_dir}/logs/{organism}/{tag}/{_type}/{organism}_{tag}.{_ext}.log' - shell: 'gunzip -c {input} > {output}' - - -rule bowtie2_index: - """ - Build bowtie2 index - """ - input: - '{references_dir}/{organism}/{tag}/genome/{organism}_{tag}.fasta' - output: - protected(aligners.bowtie2_index_from_prefix('{references_dir}/{organism}/{tag}/genome/bowtie2/{organism}_{tag}')) - log: - '{references_dir}/logs/{organism}/{tag}/genome/bowtie2/{organism}_{tag}.log' - resources: - runtime=autobump(hours=8), - mem_mb=autobump(gb=32), - disk_mb=autobump(gb=50) - run: - prefix = aligners.prefix_from_bowtie2_index(output) - shell( - 'bowtie2-build ' - '{input} ' - '{prefix} ' - '&> {log}') - - -rule star_index: - input: - fasta='{references_dir}/{organism}/{tag}/genome/{organism}_{tag}.fasta', - gtf='{references_dir}/{organism}/{tag}/annotation/{organism}_{tag}.gtf', - output: - 
protected('{references_dir}/{organism}/{tag}/genome/star/{organism}_{tag}/Genome') - log: - '{references_dir}/{organism}/{tag}/genome/star/{organism}_{tag}/Genome.log' - threads: - 8 - resources: - runtime=autobump(hours=8), - mem_mb=gb(64) - run: - genomedir = os.path.dirname(output[0]) - shell('rm -r {genomedir}') - shell('mkdir -p {genomedir}') - shell( - 'STAR ' - '--runMode genomeGenerate ' - '--runThreadN {threads} ' - '--genomeDir {genomedir} ' - '--genomeFastaFiles {input.fasta} ' - - # NOTE: GTF is optional - '--sjdbGTFfile {input.gtf} ' - - # NOTE: STAR docs say that 100 should work well. - '--sjdbOverhang 100 ' - - # NOTE: for small genomes, may need to scale this down to - # min(14, log2(GenomeLength) / 2 - 1) - # --genomeSAindexNbases 14 - '&> {log}' - ) - # STAR writes a hard-coded Log.out file to the current working - # directory. So put that on the end of the log file for the rule and - # then clean up. - shell('cat {genomedir}/Log.out >> {log} && rm {genomedir}/Log.out') - - -rule hisat2_index: - """ - Build HISAT2 index - """ - input: - '{references_dir}/{organism}/{tag}/genome/{organism}_{tag}.fasta' - output: - protected(aligners.hisat2_index_from_prefix('{references_dir}/{organism}/{tag}/genome/hisat2/{organism}_{tag}')) - log: - '{references_dir}/logs/{organism}/{tag}/genome/hisat2/{organism}_{tag}.log' - resources: - runtime=autobump(hours=8), - mem_mb=gb(32), - disk_mb=gb(50) - run: - prefix = aligners.prefix_from_hisat2_index(output) - shell( - 'hisat2-build ' - '{input} ' - '{prefix} ' - '&> {log}') - - -rule symlink_fasta_to_index_dir: - """Aligners often want the reference fasta in the same dir as the index, so - this makes the appropriate symlink - """ - input: - fasta='{references_dir}/{organism}/{tag}/{_type}/{organism}_{tag}.fasta' - output: - '{references_dir}/{organism}/{tag}/{_type}/{index}/{organism}_{tag}.fasta' - resources: - runtime=hours(1) - log: - 
'{references_dir}/logs/{organism}/{tag}/{_type}/{index}/{organism}_{tag}.fasta.log' - run: - utils.make_relative_symlink(input[0], output[0]) - - -rule transcriptome_fasta: - input: - fasta='{references_dir}/{organism}/{tag}/genome/{organism}_{tag}.fasta', - gtf='{references_dir}/{organism}/{tag}/annotation/{organism}_{tag}.gtf' - output: - protected('{references_dir}/{organism}/{tag}/transcriptome/{organism}_{tag}.fasta') - resources: - runtime=hours(1) - shell: - 'gffread {input.gtf} -w {output} -g {input.fasta}' - - -rule salmon_index: - "Build salmon index" - output: - protected('{references_dir}/{organism}/{tag}/transcriptome/salmon/{organism}_{tag}/versionInfo.json') - input: - fasta='{references_dir}/{organism}/{tag}/transcriptome/{organism}_{tag}.fasta' - log: - '{references_dir}/logs/{organism}/{tag}/transcriptome/salmon/{organism}_{tag}.log' - params: - outdir='{references_dir}/{organism}/{tag}/transcriptome/salmon/{organism}_{tag}' - resources: - mem_mb=gb(32), - runtime=hours(2) - shell: - 'salmon index ' - '--transcripts {input.fasta} ' - '--index {params.outdir} ' - '&> {log}' - - -rule kallisto_index: - "Build kallisto index" - output: - index=protected('{references_dir}/{organism}/{tag}/transcriptome/kallisto/{organism}_{tag}/transcripts.idx') - input: - fasta='{references_dir}/{organism}/{tag}/transcriptome/{organism}_{tag}.fasta' - log: - '{references_dir}/logs/{organism}/{tag}/transcriptome/kallisto/{organism}_{tag}.log' - resources: - runtime=hours(2), - mem_mb=gb(32), - shell: - 'kallisto index ' - '--index {output.index} ' - '{input.fasta} ' - '&> {log}' - - -rule conversion_refflat: - """Converts a GTF into refFlat format - """ - input: - '{references_dir}/{organism}/{tag}/annotation/{organism}_{tag}.gtf' - output: - protected('{references_dir}/{organism}/{tag}/annotation/{organism}_{tag}.refflat') - log: - '{references_dir}/logs/{organism}/{tag}/annotation/{organism}_{tag}.refflat.log' - resources: - runtime=hours(2), - mem_mb=gb(2) - shell: 
- 'gtfToGenePred -ignoreGroupsWithoutExons {input} {output}.tmp ' - '''&& awk '{{print $1"\t"$0}}' {output}.tmp > {output} ''' - '&& rm {output}.tmp ' - - -rule conversion_bed12: - input: - '{references_dir}/{organism}/{tag}/annotation/{organism}_{tag}.gtf' - output: - protected('{references_dir}/{organism}/{tag}/annotation/{organism}_{tag}.bed12') - resources: - runtime=hours(2), - mem_mb=gb(2) - shell: - 'gtfToGenePred -ignoreGroupsWithoutExons {input} {output}.tmp ' - '&& genePredToBed {output}.tmp {output} ' - '&& rm {output}.tmp' - -rule conversion_gffutils: - """Converts a GTF into a gffutils sqlite3 database - """ - input: - gtf='{references_dir}/{organism}/{tag}/annotation/{organism}_{tag}.gtf' - output: - db=protected('{references_dir}/{organism}/{tag}/annotation/{organism}_{tag}.gtf.db') - log: - '{references_dir}/logs/{organism}/{tag}/annotation/{organism}_{tag}.gtf.db.log' - resources: - runtime=hours(2), - mem_mb=gb(4) - run: - import gffutils - kwargs = conversion_kwargs[output[0]] - fd, tmpdb = tempfile.mkstemp(suffix='.db', prefix='gffutils_') - db = gffutils.create_db(data=input.gtf, dbfn=tmpdb, **kwargs) - shell('mv {tmpdb} {output.db}') - - -rule chromsizes: - """Creates a chromsizes table from fasta - """ - input: - '{references_dir}/{organism}/{tag}/genome/{organism}_{tag}.fasta' - output: - protected('{references_dir}/{organism}/{tag}/genome/{organism}_{tag}.chromsizes') - log: - '{references_dir}/logs/{organism}/{tag}/genome/{organism}_{tag}.fasta.log' - params: - # NOTE: Be careful with the memory here; make sure you have enough - # and/or it matches the resources you're requesting in the cluster - # config. 
- java_args='-Xmx20g' - # java_args='-Xmx2g' # [TEST SETTINGS -1] - resources: - mem_mb=gb(24), - runtime=hours(2) - shell: - 'export LC_COLLATE=C; ' - 'rm -f {output}.tmp ' - '&& picard ' - '{params.java_args} ' - 'CreateSequenceDictionary R={input} O={output}.tmp &> {log} ' - '&& grep "^@SQ" {output}.tmp ' - '''| awk '{{print $2, $3}}' ''' - '| sed "s/SN://g;s/ LN:/\\t/g" ' - '| sort -k1,1 > {output} ' - '&& rm -f {output}.tmp ' - - -rule genelist: - """Creates a list of unique gene names in the GTF - """ - input: - gtf='{references_dir}/{organism}/{tag}/annotation/{organism}_{tag}.gtf' - output: - protected('{references_dir}/{organism}/{tag}/annotation/{organism}_{tag}.genelist') - resources: - runtime=hours(1), - mem_mb=gb(2) - run: - attribute = conversion_kwargs[output[0]]['gene_id'] - import gffutils - genes = set() - for feature in gffutils.DataIterator(input.gtf): - genes.update(feature.attributes[attribute]) - with open(output[0], 'w') as fout: - for feature in sorted(list(set(genes))): - fout.write(feature + '\n') - - -rule mappings: - """ - Creates gzipped TSV mapping between attributes in the GTF. 
- """ - input: - gtf='{references_dir}/{organism}/{tag}/annotation/{organism}_{tag}.gtf' - output: - protected('{references_dir}/{organism}/{tag}/annotation/{organism}_{tag}.mapping.tsv.gz') - params: - include_featuretypes=lambda wildcards, output: conversion_kwargs[output[0]].get('include_featuretypes', []) - resources: - runtime=hours(2), - mem_mb=gb(2) - run: - import gffutils - - # Will want to change the setting back to what it was originally when - # we're done - orig_setting = gffutils.constants.always_return_list - gffutils.constants.always_return_list = False - - include_featuretypes = params.include_featuretypes - - res = [] - for f in gffutils.DataIterator(input[0]): - - ft = f.featuretype - - if include_featuretypes and (ft not in include_featuretypes): - continue - - d = dict(f.attributes) - d['__featuretype__'] = ft - res.append(d) - - df = pandas.DataFrame(res) - - # Depending on how many attributes there were and the - # include_featuretypes settings, this may take a while. - df = df.drop_duplicates() - - df.to_csv(output[0], sep='\t', index=False, compression='gzip') - - # Restore original setting - gffutils.constants.always_return_list = orig_setting - -# vim: ft=python diff --git a/workflows/references/config/config.yaml b/workflows/references/config/config.yaml deleted file mode 100644 index 49618dcd..00000000 --- a/workflows/references/config/config.yaml +++ /dev/null @@ -1,6 +0,0 @@ -references_dir: 'references_dir' - -# See the reference config files in the top level of the repo, -# include/reference_configs, for inspiration for more species. 
-include_references: - - '../../include/reference_configs/test.yaml' diff --git a/workflows/references/run_test.sh b/workflows/references/run_test.sh deleted file mode 100755 index 7aacb413..00000000 --- a/workflows/references/run_test.sh +++ /dev/null @@ -1,3 +0,0 @@ -set -e -python -m doctest ../../ci/preprocessor.py -python ../../ci/preprocessor.py Snakefile > Snakefile.test && snakemake -s Snakefile.test "$@" From 060c2f8c057c19c1644c406d0032b8971a17b9a0 Mon Sep 17 00:00:00 2001 From: Ryan Dale Date: Fri, 3 Jan 2025 19:22:37 +0000 Subject: [PATCH 025/196] add new references.smk --- lib/postprocess/merge.py | 32 ---- rules/references.smk | 322 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 322 insertions(+), 32 deletions(-) delete mode 100644 lib/postprocess/merge.py create mode 100644 rules/references.smk diff --git a/lib/postprocess/merge.py b/lib/postprocess/merge.py deleted file mode 100644 index c3d1686e..00000000 --- a/lib/postprocess/merge.py +++ /dev/null @@ -1,32 +0,0 @@ -import os -from snakemake.shell import shell -from ..imports import resolve_name - -def file_merge(origfns, newfn, *args): - tmpfiles = ['{0}.{1}.sub.tmp'.format(newfn, i) for i in range(len(origfns))] - try: - for origfn, tmpfile, ppfunc in zip(origfns, tmpfiles, args): - print(ppfunc) - func = resolve_name(ppfunc) - func(origfn, tmpfile) - - if os.path.exists(newfn): - shell('rm {newfn}') - - if newfn.endswith('.gz'): - fn = newfn.replace('.gz', '') - for tmpfile in tmpfiles: - shell("gunzip -c {tmpfile} >> {fn}") - shell("gzip {fn}") - else: - for tmpfile in tmpfiles: - shell("cat {tmpfile} >> {newfn}") - - except Exception as e: - raise e - - finally: - for i in tmpfiles: - if os.path.exists(i): - shell('rm {i}') - diff --git a/rules/references.smk b/rules/references.smk new file mode 100644 index 00000000..f4b104ff --- /dev/null +++ b/rules/references.smk @@ -0,0 +1,322 @@ +import os +import sys +import pandas + +HERE = str(Path(workflow.snakefile).parent) 
+sys.path.insert(0, HERE + "/../..") +from lib.utils import autobump, gb, hours +from lib import utils + +def default_postprocess(origfn, newfn): + shell("mv {origfn} {newfn}") + +rule fasta: + output: + temporary('references/genome.fa.gz') + run: + utils.download_and_postprocess( + urls=config['fasta']['url'], + postprocess=config['fasta'].get('postprocess', None), + outfile=output[0], + log=log + ) + + +rule gtf: + output: + temporary('references/annotation.gtf.gz') + run: + utils.download_and_postprocess( + urls=config['gtf']['url'], + postprocess=config['gtf'].get('postprocess', None), + outfile=output[0], + log=log + ) + + +rule rrna: + output: + temporary('references/rrna.fa.gz') + run: + utils.download_and_postprocess( + urls=config['rrna']['url'], + postprocess=config['rrna'].get('postprocess', None), + outfile=output[0], + log=log + ) + + +rule unzip: + input: + "references/{prefix}.gz" + output: + "references/{prefix}" + shell: 'gunzip -c {input} > {output}' + + +rule bowtie2_index: + input: + "references/{label}.fa", + output: + multiext( + "references/bowtie2/{label}", + ".1.bt2", + ".2.bt2", + ".3.bt2", + ".4.bt2", + ".rev.1.bt2", + ".rev.2.bt2", + ".fa", + ), + log: + "references/logs/bowtie2_{label}.log" + resources: + runtime=autobump(hours=8), + mem_mb=autobump(gb=32), + disk_mb=autobump(gb=50) + threads: + 8 + run: + index = os.path.commonprefix(output).rstrip(".") + shell( + "bowtie2-build" + " --threads {threads}" + " {input}" + " {index}" + " &> {log}" + ) + utils.make_relative_symlink(input[0], output[-1]) + + +rule star_index: + input: + fasta='references/genome.fa', + gtf='references/annotation.gtf', + output: + protected('references/star/Genome') + log: + 'references/logs/star.log' + threads: + 8 + resources: + runtime=autobump(hours=8), + mem_mb=gb(64) + run: + genomedir = os.path.dirname(output[0]) + shell('rm -r {genomedir}') + shell('mkdir -p {genomedir}') + shell( + 'STAR ' + '--runMode genomeGenerate ' + '--runThreadN {threads} ' + 
'--genomeDir {genomedir} ' + '--genomeFastaFiles {input.fasta} ' + + # NOTE: GTF is optional + '--sjdbGTFfile {input.gtf} ' + + # NOTE: STAR docs say that 100 should work well. + '--sjdbOverhang 100 ' + + # NOTE: for small genomes, may need to scale this down to + # min(14, log2(GenomeLength) / 2 - 1) + # --genomeSAindexNbases 14 + '&> {log}' + ) + # STAR writes a hard-coded Log.out file to the current working + # directory. So put that on the end of the log file for the rule and + # then clean up. + shell('cat {genomedir}/Log.out >> {log} && rm {genomedir}/Log.out') + shell("ln -s {input.fasta} {genomedir}") + +rule hisat2_index: + input: + "references/genome.fa", + output: + multiext( + "references/hisat2/genome", + ".1.ht2", + ".2.ht2", + ".3.ht2", + ".4.ht2", + ".5.ht2", + ".6.ht2", + ".7.ht2", + ".8.ht2", + ".fa", + ) + log: + "references/logs/hisat2.log" + resources: + runtime=autobump(hours=8), + mem_mb=autobump(gb=32), + disk_mb=autobump(gb=50) + threads: + 8 + run: + index = os.path.commonprefix(output).rstrip(".") + shell( + "hisat2-build" + " --threads {threads}" + " {input}" + " {index}" + " &> {log}" + ) + shell("ln -s {input} {output[-1]}") + + + +rule transcriptome_fasta: + input: + fasta='references/genome.fa', + gtf='references/annotation.gtf', + output: + 'references/transcriptome.fa' + resources: + runtime=hours(1) + shell: + 'gffread {input.gtf} -w {output} -g {input.fasta}' + + +rule salmon_index: + input: + 'references/transcriptome.fa' + output: + 'references/salmon/versionInfo.json' + log: + 'references/logs/salmon.log' + params: + outdir='references/salmon' + resources: + mem_mb=gb(32), + runtime=hours(2) + run: + outdir = os.path.dirname(output[0]) + shell( + 'salmon index ' + '--transcripts {input} ' + '--index {outdir} ' + '&> {log}' + ) + + +rule kallisto_index: + output: + 'references/kallisto/transcripts.idx', + input: + 'references/genome.fa' + log: + 'references/logs/kallisto.log' + resources: + runtime=hours(2), + mem_mb=gb(32), + 
shell: + 'kallisto index ' + '--index {output} ' + '{input} ' + '&> {log}' + + +rule conversion_refflat: + input: + 'references/annotation.gtf' + output: + protected('references/annotation.refflat') + log: + 'references/logs/annotation.refflat.log' + resources: + runtime=hours(2), + mem_mb=gb(2) + shell: + 'gtfToGenePred -ignoreGroupsWithoutExons {input} {output}.tmp ' + '''&& awk '{{print $1"\t"$0}}' {output}.tmp > {output} ''' + '&& rm {output}.tmp ' + + +rule conversion_bed12: + input: + 'references/annotation.gtf' + output: + protected('references/annotation.bed12') + resources: + runtime=hours(2), + mem_mb=gb(2) + shell: + 'gtfToGenePred -ignoreGroupsWithoutExons {input} {output}.tmp ' + '&& genePredToBed {output}.tmp {output} ' + '&& rm {output}.tmp' + + +rule chromsizes: + input: + 'references/genome.fa' + output: + protected('references/genome.chromsizes') + log: + 'references/logs/genome.chromsizes.log' + params: + # NOTE: Be careful with the memory here; make sure you have enough + # and/or it matches the resources you're requesting + java_args='-Xmx20g' + # java_args='-Xmx2g' # [TEST SETTINGS -1] + resources: + mem_mb=gb(24), + runtime=hours(2) + shell: + 'export LC_COLLATE=C; ' + 'rm -f {output}.tmp ' + '&& picard ' + '{params.java_args} ' + 'CreateSequenceDictionary R={input} O={output}.tmp &> {log} ' + '&& grep "^@SQ" {output}.tmp ' + '''| awk '{{print $2, $3}}' ''' + '| sed "s/SN://g;s/ LN:/\\t/g" ' + '| sort -k1,1 > {output} ' + '&& rm -f {output}.tmp ' + + +rule mappings: + """ + Creates gzipped TSV mapping between attributes in the GTF. 
+ """ + input: + gtf='references/annotation.gtf' + output: + protected('references/annotation.mapping.tsv.gz') + params: + include_featuretypes=lambda wildcards, output: conversion_kwargs[output[0]].get('include_featuretypes', []) + resources: + runtime=hours(2), + mem_mb=gb(2) + run: + import gffutils + + # Will want to change the setting back to what it was originally when + # we're done + orig_setting = gffutils.constants.always_return_list + gffutils.constants.always_return_list = False + + include_featuretypes = params.include_featuretypes + + res = [] + for f in gffutils.DataIterator(input[0]): + + ft = f.featuretype + + if include_featuretypes and (ft not in include_featuretypes): + continue + + d = dict(f.attributes) + d['__featuretype__'] = ft + res.append(d) + + df = pandas.DataFrame(res) + + # Depending on how many attributes there were and the + # include_featuretypes settings, this may take a while. + df = df.drop_duplicates() + + df.to_csv(output[0], sep='\t', index=False, compression='gzip') + + # Restore original setting + gffutils.constants.always_return_list = orig_setting From 8bb7398f5a4f3d6c5e1b06e933745011ac250521 Mon Sep 17 00:00:00 2001 From: Ryan Dale Date: Fri, 3 Jan 2025 19:24:00 +0000 Subject: [PATCH 026/196] simplify config --- workflows/rnaseq/config/config.yaml | 68 ++++++++--------------------- 1 file changed, 18 insertions(+), 50 deletions(-) diff --git a/workflows/rnaseq/config/config.yaml b/workflows/rnaseq/config/config.yaml index 7b0db18d..2cbd3d66 100644 --- a/workflows/rnaseq/config/config.yaml +++ b/workflows/rnaseq/config/config.yaml @@ -1,59 +1,27 @@ +fasta: + url: "https://raw.githubusercontent.com/lcdb/lcdb-test-data/master/data/seq/dm6.small.fa" + postprocess: 'lib.utils.gzipped' + +gtf: + url: "https://raw.githubusercontent.com/lcdb/lcdb-test-data/master/data/annotation/dm6.small.gtf" + postprocess: 'lib.utils.gzipped' + +rrna: + url: + - 
'https://www.arb-silva.de/fileadmin/silva_databases/release_128/Exports/SILVA_128_LSURef_tax_silva_trunc.fasta.gz' + - 'https://www.arb-silva.de/fileadmin/silva_databases/release_128/Exports/SILVA_128_SSURef_Nr99_tax_silva_trunc.fasta.gz' + postprocess: + function: 'lib.utils.filter_fastas' + args: 'Drosophila melanogaster' + + sampletable: 'config/sampletable.tsv' patterns: 'config/rnaseq_patterns.yaml' -# Which key in the `references` dict below to use -organism: 'dmel' - -# If not specified here, use the environment variable REFERENCES_DIR. -references_dir: 'references_data' - # See https://rnabio.org/module-09-appendix/0009/12/01/StrandSettings for more info. stranded: 'fr-firststrand' # for dUTP libraries # 'fr-secondstrand' # for ligation libraries # 'unstranded' # for libraries without strand specificity -aligner: - index: 'star' - tag: 'test' - -rrna: - index: 'bowtie2' - tag: 'rRNA' - -gtf: - tag: "test" - -salmon: - tag: "test" - -kallisto: - tag: "test" - -fastq_screen: - - label: rRNA - organism: dmel - tag: test - - label: Fly - organism: dmel - tag: test - -merged_bigwigs: - control_pos: - pos: - - sample1 - - sample2 - treatment_all: - pos: - - sample3 - - sample4 - neg: - - sample3 - - sample4 - -# See the reference config files in the top level of the repo, -# include/reference_configs, for inspiration for more species. 
- -include_references: - - '../../include/reference_configs/test.yaml' - - '../../include/reference_configs/Drosophila_melanogaster.yaml' +aligner: 'star' From 79081fdb041ebb462e4b535abdc9c9da0bed3050 Mon Sep 17 00:00:00 2001 From: Ryan Dale Date: Fri, 3 Jan 2025 19:24:11 +0000 Subject: [PATCH 027/196] utils, common, and helpers are all now in utils --- lib/utils.py | 897 +++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 878 insertions(+), 19 deletions(-) diff --git a/lib/utils.py b/lib/utils.py index 3c280890..fd8c4dba 100644 --- a/lib/utils.py +++ b/lib/utils.py @@ -1,9 +1,47 @@ -import os -import contextlib +import binascii import collections +import contextlib +import gzip +import os +import re +import subprocess +import warnings from collections.abc import Iterable +from itertools import product + +import pandas +import pandas as pd +import yaml +from Bio import SeqIO +from snakemake.io import expand, regex_from_filepattern from snakemake.shell import shell +# Small helper functions + + +def resolve_name(name): + """ + Imports a specific object from a dotted path and returns just that object. 
+ + From nose.utils.resolve_name (with the logging parts taken out) which in + turn is from unittest.TestLoader.loadTestByName + """ + parts = name.split(".") + parts_copy = parts[:] + while parts_copy: + try: + module = __import__(".".join(parts_copy)) + break + except ImportError: + del parts_copy[-1] + if not parts_copy: + raise + parts = parts[1:] + obj = module + for part in parts: + obj = getattr(obj, part) + return obj + @contextlib.contextmanager def temp_env(env): @@ -52,22 +90,19 @@ def gen(): def test_flatten(): - assert ( - sorted( - flatten( - { - "a": { - "b": { - "c": ["a", "b", "c"], - }, + assert sorted( + flatten( + { + "a": { + "b": { + "c": ["a", "b", "c"], }, - "x": ["e", "f", "g"], - "y": {"z": "d"}, - } - ) + }, + "x": ["e", "f", "g"], + "y": {"z": "d"}, + } ) - == ["a", "b", "c", "d", "e", "f", "g"] - ) + ) == ["a", "b", "c", "d", "e", "f", "g"] assert flatten("a", True) == "a" assert flatten(["a"], True) == "a" @@ -171,7 +206,7 @@ def boolean_labels(names, idx, mapping={True: "AND", False: "NOT"}, strip="AND_" a_AND_b_AND_c_NOT_d_AND_e """ s = [] - for i, (n, x) in enumerate(zip(names, idx)): + for n, x in zip(names, idx): s.append(mapping[x] + "_" + n) s = "_".join(s) if s.startswith(strip): @@ -191,7 +226,188 @@ def make_relative_symlink(target, linkname): linkbase = os.path.basename(linkname) if not os.path.exists(linkdir): shell("mkdir -p {linkdir}") - shell("cd {linkdir}; ln -sf {relative_target} {linkbase}") + shell(f"cd {linkdir}; ln -sf {relative_target} {linkbase}") + + +def extract_wildcards(pattern, target): + """ + Return a dictionary of wildcards and values identified from `target`. + + Returns None if the regex match failed. + + Parameters + ---------- + pattern : str + Snakemake-style filename pattern, e.g. ``{output}/{sample}.bam``. + + target : str + Filename from which to extract wildcards, e.g., ``data/a.bam``. 
+ + Examples + -------- + >>> pattern = '{output}/{sample}.bam' + >>> target = 'data/a.bam' + >>> expected = {'output': 'data', 'sample': 'a'} + >>> assert extract_wildcards(pattern, target) == expected + >>> assert extract_wildcards(pattern, 'asdf') is None + """ + m = re.compile(regex_from_filepattern(pattern)).match(target) + if m: + return m.groupdict() + + +def _is_gzipped(fn): + """ + Filename-independent method of checking if a file is gzipped or not. Uses + the magic number. + + xref https://stackoverflow.com/a/47080739 + """ + with open(fn, "rb") as f: + return binascii.hexlify(f.read(2)) == b"1f8b" + + +def openfile(tmp, mode): + """ + Returns an open file handle; auto-detects gzipped files. + """ + if _is_gzipped(tmp): + return gzip.open(tmp, mode) + else: + return open(tmp, mode) + + +def gzipped(tmpfiles, outfile): + """ + Cat-and-gzip a list of uncompressed files into a compressed output file. + """ + with gzip.open(outfile, "wt") as fout: + for f in tmpfiles: + with open(f) as infile: + for line in infile: + fout.write(line) + + +def cat(tmpfiles, outfile): + """ + Simple concatenation of files. + + Note that gzipped files can be concatenated as-is without un- and re- + compressing. + """ + shell(f"cat {tmpfiles} > {outfile}") + + +def is_paired_end(sampletable, sample): + """ + Inspects the sampletable to see if the sample is paired-end or not + + Parameters + ---------- + sampletable : pandas.DataFrame + Contains a "layout" or "LibraryLayout" column (but not both). If the + lowercase value is "pe" or "paired", consider the sample paired-end. + Otherwise consider single-end. + + sample : str + Assumed to be found in the first column of `sampletable` + """ + # We can't fall back to detecting PE based on two fastq files provided for + # each sample when it's an SRA sampletable (which only has SRR accessions). 
+ # + # So detect first detect if SRA sampletable based on presence of "Run" + # column and all values of that column starting with "SRR", and then raise + # an error if the Layout column does not exist. + + if "Run" in sampletable.columns: + if all(sampletable["Run"].str.startswith("SRR")): + if ( + "Layout" not in sampletable.columns + and "layout" not in sampletable.columns + ): + raise ValueError( + "Sampletable appears to be SRA, but no 'Layout' column " + "found. This is required to specify single- or paired-end " + "libraries." + ) + + row = sampletable.set_index(sampletable.columns[0]).loc[sample] + if "orig_filename_R2" in row: + return True + if "layout" in row and "LibraryLayout" in row: + raise ValueError("Expecting column 'layout' or 'LibraryLayout', " "not both") + try: + return row["layout"].lower() in ["pe", "paired"] + except KeyError: + pass + try: + return row["LibraryLayout"].lower() in ["pe", "paired"] + except KeyError: + pass + return False + + +def fill_r1_r2(sampletable, pattern, r1_only=False): + """ + Returns a function intended to be used as a rule's input function. + + The returned function, when provided with wildcards, will return one or two + rendered versions of a pattern depending on SE or PE respectively. + Specifically, given a pattern (which is expected to contain a placeholder + for "{sample}" and "{n}"), look up in the sampletable whether or not it is + paired-end. + + Parameters + ---------- + + sampletable : pandas.DataFrame + Contains a "layout" column with either "SE" or "PE", or "LibraryLayout" + column with "SINGLE" or "PAIRED". If column does not exist, assume SE. + + pattern : str + Must contain at least a "{sample}" placeholder. + + r1_only : bool + If True, then only return the file for R1 even if PE is configured. 
+ """ + + def func(wc): + try: + wc.sample + except AttributeError: + raise ValueError( + 'Need "{{sample}}" in pattern ' '"{pattern}"'.format(pattern=pattern) + ) + n = [1] + if is_paired_end(sampletable, wc.sample) and not r1_only: + n = [1, 2] + res = expand(pattern, sample=wc.sample, n=n) + return res + + return func + + +def pluck(obj, kv): + """ + For a given dict or list that somewhere contains keys `kv`, return the + values of those keys. + + Named after the dplyr::pluck, and implemented based on + https://stackoverflow.com/a/1987195 + """ + if isinstance(obj, list): + for i in obj: + for x in pluck(i, kv): + yield x + elif isinstance(obj, dict): + if kv in obj: + yield obj[kv] + for j in obj.values(): + for x in pluck(j, kv): + yield x + + +# Functions for conveniently working with resources def autobump(*args, **kwargs): @@ -308,7 +524,7 @@ def autobump(*args, **kwargs): raise ValueError(f"Unhandled args and kwargs: {args}, {kwargs}") def f(wildcards, attempt): - return baseline_converted + (attempt - 1) * increment_converted + return baseline_converted + (attempt - 1) * increment_converted return f @@ -319,3 +535,646 @@ def gb(size_in_gb): def hours(time_in_hours): return time_in_hours * 60 + + +# Config parsing and handling + + +class ConfigurationError(Exception): + pass + + +def detect_layout(sampletable): + """ + Identifies whether a sampletable represents single-end or paired-end reads. + + Raises NotImplementedError if there's a mixture. + """ + is_pe = [is_paired_end(sampletable, s) for s in sampletable.iloc[:, 0]] + if all(is_pe): + return "PE" + elif not any(is_pe): + return "SE" + else: + p = sampletable.iloc[is_pe, 0].to_list() + s = sampletable.iloc[[not i for i in is_pe], 0].to_list() + if len(p) > len(s): + report = f"SE samples: {s}" + else: + report = f"PE samples: {p}" + raise ValueError(f"Only a single layout (SE or PE) is supported. 
{report}") + + +def fill_patterns(patterns, fill, combination=product): + """ + Fills in a dictionary of patterns with the dictionary `fill`. + + >>> patterns = dict(a='{sample}_R{N}.fastq') + >>> fill = dict(sample=['one', 'two', 'three'], N=[1, 2]) + >>> sorted(fill_patterns(patterns, fill)['a']) + ['one_R1.fastq', 'one_R2.fastq', 'three_R1.fastq', 'three_R2.fastq', 'two_R1.fastq', 'two_R2.fastq'] + + If using `zip` as a combination, checks to ensure all values in `fill` are + the same length to avoid truncated output. + + This fails: + + >>> patterns = dict(a='{sample}_R{N}.fastq') + >>> fill = dict(sample=['one', 'two', 'three'], N=[1, 2]) + >>> sorted(fill_patterns(patterns, fill, zip)['a']) # doctest: +IGNORE_EXCEPTION_DETAIL + Traceback (most recent call last): + ... + ValueError: {'sample': ['one', 'two', 'three'], 'N': [1, 2]} does not have the same number of entries for each key + + But this works: + + >>> patterns = dict(a='{sample}_R{N}.fastq') + >>> fill = dict(sample=['one', 'one', 'two', 'two', 'three', 'three'], N=[1, 2, 1, 2, 1, 2]) + >>> sorted(fill_patterns(patterns, fill, zip)['a']) + ['one_R1.fastq', 'one_R2.fastq', 'three_R1.fastq', 'three_R2.fastq', 'two_R1.fastq', 'two_R2.fastq'] + + """ + # In recent Snakemake versions (e.g., this happens in 5.4.5) file patterns + # with no wildcards in them are removed from expand when `zip` is used as + # the combination function. 
+ # + # For example, in 5.4.5: + # + # expand('x', zip, d=[1,2,3]) == [] + # + # But in 4.4.0: + # + # expand('x', zip, d=[1,2,3]) == ['x', 'x', 'x'] + + if combination == zip: + lengths = set([len(v) for v in fill.values()]) + if len(lengths) != 1: + raise ValueError( + f"{fill} does not have the same number of entries for each key" + ) + + def update(d, u, c): + for k, v in u.items(): + if isinstance(v, collections.abc.Mapping): + r = update(d.get(k, {}), v, c) + d[k] = r + else: # not a dictionary, so we're at a leaf + if isinstance(fill, pd.DataFrame): + d[k] = list(set(expand(u[k], zip, **fill.to_dict("list")))) + else: + d[k] = list(set(expand(u[k], c, **fill))) + if not d[k]: + d[k] = [u[k]] + return d + + d = {} + return update(d, patterns, combination) + + +def rscript(string, scriptname, log=None): + """ + Saves the string as `scriptname` and then runs it + + Parameters + ---------- + string : str + Filled-in template to be written as R script + + scriptname : str + File to save script to + + log : str + File to redirect stdout and stderr to. If None, no redirection occurs. 
+ """ + with open(scriptname, "w") as fout: + fout.write(string) + if log: + _log = "> {0} 2>&1".format(log) + else: + _log = "" + shell("Rscript {scriptname} {_log}") + + +def check_unique_fn(df): + """ + Raises an error if the fastq filenames are not unique + """ + fns = df["orig_filename"] + if "orig_filename_R2" in df.columns: + fns = pd.concat([fns, df["orig_filename_R2"]]) + if len(fns.unique()) < len(fns): + raise ValueError("Fastq filenames non unique, check the sampletable\n") + + +def check_unique_samplename(df): + """ + Raises an error if the samplenames are not unique + """ + ns = df.index + if len(ns.unique()) < len(ns): + raise ConfigurationError("Samplenames non unique, check the sampletable\n") + + +def preflight(config): + """ + Performs verifications on config and sampletable files + + Parameters + ---------- + config: yaml config object + """ + sampletable = pd.read_table(config["sampletable"], index_col=0, comment="#") + check_unique_samplename(sampletable) + if "orig_filename" in sampletable.columns: + check_unique_fn(sampletable) + + +def rnaseq_preflight(c): + pass + + +def chipseq_preflight(c): + pass + + +def strand_arg_lookup(config, lookup): + """ + Given a config object and lookup dictionary, confirm that the config has + correctly specified strandedness and then return the value for that key. + """ + if not config.stranded: + raise ConfigurationError( + "Starting in v1.8, 'stranded' is required in the config file. " + "Values can be 'unstranded', 'fr-firststrand' (R1 aligns antisense to original transcript), " + "or 'fr-secondstrand' (R1 aligns sense to original transcript). If you are not sure, " + "run the workflow with only the 'strand_check' rule, like " + "'snakemake -j 5 strand_check'." 
+    )
+    if config.stranded not in lookup:
+        keys = list(lookup.keys())
+        raise KeyError(f"'{config.stranded}' not one of {keys}")
+    return lookup[config.stranded]
+
+
+def filter_fastas(tmpfiles, outfile, pattern):
+    """
+    Extract records from fasta file(s) given a search pattern.
+
+    Given input gzipped FASTAs, create a new gzipped fasta containing only
+    records whose description matches `pattern`.
+
+    Parameters
+    ----------
+    tmpfiles : list
+        gzipped fasta files to look through
+
+    outfile : str
+        gzipped output fasta file
+
+    pattern : str
+        Look for this string in each record's description
+
+    """
+
+    def gen():
+        for tmp in tmpfiles:
+            handle = gzip.open(tmp, "rt")
+            parser = SeqIO.parse(handle, "fasta")
+            for rec in parser:
+                if pattern not in rec.description:
+                    continue
+                rec.seq = rec.seq.back_transcribe()
+                rec.description = rec.name
+                yield rec
+
+    with gzip.open(outfile, "wt") as fout:
+        SeqIO.write(gen(), fout, "fasta")
+
+
+def twobit_to_fasta(tmpfiles, outfile):
+    """
+    Converts .2bit files to fasta.
+
+    Parameters
+    ----------
+    tmpfiles : list
+        2bit files to convert
+
+    outfile : str
+        gzipped output fasta file
+    """
+    # Note that twoBitToFa doesn't support multiple input files, but we want to
+    # support them with this function
+    lookup = {i: i + ".fa" for i in tmpfiles}
+    for i in tmpfiles:
+        fn = lookup[i]
+        shell("twoBitToFa {i} {fn}")
+
+    # Make sure we retain the order of the originally-provided files from the
+    # config when concatenating.
+    fastas = [lookup[i] for i in tmpfiles]
+    shell("cat {fastas} | gzip -c > {outfile}")
+    shell("rm {fastas}")
+
+
+def download_and_postprocess(urls, postprocess, outfile, log):
+    """
+    Many reference files cannot be used as-is and need to be modified.
+
+    This function supports providing one or more URLs, and any postprocess
+    functions to get the reference files usable.
+
+    Parameters
+    ----------
+    urls : str or list
+        URL(s) to download. Can be a list, in which case they will be concatenated.
+ + postprocess : str | dict | list | None + Postprocessing config. See below for details. + + outfile : str + Output filename to save final output. Expected to be gzipped. + + log : str + Log filename that will accumulate all logs + + Notes + ----- + + This function: + + - downloads the URL[s] to tempfile[s] + - resolves the name of the postprocessing function(s) if provided and + imports it + - calls the imported postprocessing function using the tempfile[s] and + outfile plus any additional specified arguments. + + The postprocessing function must have one of the following signatures, + where `infiles` contains the list of temporary files downloaded from the + URL or URLs specified, and `outfile` is a gzipped file expected to be + created by the function:: + + def func(infiles, outfile): + pass + + or:: + + def func(infiles, outfile, *args): + pass + + or:: + + def func(infiles, outfile, *args, **kwargs): + pass + + + The function is specified as a string that resolves to an importable + function, e.g., `postprocess: lib.postprocess.dm6.fix` will call a function + called `fix` in the file `lib/postprocess/dm6.py`. + + If the contents of `postprocess:` is a dict, it must have at least the key + `function`, and optionally `args` and/or `kwargs` keys. The `function` key + indicates the importable path to the function. `args` can be a string + or list of arguments that will be provided as additional args to a function + with the second kind of signature above. If `kwargs` is provided, it is + a dict that is passed to the function with the third kind of signature + above. 
For example:: + + postprocess: + function: lib.postprocess.dm6.fix + args: + - True + - 3 + + or:: + + postprocess: + function: lib.postprocess.dm6.fix + args: + - True + - 3 + kwargs: + skip: exon + + """ + + def default_postprocess(origfn, newfn): + shell("mv {origfn} {newfn}") + + if not isinstance(postprocess, list): + postprocess = [postprocess] + + # Will contain tuples of (func, args, kwargs, tmp_outfile) + funcs = [] + + # It is possible to chain multiple postprocessing functions together by + # providing them as a list. + # + # postprocess = [ + # + # "lib.func1", + # + # { + # "function": "lib.func2", + # "args": (True, True), + # }, + # + # { + # "function": "lib.func3", + # "args": (1, 2), + # "kwargs": {"gzipped": True), + # }, + # + # ] + # + for i, postprocess_i in enumerate(postprocess): + + if postprocess_i is None: + func = default_postprocess + args = () + kwargs = {} + name = None + + # postprocess can have a single string value indicating the function or + # it can be a dict with keys "function" and optionally "args". The value of + # "args" can be a string or a list. + else: + if isinstance(postprocess_i, dict): + name = postprocess_i.get("function", postprocess) + args = postprocess_i.get("args", ()) + kwargs = postprocess_i.get("kwargs", {}) + if isinstance(args, str): + args = (args,) + elif isinstance(postprocess_i, str): + name = postprocess_i + args = () + kwargs = {} + + else: + raise ValueError( + f"Unhandled type of postprocessing configuration: {postprocess_i}" + ) + + # In the special case where there is kwarg beginning and ending + # with "__", this can be a dotted function name so it will be + # resolved here as well and passed along to the postprocessing + # function. + # + # This makes it possible to do things like add ERCC annotations on + # the end of other annotations that themselves need to be + # post-processed. 
+ for kw in kwargs: + if kw.startswith("__") and kw.endswith("__"): + kwargs[kw] = resolve_name(kwargs[kw]) + + # import the function + func = resolve_name(name) + + tmp_outfile = f"{outfile}.{i}.{name}.tmp" + funcs.append([func, args, kwargs, tmp_outfile]) + + # The last func's outfile should be the final outfile + funcs[-1][-1] = outfile + + # as described in the docstring above, functions are to assume a list of + # urls + if isinstance(urls, str): + urls = [urls] + + # Download into reasonably-named temp filenames + downloaded_tmpfiles = [f"{outfile}.{i}.tmp" for i in range(len(urls))] + + # For the first postprocess, its input will be all the downloaded files. + postprocess_input = downloaded_tmpfiles + try: + # Copy (if local URI) or download into the specified temp files + for url, tmpfile in zip(urls, downloaded_tmpfiles): + if url.startswith("file:"): + url = url.replace("file://", "") + shell("cp {url} {tmpfile} 2> {log}") + else: + shell("wget {url} -O- > {tmpfile} 2> {log}") + + for func, args, kwargs, tmp_outfile in funcs: + func( + # all downloaded files (if the first postprocess), or the + # output of the last postprocess + postprocess_input, + # the temp output for just this postprocess + tmp_outfile, + *args, + **kwargs, + ) + + # We want the next postprocess to use the output of what we just + # ran; as documented above the input files are expected to be in + # a list. + postprocess_input = [tmp_outfile] + + except Exception as e: + raise e + finally: + to_delete = downloaded_tmpfiles + + # all but the last postprocess func output (the last one is the final + # output that we want to keep!) + to_delete += [i[-1] for i in funcs[:-1]] + + for i in to_delete: + if os.path.exists(i): + shell("rm {i}") + if not _is_gzipped(outfile): + raise ValueError(f"{outfile} does not appear to be gzipped.") + + +def get_sampletable(config): + """ + Return samples and pandas.DataFrame of parsed sampletable. 
+ + Returns the sample IDs and the parsed sampletable from the file specified + in the config. + + The sample IDs are assumed to be the first column of the sampletable. + + Parameters + ---------- + config : dict + """ + sampletable = pandas.read_csv(config["sampletable"], comment="#", sep="\t") + samples = sampletable.iloc[:, 0] + return samples, sampletable + + +def get_techreps(sampletable, label): + """ + Return all sample IDs for which the "label" column is `label`. + """ + # since we're not requiring a name but we want to use `loc` + first_col = sampletable.columns[0] + result = list(sampletable.loc[sampletable["label"] == label, first_col]) + + # If we're using a ChIP-seq-like sampletable we can provide a more + # informative error message. + + is_chipseq = "antibody" in sampletable.columns + if is_chipseq: + err = """ + No technical replicates found for label '{}'. Check the ChIP-seq config + file to ensure the peak-calling section only specifies values from the + sampletable's "label" column.""".format( + label + ) + else: + err = "No technical replicates found for label '{}'.".format(label) + + if len(result) == 0: + raise ValueError(err) + + return result + + +def deprecation_handler(config): + """ + Checks the config to see if anything has been deprecated. + + Also makes any fixes that can be done automatically. + """ + if "assembly" in config: + config["organism"] = config["assembly"] + warnings.warn( + "'assembly' should be replaced with 'organism' in config files. 
" + "As a temporary measure, a new 'organism' key has been added with " + "the value of 'assembly'", + DeprecationWarning, + ) + + for org, block1 in config.get("references", {}).items(): + for tag, block2 in block1.items(): + gtf_conversions = block2.get("gtf", {}).get("conversions", []) + for c in gtf_conversions: + if isinstance(c, dict) and "annotation_hub" in c: + warnings.warn( + "You may want to try the 'mappings' conversion rather " + "than 'annotation_hub' since it works directly off " + "the GTF file rather than assuming concordance between " + "GTF and AnnoationHub instances", + DeprecationWarning, + ) + + return config + + +def check_url(url, verbose=False): + """ + Try to open -- and then immediately close -- a URL. + + Any exceptions can be handled upstream. + + """ + + # Some notes here: + # + # - A pure python implementation isn't great because urlopen seems to + # cache or hold sessions open or something. EBI servers reject responses + # because too many clients are connected. This doesn't happen using curl. + # + # - Using the requests module doesn't help, because urls can be ftp:// and + # requests doesn't support that. + # + # - Similarly, using asyncio and aiohttp works great for https, but not + # ftp (I couldn't get aioftp to work properly). + # + # - Not all servers support --head. An example of this is + # https://www-s.nist.gov/srmors/certificates/documents/SRM2374_Sequence_v1.FASTA. + # + # - Piping curl to head using the -c arg to use bytes seems to work. + # However, we need to set pipefail (otherwise because head exits 0 the + # whole thing exits 0). And in that case, we expect curl to exit every + # time with exit code 23, which is "failed to write output", because of + # the broken pipe. This is handled below. 
+ # + if verbose: + print(f"Checking {url}") + + # Notes on curl args: + # + # --max-time to allow the server some seconds to respond + # --retry to allow multiple tries if transient errors (4xx for FTP, 5xx for HTTP) are found + # --silent to not print anything + # --fail to return non-zero exit codes for 404 (default is exit 0 on hitting 404) + # + # Need to run through bash explicitly to get the pipefail option, which in + # turn means running with shell=True + proc = subprocess.run( + f'/bin/bash -o pipefail -c "curl --retry 3 --max-time 10 --silent --fail {url} | head -c 10 > /dev/null"', + shell=True, + ) + return proc + + +def check_urls(config, verbose=False): + """ + Given a config filename or existing object, extract the URLs and check + them. + + Parameters + ---------- + + config : str or dict + Config object to inspect + + verbose : bool + Print which URL is being checked + + wait : int + Number of seconds to wait in between checking URLs, to avoid + too-many-connection issues + """ + failures = [] + urls = list(set(utils.flatten(pluck(config, "url")))) + for url in urls: + if url.startswith("file://"): + continue + + res = check_url(url, verbose=verbose) + + # we expect exit code 23 because we're triggering SIGPIPE with the + # "|head -c" above. + if res.returncode and res.returncode != 23: + failures.append( + f"FAIL with exit code {res.returncode}. Command was: {res.args}" + ) + if failures: + output = "\n ".join(failures) + raise ValueError( + f"Found problematic URLs. See https://ec.haxx.se/usingcurl/usingcurl-returns for explanation of exit codes.\n {output}" + ) + + +def check_all_urls_found(verbose=True): + """ + Recursively loads all references that can be included and checks them. + Reports out if there are any failures. 
+ """ + check_urls( + { + "include_references": [ + "include/reference_configs", + "test/test_configs", + "workflows/rnaseq/config", + "workflows/chipseq/config", + "workflows/references/config", + ] + }, + verbose=verbose, + ) + + +def gff2gtf(gff, gtf): + """ + Converts a gff file to a gtf format using the gffread function from Cufflinks + """ + if _is_gzipped(gff[0]): + shell("gzip -d -S .gz.0.tmp {gff} -c | gffread - -T -o- | gzip -c > {gtf}") + else: + shell("gffread {gff} -T -o- | gzip -c > {gtf}") From 8337b98654604d38bbbeae64b5e381cc267aa6a8 Mon Sep 17 00:00:00 2001 From: Ryan Dale Date: Fri, 3 Jan 2025 19:24:34 +0000 Subject: [PATCH 028/196] cleanup patterns_targets --- lib/patterns_targets.py | 32 ++++++++++++-------------------- 1 file changed, 12 insertions(+), 20 deletions(-) diff --git a/lib/patterns_targets.py b/lib/patterns_targets.py index ec62d513..08fedb26 100644 --- a/lib/patterns_targets.py +++ b/lib/patterns_targets.py @@ -6,9 +6,8 @@ import os import collections import yaml -from . import common +from . import utils from . import chipseq -from . import helpers from snakemake.io import expand HERE = os.path.abspath(os.path.dirname(__file__)) @@ -53,11 +52,7 @@ def __init__(self, config, patterns, workdir=None): patterns = os.path.join(workdir, patterns) self.workdir = workdir - if isinstance(config, str): - self.path = config - - self.config = common.load_config( - common.resolve_config(config, workdir)) + self.config = config stranded = self.config.get('stranded', None) self.stranded = None @@ -71,12 +66,9 @@ def __init__(self, config, patterns, workdir=None): # Read the config file and extract all sort of useful bits. This mostly # uses the `common` module to handle the details. 
- self.config['references_dir'] = common.get_references_dir(self.config) - self.samples, self.sampletable = common.get_sampletable(self.config) - self.refdict, self.conversion_kwargs = common.references_dict(self.config) - self.organism = self.config['organism'] + self.samples, self.sampletable = utils.get_sampletable(self.config) self.patterns = yaml.load(open(patterns), Loader=yaml.FullLoader) - self.is_paired = helpers.detect_layout(self.sampletable) == 'PE' + self.is_paired = utils.detect_layout(self.sampletable) == 'PE' if self.is_paired: self.n = [1, 2] else: @@ -86,7 +78,7 @@ def __init__(self, config, patterns, workdir=None): else: self.is_sra = False - helpers.preflight(self.config) + ##########################utils.preflight(self.config) class RNASeqConfig(SeqConfig): def __init__(self, config, patterns, workdir=None): @@ -112,7 +104,7 @@ def __init__(self, config, patterns, workdir=None): self.fill = dict(sample=self.samples, n=self.n) self.patterns_by_aggregation = self.patterns.pop('patterns_by_aggregate', None) - self.targets = helpers.fill_patterns(self.patterns, self.fill) + self.targets = utils.fill_patterns(self.patterns, self.fill) # If the sampletable is from an sra metadata table, then we need to set the value of # 'orig_filename' for each of the samples to where the fastq was downloaded @@ -126,14 +118,14 @@ def __init__(self, config, patterns, workdir=None): self.fill_by_aggregation = dict( merged_bigwig_label=self.config['merged_bigwigs'].keys(), ) - self.targets_by_aggregation = helpers.fill_patterns( + self.targets_by_aggregation = utils.fill_patterns( self.patterns_by_aggregation, self.fill_by_aggregation ) self.targets.update(self.targets_by_aggregation) self.patterns.update(self.patterns_by_aggregation) - helpers.rnaseq_preflight(self) + #########################utils.rnaseq_preflight(self) class ChIPSeqConfig(SeqConfig): @@ -179,7 +171,7 @@ def __init__(self, config, patterns, workdir=None): ip_label=self.sampletable.label[ 
self.sampletable.antibody != 'input'].values ) - self.targets_by_sample = helpers.fill_patterns( + self.targets_by_sample = utils.fill_patterns( self.patterns_by_sample, self.fill_by_sample) self.targets.update(self.targets_by_sample) @@ -191,7 +183,7 @@ def __init__(self, config, patterns, workdir=None): self.fill_by_aggregation = dict( merged_bigwig_label=self.config['merged_bigwigs'].keys(), ) - self.targets_by_aggregation = helpers.fill_patterns( + self.targets_by_aggregation = utils.fill_patterns( self.patterns_by_aggregation, self.fill_by_aggregation ) @@ -254,11 +246,11 @@ def __init__(self, config, patterns, workdir=None): # targets as they're built. update_recursive( self.targets_for_peaks, - helpers.fill_patterns(_peak_patterns, _fill) + utils.fill_patterns(_peak_patterns, _fill) ) self.targets.update(self.targets_for_peaks) self.patterns.update(self.patterns_by_peaks) - helpers.chipseq_preflight(self) + utils.chipseq_preflight(self) From e8d16df366a3d2549e35122dcdcc6b303aa12aea Mon Sep 17 00:00:00 2001 From: Ryan Dale Date: Fri, 3 Jan 2025 19:24:57 +0000 Subject: [PATCH 029/196] rnaseq workflow --- workflows/rnaseq/Snakefile | 163 +++++++++++++++---------------------- 1 file changed, 66 insertions(+), 97 deletions(-) diff --git a/workflows/rnaseq/Snakefile b/workflows/rnaseq/Snakefile index 47c2a324..634f1d91 100644 --- a/workflows/rnaseq/Snakefile +++ b/workflows/rnaseq/Snakefile @@ -8,26 +8,17 @@ import pandas as pd HERE = str(Path(workflow.snakefile).parent) sys.path.insert(0, HERE + "/../..") -from lib import common, utils, helpers, aligners +from lib import utils from lib.utils import autobump, gb, hours from lib.patterns_targets import RNASeqConfig -# ---------------------------------------------------------------------------- -# -# Search for the string "NOTE:" to look for points of configuration that might -# be helpful for your experiment. 
-# -# ---------------------------------------------------------------------------- - -if not workflow.overwrite_configfiles: - configfile: 'config/config.yaml' +configfile: 'config/config.yaml' -config = common.load_config(config) +include: '../../rules/references.smk' -include: '../references/Snakefile' # Verify configuration of config and sampletable files -helpers.preflight(config) +################utils.preflight(config) c = RNASeqConfig(config, config.get('patterns', 'config/rnaseq_patterns.yaml')) @@ -47,19 +38,18 @@ def wrapper_for(path): # See "patterns and targets" in the documentation for what's going on here. final_targets = utils.flatten(( - utils.flatten(c.targets['fastqc']), - [c.targets['fastq_screen']], - [c.targets['rrna_percentages_table']], - [c.targets['multiqc']], - utils.flatten(c.targets['featurecounts']), - utils.flatten(c.targets['markduplicates']), - utils.flatten(c.targets['salmon']), - utils.flatten(c.targets['kallisto']), - utils.flatten(c.targets['preseq']), - utils.flatten(c.targets['rseqc']), - utils.flatten(c.targets['collectrnaseqmetrics']), - utils.flatten(c.targets['bigwig']), - utils.flatten(c.targets['samtools']), + c.targets['fastqc'], + c.targets['rrna_percentages_table'], + c.targets['multiqc'], + c.targets['featurecounts'], + c.targets['markduplicates'], + c.targets['salmon'], + c.targets['kallisto'], + c.targets['preseq'], + c.targets['rseqc'], + c.targets['collectrnaseqmetrics'], + c.targets['bigwig'], + c.targets['samtools'], )) if config.get('merged_bigwigs', None): @@ -146,9 +136,9 @@ config.setdefault('strand_check_reads', 1e5) rule sample_strand_check: input: - fastq=common.fill_r1_r2(c.sampletable, c.patterns['fastq']), - index=[c.refdict[c.organism][config['aligner']['tag']]['bowtie2']], - bed12=c.refdict[c.organism][config['gtf']['tag']]['bed12'] + fastq=utils.fill_r1_r2(c.sampletable, c.patterns['fastq']), + index=rules.bowtie2_index.output, + bed12=rules.conversion_bed12.output, output: 
strandedness=c.patterns['strand_check']['tsv'], bam=temporary(c.patterns['strand_check']['bam']), @@ -276,14 +266,14 @@ rule fastqc: wrapper_for('fastqc/wrapper.py') -if config['aligner']['index'] == 'hisat2': +if config['aligner'] == 'hisat2': rule hisat2: """ Map reads with HISAT2 """ input: - fastq=common.fill_r1_r2(c.sampletable, c.patterns['cutadapt']), - index=[c.refdict[c.organism][config['aligner']['tag']]['hisat2']] + fastq=utils.fill_r1_r2(c.sampletable, c.patterns['cutadapt']), + index=rules.hisat2_index.output, output: bam=temporary(c.patterns['bam']) log: @@ -293,9 +283,11 @@ if config['aligner']['index'] == 'hisat2': mem_mb=gb(32), runtime=autobump(hours=8) run: - prefix = aligners.prefix_from_bowtie2_index(input.index) + + prefix = os.path.commonprefix(input.index).rstrip(".") sam = output.bam.replace('.bam', '.sam') + if c.is_paired: assert len(input.fastq) == 2 fastqs = '-1 {0} -2 {1} '.format(*input.fastq) @@ -319,7 +311,7 @@ if config['aligner']['index'] == 'hisat2': "&& rm {sam}" ) -if config['aligner']['index'].startswith('star'): +if config['aligner'].startswith('star'): # STAR can be run in 1-pass or 2-pass modes. 
Since we may be running it # more than once in almost the same way, we pull out the shell command here @@ -348,16 +340,16 @@ if config['aligner']['index'].startswith('star'): ) logfile_extensions = ['Log.progress.out', 'Log.out', 'Log.final.out', 'Log.std.out'] -if config['aligner']['index'] == 'star': +if config['aligner'] == 'star': rule star: """ Align with STAR (1-pass mode) """ input: - fastq=common.fill_r1_r2(c.sampletable, c.patterns['cutadapt']), - index=[c.refdict[c.organism][config['aligner']['tag']]['star']], - annotation=c.refdict[c.organism][config['gtf']['tag']]['annotation'], + fastq=utils.fill_r1_r2(c.sampletable, c.patterns['cutadapt']), + index=rules.star_index.output, + annotation="references/annotation.gtf" output: bam=temporary(c.patterns['bam']), sjout=temporary(c.patterns['bam'].replace('.bam', '.star.SJ.out.tab')), @@ -384,16 +376,16 @@ if config['aligner']['index'] == 'star': shell('mkdir -p {outdir}/star_logs ' '&& mv {logfiles} {outdir}/star_logs') -if config['aligner']['index'] == 'star-twopass': +if config['aligner'] == 'star-twopass': rule star_pass1: """ First pass of alignment with STAR to get the junctions """ input: - fastq=common.fill_r1_r2(c.sampletable, c.patterns['cutadapt']), - index=[c.refdict[c.organism][config['aligner']['tag']]['star']], - annotation=c.refdict[c.organism][config['gtf']['tag']]['annotation'], + fastq=utils.fill_r1_r2(c.sampletable, c.patterns['cutadapt']), + index=rules.star_index.output, + annotation="references/annotation.gtf" output: sjout=temporary(c.patterns['bam'].replace('.bam', '.star-pass1.SJ.out.tab')), log: @@ -430,9 +422,9 @@ if config['aligner']['index'] == 'star-twopass': """ input: sjout=expand(c.patterns['bam'].replace('.bam', '.star-pass1.SJ.out.tab'), sample=SAMPLES), - fastq=common.fill_r1_r2(c.sampletable, c.patterns['cutadapt']), - index=[c.refdict[c.organism][config['aligner']['tag']]['star']], - annotation=c.refdict[c.organism][config['gtf']['tag']]['annotation'], + 
fastq=utils.fill_r1_r2(c.sampletable, c.patterns['cutadapt']), + index=rules.star_index.output, + annotation="references/annotation.gtf" output: bam=temporary(c.patterns['bam']), sjout=temporary(c.patterns['bam'].replace('.bam', '.star-pass2.SJ.out.tab')), @@ -474,7 +466,16 @@ rule rRNA: """ input: fastq=render_r1_only(c.patterns['cutadapt']), - index=[c.refdict[c.organism][config['rrna']['tag']]['bowtie2']] + index=multiext( + "references/bowtie2/rrna", + ".1.bt2", + ".2.bt2", + ".3.bt2", + ".4.bt2", + ".rev.1.bt2", + ".rev.2.bt2", + ".fa", + ), output: bam=temporary(c.patterns['rrna']['bam']) log: @@ -484,7 +485,7 @@ rule rRNA: mem_mb=gb(2), runtime=autobump(hours=2) run: - prefix = aligners.prefix_from_bowtie2_index(input.index) + prefix = os.path.commonprefix(input.index).rstrip(".") sam = output.bam.replace('.bam', '.sam') shell( @@ -553,43 +554,12 @@ rule bam_index: 'samtools index {input} {output}' -def fastq_screen_references(): - """ - Returns the Bowtie2 indexes for the configured references from the - `fastq_screen:` section of the config - """ - refs = {} - for i in config['fastq_screen']: - refs[i['label']] = c.refdict[i['organism']][i['tag']]['bowtie2'] - return refs - - -rule fastq_screen: - """ - Run fastq_screen to look for contamination from other genomes - """ - input: - **fastq_screen_references(), - fastq=render_r1_only(rules.cutadapt.output.fastq), - output: - txt=c.patterns['fastq_screen'] - log: - c.patterns['fastq_screen'] + '.log' - threads: 6 - resources: - mem_mb=gb(4), - runtime=autobump(hours=2) - params: subset=100000 - script: - wrapper_for('fastq_screen/wrapper.py') - - rule featurecounts: """ Count reads in annotations with featureCounts from the subread package """ input: - annotation=c.refdict[c.organism][config['gtf']['tag']]['annotation'], + annotation=rules.gtf.output, bam=c.targets['markduplicates']['bam'] output: counts='{sample_dir}/rnaseq_aggregation/featurecounts.txt' @@ -600,7 +570,7 @@ rule featurecounts: mem_mb=gb(16), 
runtime=autobump(hours=2) params: - strand_arg = helpers.strand_arg_lookup( + strand_arg = utils.strand_arg_lookup( c, { 'unstranded': '-s0 ', 'fr-firststrand': '-s2 ', @@ -640,10 +610,10 @@ rule rrna_libsizes_table: runtime=autobump(hours=2) run: def rrna_sample(f): - return helpers.extract_wildcards(c.patterns['rrna']['libsize'], f)['sample'] + return utils.extract_wildcards(c.patterns['rrna']['libsize'], f)['sample'] def sample(f): - return helpers.extract_wildcards(c.patterns['libsizes']['cutadapt'], f)['sample'] + return utils.extract_wildcards(c.patterns['libsizes']['cutadapt'], f)['sample'] def million(f): return float(open(f).read()) / 1e6 @@ -694,7 +664,6 @@ rule multiqc: utils.flatten(c.targets['markduplicates']) + utils.flatten(c.targets['salmon']) + utils.flatten(c.targets['rseqc']) + - utils.flatten(c.targets['fastq_screen']) + utils.flatten(c.targets['preseq']) + utils.flatten(c.targets['collectrnaseqmetrics']) + utils.flatten(c.targets['samtools']) @@ -762,7 +731,7 @@ rule collectrnaseqmetrics: """ input: bam=c.patterns['markduplicates']['bam'], - refflat=c.refdict[c.organism][config['gtf']['tag']]['refflat'] + refflat=rules.conversion_refflat.output, output: metrics=c.patterns['collectrnaseqmetrics']['metrics'], params: @@ -771,7 +740,7 @@ rule collectrnaseqmetrics: # config. 
java_args='-Xmx20g', # java_args='-Xmx2g', # [TEST SETTINGS -1] - strand_arg = helpers.strand_arg_lookup( + strand_arg = utils.strand_arg_lookup( c, { 'unstranded': 'STRAND=NONE ', 'fr-firststrand': 'STRAND=SECOND_READ_TRANSCRIPTION_STRAND ', @@ -822,12 +791,12 @@ rule salmon: Quantify reads coming from transcripts with Salmon """ input: - fastq=common.fill_r1_r2(c.sampletable, c.patterns['cutadapt']), - index=c.refdict[c.organism][config['salmon']['tag']]['salmon'], + fastq=utils.fill_r1_r2(c.sampletable, c.patterns['cutadapt']), + index='references/salmon/versionInfo.json' output: c.patterns['salmon'] params: - index_dir=os.path.dirname(c.refdict[c.organism][config['salmon']['tag']]['salmon']), + index_dir=os.path.dirname('references/salmon/versionInfo.json'), outdir=os.path.dirname(c.patterns['salmon']) log: c.patterns['salmon'] + '.log' @@ -864,14 +833,14 @@ rule kallisto: Quantify reads coming from transcripts with Kallisto """ input: - fastq=common.fill_r1_r2(c.sampletable, c.patterns['cutadapt']), - index=c.refdict[c.organism][config['kallisto']['tag']]['kallisto'], + fastq=utils.fill_r1_r2(c.sampletable, c.patterns['cutadapt']), + index='references/kallisto/transcripts.idx', output: c.patterns['kallisto'] params: - index_dir=os.path.dirname(c.refdict[c.organism][config['kallisto']['tag']]['kallisto']), + index_dir=os.path.dirname('references/kallisto/transcripts.idx'), outdir=os.path.dirname(c.patterns['kallisto']), - strand_arg = helpers.strand_arg_lookup( + strand_arg = utils.strand_arg_lookup( c, { 'unstranded': '', 'fr-firststrand': '--rf-stranded', @@ -913,7 +882,7 @@ rule rseqc_infer_experiment: """ input: bam=c.patterns['markduplicates']['bam'], - bed12=c.refdict[c.organism][config['gtf']['tag']]['bed12'] + bed12=rules.conversion_bed12.output, output: txt=c.patterns['rseqc']['infer_experiment'] log: @@ -931,7 +900,7 @@ rule rseqc_read_distribution: """ input: bam=c.patterns['markduplicates']['bam'], - 
bed12=c.refdict[c.organism][config['gtf']['tag']]['bed12'], + bed12=rules.conversion_bed12.output, output: txt=c.patterns['rseqc']['read_distribution'] log: @@ -985,7 +954,7 @@ rule bigwig_neg: log: c.patterns['bigwig']['neg'] + '.log' params: - strand_arg = helpers.strand_arg_lookup( + strand_arg = utils.strand_arg_lookup( c, { 'unstranded': '', 'fr-firststrand': '--filterRNAstrand reverse ', @@ -1019,7 +988,7 @@ rule bigwig_pos: log: c.patterns['bigwig']['pos'] + '.log' params: - strand_arg = helpers.strand_arg_lookup( + strand_arg = utils.strand_arg_lookup( c, { 'unstranded': '', 'fr-firststrand': '--filterRNAstrand forward ', @@ -1059,7 +1028,7 @@ if 'merged_bigwigs' in config: """ input: bigwigs=bigwigs_to_merge, - chromsizes=refdict[c.organism][config['aligner']['tag']]['chromsizes'], + chromsizes='references/genome.chromsizes' output: c.patterns['merged_bigwig'] log: From 36fd2e0f167c8c8d4ddec9449ffd9a427e8b0ae2 Mon Sep 17 00:00:00 2001 From: Ryan Dale Date: Fri, 3 Jan 2025 21:38:54 +0000 Subject: [PATCH 030/196] specify references dir from config --- rules/references.smk | 78 +++++++++++++++++++++----------------- workflows/rnaseq/Snakefile | 19 +++++----- 2 files changed, 53 insertions(+), 44 deletions(-) diff --git a/rules/references.smk b/rules/references.smk index f4b104ff..157eeed9 100644 --- a/rules/references.smk +++ b/rules/references.smk @@ -7,12 +7,16 @@ sys.path.insert(0, HERE + "/../..") from lib.utils import autobump, gb, hours from lib import utils +REFERENCES = config.get('reference_dir', '../../references') + def default_postprocess(origfn, newfn): shell("mv {origfn} {newfn}") rule fasta: output: - temporary('references/genome.fa.gz') + temporary(REFERENCES + '/genome.fa.gz') + log: + REFERENCES + "/logs/genome.fa.gz.log" run: utils.download_and_postprocess( urls=config['fasta']['url'], @@ -24,7 +28,9 @@ rule fasta: rule gtf: output: - temporary('references/annotation.gtf.gz') + temporary(REFERENCES + '/annotation.gtf.gz') + log: + 
REFERENCES + "/logs/annotation.gtf.gz.log" run: utils.download_and_postprocess( urls=config['gtf']['url'], @@ -36,7 +42,9 @@ rule gtf: rule rrna: output: - temporary('references/rrna.fa.gz') + temporary(REFERENCES + '/rrna.fa.gz') + log: + REFERENCES + "/logs/rrna.fa.gz.log" run: utils.download_and_postprocess( urls=config['rrna']['url'], @@ -48,18 +56,18 @@ rule rrna: rule unzip: input: - "references/{prefix}.gz" + REFERENCES + '/{prefix}.gz' output: - "references/{prefix}" + REFERENCES + '/{prefix}' shell: 'gunzip -c {input} > {output}' rule bowtie2_index: input: - "references/{label}.fa", + REFERENCES + '/{label}.fa', output: multiext( - "references/bowtie2/{label}", + REFERENCES + '/bowtie2/{label}', ".1.bt2", ".2.bt2", ".3.bt2", @@ -69,7 +77,7 @@ rule bowtie2_index: ".fa", ), log: - "references/logs/bowtie2_{label}.log" + REFERENCES + '/logs/bowtie2_{label}.log' resources: runtime=autobump(hours=8), mem_mb=autobump(gb=32), @@ -90,12 +98,12 @@ rule bowtie2_index: rule star_index: input: - fasta='references/genome.fa', - gtf='references/annotation.gtf', + fasta=REFERENCES + '/genome.fa', + gtf=REFERENCES + '/annotation.gtf', output: - protected('references/star/Genome') + REFERENCES + '/star/Genome' log: - 'references/logs/star.log' + REFERENCES + '/logs/star.log' threads: 8 resources: @@ -131,10 +139,10 @@ rule star_index: rule hisat2_index: input: - "references/genome.fa", + REFERENCES + '/genome.fa', output: multiext( - "references/hisat2/genome", + REFERENCES + '/hisat2/genome', ".1.ht2", ".2.ht2", ".3.ht2", @@ -146,7 +154,7 @@ rule hisat2_index: ".fa", ) log: - "references/logs/hisat2.log" + REFERENCES + '/logs/hisat2.log' resources: runtime=autobump(hours=8), mem_mb=autobump(gb=32), @@ -168,10 +176,10 @@ rule hisat2_index: rule transcriptome_fasta: input: - fasta='references/genome.fa', - gtf='references/annotation.gtf', + fasta=REFERENCES + '/genome.fa', + gtf=REFERENCES + '/annotation.gtf', output: - 'references/transcriptome.fa' + REFERENCES + 
'/transcriptome.fa' resources: runtime=hours(1) shell: @@ -180,13 +188,13 @@ rule transcriptome_fasta: rule salmon_index: input: - 'references/transcriptome.fa' + REFERENCES + '/transcriptome.fa' output: - 'references/salmon/versionInfo.json' + REFERENCES + '/salmon/versionInfo.json' log: - 'references/logs/salmon.log' + REFERENCES + '/logs/salmon.log' params: - outdir='references/salmon' + outdir=REFERENCES + '/salmon' resources: mem_mb=gb(32), runtime=hours(2) @@ -202,11 +210,11 @@ rule salmon_index: rule kallisto_index: output: - 'references/kallisto/transcripts.idx', + REFERENCES + '/kallisto/transcripts.idx', input: - 'references/genome.fa' + REFERENCES + '/genome.fa' log: - 'references/logs/kallisto.log' + REFERENCES + '/logs/kallisto.log' resources: runtime=hours(2), mem_mb=gb(32), @@ -219,11 +227,11 @@ rule kallisto_index: rule conversion_refflat: input: - 'references/annotation.gtf' + REFERENCES + '/annotation.gtf' output: - protected('references/annotation.refflat') + REFERENCES + '/annotation.refflat' log: - 'references/logs/annotation.refflat.log' + REFERENCES + '/logs/annotation.refflat.log' resources: runtime=hours(2), mem_mb=gb(2) @@ -235,9 +243,9 @@ rule conversion_refflat: rule conversion_bed12: input: - 'references/annotation.gtf' + REFERENCES + '/annotation.gtf' output: - protected('references/annotation.bed12') + REFERENCES + '/annotation.bed12' resources: runtime=hours(2), mem_mb=gb(2) @@ -249,11 +257,11 @@ rule conversion_bed12: rule chromsizes: input: - 'references/genome.fa' + REFERENCES + '/genome.fa' output: - protected('references/genome.chromsizes') + REFERENCES + '/genome.chromsizes' log: - 'references/logs/genome.chromsizes.log' + REFERENCES + '/logs/genome.chromsizes.log' params: # NOTE: Be careful with the memory here; make sure you have enough # and/or it matches the resources you're requesting @@ -280,9 +288,9 @@ rule mappings: Creates gzipped TSV mapping between attributes in the GTF. 
""" input: - gtf='references/annotation.gtf' + gtf=REFERENCES + '/annotation.gtf' output: - protected('references/annotation.mapping.tsv.gz') + REFERENCES + '/annotation.mapping.tsv.gz' params: include_featuretypes=lambda wildcards, output: conversion_kwargs[output[0]].get('include_featuretypes', []) resources: diff --git a/workflows/rnaseq/Snakefile b/workflows/rnaseq/Snakefile index 634f1d91..374a1437 100644 --- a/workflows/rnaseq/Snakefile +++ b/workflows/rnaseq/Snakefile @@ -16,6 +16,7 @@ configfile: 'config/config.yaml' include: '../../rules/references.smk' +REFERENCES = config.get('reference_dir', '../../references') # Verify configuration of config and sampletable files ################utils.preflight(config) @@ -349,7 +350,7 @@ if config['aligner'] == 'star': input: fastq=utils.fill_r1_r2(c.sampletable, c.patterns['cutadapt']), index=rules.star_index.output, - annotation="references/annotation.gtf" + annotation=REFERENCES + "/annotation.gtf" output: bam=temporary(c.patterns['bam']), sjout=temporary(c.patterns['bam'].replace('.bam', '.star.SJ.out.tab')), @@ -385,7 +386,7 @@ if config['aligner'] == 'star-twopass': input: fastq=utils.fill_r1_r2(c.sampletable, c.patterns['cutadapt']), index=rules.star_index.output, - annotation="references/annotation.gtf" + annotation=REFERENCES + "/annotation.gtf" output: sjout=temporary(c.patterns['bam'].replace('.bam', '.star-pass1.SJ.out.tab')), log: @@ -424,7 +425,7 @@ if config['aligner'] == 'star-twopass': sjout=expand(c.patterns['bam'].replace('.bam', '.star-pass1.SJ.out.tab'), sample=SAMPLES), fastq=utils.fill_r1_r2(c.sampletable, c.patterns['cutadapt']), index=rules.star_index.output, - annotation="references/annotation.gtf" + annotation=REFERENCES + "/annotation.gtf" output: bam=temporary(c.patterns['bam']), sjout=temporary(c.patterns['bam'].replace('.bam', '.star-pass2.SJ.out.tab')), @@ -467,7 +468,7 @@ rule rRNA: input: fastq=render_r1_only(c.patterns['cutadapt']), index=multiext( - "references/bowtie2/rrna", + 
REFERENCES + "/bowtie2/rrna", ".1.bt2", ".2.bt2", ".3.bt2", @@ -792,11 +793,11 @@ rule salmon: """ input: fastq=utils.fill_r1_r2(c.sampletable, c.patterns['cutadapt']), - index='references/salmon/versionInfo.json' + index=REFERENCES + "/salmon/versionInfo.json" output: c.patterns['salmon'] params: - index_dir=os.path.dirname('references/salmon/versionInfo.json'), + index_dir=os.path.dirname(REFERENCES + "/salmon/versionInfo.json"), outdir=os.path.dirname(c.patterns['salmon']) log: c.patterns['salmon'] + '.log' @@ -834,11 +835,11 @@ rule kallisto: """ input: fastq=utils.fill_r1_r2(c.sampletable, c.patterns['cutadapt']), - index='references/kallisto/transcripts.idx', + index=REFERENCES + "/kallisto/transcripts.idx", output: c.patterns['kallisto'] params: - index_dir=os.path.dirname('references/kallisto/transcripts.idx'), + index_dir=os.path.dirname(REFERENCES + "/kallisto/transcripts.idx"), outdir=os.path.dirname(c.patterns['kallisto']), strand_arg = utils.strand_arg_lookup( c, { @@ -1028,7 +1029,7 @@ if 'merged_bigwigs' in config: """ input: bigwigs=bigwigs_to_merge, - chromsizes='references/genome.chromsizes' + chromsizes=REFERENCES + "/genome.chromsizes" output: c.patterns['merged_bigwig'] log: From c321ed3015d6fcba91d88e452849a1adc04ca1ba Mon Sep 17 00:00:00 2001 From: Ryan Dale Date: Fri, 3 Jan 2025 21:39:40 +0000 Subject: [PATCH 031/196] round of cleanup --- workflows/rnaseq/Snakefile | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/workflows/rnaseq/Snakefile b/workflows/rnaseq/Snakefile index 374a1437..3c251eb3 100644 --- a/workflows/rnaseq/Snakefile +++ b/workflows/rnaseq/Snakefile @@ -1,7 +1,6 @@ import os import sys from pathlib import Path -from textwrap import dedent import yaml import tempfile import pandas as pd @@ -462,9 +461,6 @@ if config['aligner'] == 'star-twopass': rule rRNA: - """ - Map reads with bowtie2 to the rRNA reference - """ input: fastq=render_r1_only(c.patterns['cutadapt']), index=multiext( @@ -508,9 +504,6 @@ rule rRNA: 
rule fastq_count: - """ - Count reads in a FASTQ file - """ input: fastq='{sample_dir}/{sample}/{sample}{suffix}.fastq.gz' output: @@ -524,9 +517,6 @@ rule fastq_count: rule bam_count: - """ - Count reads in a BAM file - """ input: bam='{sample_dir}/{sample}/{suffix}.bam' output: @@ -540,9 +530,6 @@ rule bam_count: rule bam_index: - """ - Index a BAM - """ input: bam='{prefix}.bam' output: From 32f43bc036d5f27e556b4c2cc1d0ce82b6f78f38 Mon Sep 17 00:00:00 2001 From: Ryan Dale Date: Fri, 3 Jan 2025 21:39:55 +0000 Subject: [PATCH 032/196] better use of params --- workflows/rnaseq/Snakefile | 53 ++++++++++++++++++++++++-------------- 1 file changed, 33 insertions(+), 20 deletions(-) diff --git a/workflows/rnaseq/Snakefile b/workflows/rnaseq/Snakefile index 3c251eb3..cc66821f 100644 --- a/workflows/rnaseq/Snakefile +++ b/workflows/rnaseq/Snakefile @@ -216,20 +216,21 @@ rule cutadapt: resources: mem_mb=gb(2), runtime=autobump(hours=2) + params: + extra=( + "--nextseq-trim 20 " + "--overlap 6 " + "--minimum-length 25 " + "-a AGATCGGAAGAGCACACGTCTGAACTCCAGTCA " + ) + "-A AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGT " if c.is_paired else "" run: - - # NOTE: Change cutadapt params here if c.is_paired: shell( "cutadapt " "-o {output[0]} " "-p {output[1]} " - "-a AGATCGGAAGAGCACACGTCTGAACTCCAGTCA " - "-A AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGT " - "--nextseq-trim 20 " - "--overlap 6 " - '-j {threads} ' - '--minimum-length 25 ' + "-j {threads} " + "{params.extra} " "{input.fastq[0]} " "{input.fastq[1]} " "&> {log}" @@ -238,11 +239,8 @@ rule cutadapt: shell( "cutadapt " "-o {output[0]} " - "-a AGATCGGAAGAGCACACGTCTGAACTCCAGTCA " - "--nextseq-trim 20 " - "--overlap 6 " - '-j {threads} ' - '--minimum-length 25 ' + "-j {threads} " + "{params.extra} " "{input.fastq[0]} " "&> {log}" ) @@ -282,6 +280,8 @@ if config['aligner'] == 'hisat2': resources: mem_mb=gb(32), runtime=autobump(hours=8) + params: + extra="" run: prefix = os.path.commonprefix(input.index).rstrip(".") @@ -323,7 +323,9 @@ if 
config['aligner'].startswith('star'): '--readFilesIn {input.fastq} ' '--readFilesCommand zcat ' '--outFileNamePrefix {prefix} ' - + '{params.extra} ' + ) + STAR_PARAMS = ( # NOTE: The STAR docs indicate that the following parameters are # standard options for ENCODE long-RNA-seq pipeline. Comments are from # the STAR docs. @@ -359,6 +361,9 @@ if config['aligner'] == 'star': resources: mem_mb=gb(64), runtime=autobump(hours=8) + params: + extra=STAR_PARAMS + run: genomedir = os.path.dirname(input.index[0]) outdir = os.path.dirname(output[0]) @@ -394,6 +399,8 @@ if config['aligner'] == 'star-twopass': resources: mem_mb=gb(64), runtime=autobump(hours=8) + params: + extra=STAR_PARAMS run: genomedir = os.path.dirname(input.index[0]) outdir = os.path.dirname(output[0]) @@ -434,6 +441,8 @@ if config['aligner'] == 'star-twopass': resources: mem_mb=gb(64), runtime=autobump(hours=8) + params: + extra=STAR_PARAMS run: genomedir = os.path.dirname(input.index[0]) outdir = os.path.dirname(output[0]) @@ -481,6 +490,12 @@ rule rRNA: resources: mem_mb=gb(2), runtime=autobump(hours=2) + params: + extra=( + '-k 1 ' # NOTE: we only care if >=1 mapped + '--no-unal ' # NOTE: suppress unaligned reads + ) + run: prefix = os.path.commonprefix(input.index).rstrip(".") sam = output.bam.replace('.bam', '.sam') @@ -489,10 +504,9 @@ rule rRNA: "bowtie2 " "-x {prefix} " "-U {input.fastq} " - '-k 1 ' # NOTE: we only care if >=1 mapped - '--no-unal ' # NOTE: suppress unaligned reads "--threads {threads} " "-S {sam} " + "{params.extra} " "> {log} 2>&1" ) @@ -558,13 +572,12 @@ rule featurecounts: mem_mb=gb(16), runtime=autobump(hours=2) params: - strand_arg = utils.strand_arg_lookup( - c, { + strand_arg={ 'unstranded': '-s0 ', 'fr-firststrand': '-s2 ', 'fr-secondstrand': '-s1 ', - } - ) + }[config["stranded"]], + extra="" run: # NOTE: By default, we use -p for paired-end p_arg = '' From a9216b91897ff5f4d8aeffd6b868984fedbf1301 Mon Sep 17 00:00:00 2001 From: Ryan Dale Date: Sat, 4 Jan 2025 03:57:11 
+0000 Subject: [PATCH 033/196] try moving utils.py to common.smk --- lib/utils.py => rules/common.smk | 9 +++++++++ 1 file changed, 9 insertions(+) rename lib/utils.py => rules/common.smk (99%) diff --git a/lib/utils.py b/rules/common.smk similarity index 99% rename from lib/utils.py rename to rules/common.smk index fd8c4dba..d0401625 100644 --- a/lib/utils.py +++ b/rules/common.smk @@ -1178,3 +1178,12 @@ def gff2gtf(gff, gtf): shell("gzip -d -S .gz.0.tmp {gff} -c | gffread - -T -o- | gzip -c > {gtf}") else: shell("gffread {gff} -T -o- | gzip -c > {gtf}") + + +def wrapper_for(path): + return 'file:' + os.path.join('../..','wrappers', 'wrappers', path) + +def detect_sra(sampletable): + return 'Run' in self.sampletable.columns and any(self.sampletable['Run'].str.startswith('SRR')) + +# vim: ft=python From 300e73d9d868492c92d286b4de8706109dc2549b Mon Sep 17 00:00:00 2001 From: Ryan Dale Date: Sat, 4 Jan 2025 03:57:28 +0000 Subject: [PATCH 034/196] add strand_check and sra rules --- rules/sra.smk | 34 +++++++++++++++++++++ rules/strand_check.smk | 69 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 103 insertions(+) create mode 100644 rules/sra.smk create mode 100644 rules/strand_check.smk diff --git a/rules/sra.smk b/rules/sra.smk new file mode 100644 index 00000000..861b5098 --- /dev/null +++ b/rules/sra.smk @@ -0,0 +1,34 @@ + +sampletable['orig_filename'] = expand( + 'original_data/sra_samples/{sample}/{sample}_R{n}.fastq.gz', sample=SAMPLES, n=1) + +if is_paired: + sampletable['orig_filename_R2'] = expand( + 'original_data/sra_samples/{sample}/{sample}_R{n}.fastq.gz', sample=SAMPLES, n=2) + +rule fastq_dump: + output: + fastq=expand('original_data/sra_samples/{sample}/{sample}_R{n}.fastq.gz', n=n) + log: + 'original_data/sra_samples/{sample}/{sample}.fastq.gz.log' + params: + is_paired=is_paired, + sampletable=_st, + # extra="-X 100000", # [TEST SETTINGS] + resources: + mem_mb=gb(1), + disk_mb=autobump(gb=1), + runtime=autobump(hours=2) + run: + _st = 
sampletable.set_index(sampletable.columns[0]) + srr = _st.loc[wildcards.sample, "Run"] + extra = params.get("extra", "") + if is_paired: + shell("fastq-dump {srr} --gzip --split-files {extra} &> {log}") + shell("mv {srr}_1.fastq.gz {output[0]}") + shell("mv {srr}_2.fastq.gz {output[1]}") + else: + shell("fastq-dump {srr} -Z {extra} 2> {log} | gzip -c > {output[0]}.tmp") + shell("mv {output[0]}.tmp {output[0]}") + +# vim: ft=snakemake diff --git a/rules/strand_check.smk b/rules/strand_check.smk new file mode 100644 index 00000000..625ba3e2 --- /dev/null +++ b/rules/strand_check.smk @@ -0,0 +1,69 @@ + +rule sample_strand_check: + input: + fastq=fill_r1_r2(c.sampletable, c.patterns['fastq']), + index=rules.bowtie2_index.output, + bed12=rules.conversion_bed12.output, + output: + strandedness='strand_check/{sample}/{sample}.strandedness', + bam=temporary('strand_check/{sample}/{sample}.strandedness.bam'), + bai=temporary('strand_check/{sample}/{sample}.strandedness.bam.bai'), + fastqs=temporary(expand('strand_check/{sample}/{sample}_R{n}.strandedness.fastq', sample=SAMPLES, n=n)), + log: + 'strand_check/{sample}/{sample}.strandedness.log' + threads: 6 + resources: + mem_mb=gb(8), + runtime=autobump(hours=2) + run: + prefix = aligners.prefix_from_bowtie2_index(input.index) + nreads = int(config['strand_check_reads']) * 4 + if c.is_paired: + assert len(input.fastq) == 2 + assert len(output.fastqs) == 2 + shell('set +o pipefail; zcat {input.fastq[0]} | head -n {nreads} > {output.fastqs[0]}') + shell('set +o pipefail; zcat {input.fastq[0]} | head -n {nreads} > {output.fastqs[1]}') + fastqs = f'-1 {output.fastqs[0]} -2 {output.fastqs[1]} ' + else: + assert len(input.fastq) == 1 + assert len(output.fastqs) == 1 + shell('set +o pipefail; zcat {input.fastq[0]} | head -n {nreads} > {output.fastqs[0]}') + fastqs = f'-U {output.fastqs[0]} ' + shell( + "bowtie2 " + "-x {prefix} " + "{fastqs} " + '--no-unal ' + "--threads {threads} 2> {log} " + "| samtools view -Sb - " + "| samtools 
sort - -o {output.bam} " + ) + shell("samtools index {output.bam}") + shell( + 'infer_experiment.py -r {input.bed12} -i {output.bam} > {output} 2> {log}' + ) + +rule strand_check: + input: + expand('strand_check/{sample}/{sample}.strandedness', sample=SAMPLES) + output: + html='strand_check/strandedness.html', + filelist=temporary('strand_check/filelist') + log: + 'strand_check/strandedness.log' + resources: + mem_mb=gb(1), + runtime=autobump(hours=2) + run: + with open(output.filelist, 'w') as fout: + for i in input: + fout.write(i + '\n') + shell( + 'multiqc ' + '--force ' + '--module rseqc ' + '--file-list {output.filelist} ' + '--filename {output.html} &> {log}' + ) + +# vim: ft=snakemake From 407332e443a0c3c44dc3ce4df16f2798119b8c43 Mon Sep 17 00:00:00 2001 From: Ryan Dale Date: Sat, 4 Jan 2025 03:57:47 +0000 Subject: [PATCH 035/196] mega refactor, still only partway done.... --- workflows/rnaseq/Snakefile | 358 ++++++++++--------------------------- 1 file changed, 91 insertions(+), 267 deletions(-) diff --git a/workflows/rnaseq/Snakefile b/workflows/rnaseq/Snakefile index cc66821f..84b9bdd4 100644 --- a/workflows/rnaseq/Snakefile +++ b/workflows/rnaseq/Snakefile @@ -1,217 +1,78 @@ import os -import sys -from pathlib import Path import yaml import tempfile import pandas as pd -HERE = str(Path(workflow.snakefile).parent) -sys.path.insert(0, HERE + "/../..") -from lib import utils -from lib.utils import autobump, gb, hours -from lib.patterns_targets import RNASeqConfig - configfile: 'config/config.yaml' include: '../../rules/references.smk' +include: '../../rules/common.smk' REFERENCES = config.get('reference_dir', '../../references') -# Verify configuration of config and sampletable files -################utils.preflight(config) +sampletable = pd.read_table(config["sampletable"], sep="\t") +_st = c.sampletable.set_index(c.sampletable.columns[0]) +is_paired = detect_layout(sampletable) == "PE" +is_sra = detect_sra(sampletable) +n = ["1", "2"] if is_paired else 
["1"] +SAMPLES = sampletable.iloc[:, 0].values -c = RNASeqConfig(config, config.get('patterns', 'config/rnaseq_patterns.yaml')) +# TODO: moved utils.py over to common.smk; not sure if this means that postprocessing will fail or not... -SAMPLES = c.sampletable.iloc[:, 0].values wildcard_constraints: n = '[1,2]', sample = '|'.join(SAMPLES) +localrules: symlinks, symlink_targets -def wrapper_for(path): - return 'file:' + os.path.join('../..','wrappers', 'wrappers', path) - - -# ---------------------------------------------------------------------------- -# RULES -# ---------------------------------------------------------------------------- - -# See "patterns and targets" in the documentation for what's going on here. -final_targets = utils.flatten(( - c.targets['fastqc'], - c.targets['rrna_percentages_table'], - c.targets['multiqc'], - c.targets['featurecounts'], - c.targets['markduplicates'], - c.targets['salmon'], - c.targets['kallisto'], - c.targets['preseq'], - c.targets['rseqc'], - c.targets['collectrnaseqmetrics'], - c.targets['bigwig'], - c.targets['samtools'], -)) - -if config.get('merged_bigwigs', None): - final_targets.extend(utils.flatten(c.targets['merged_bigwig'])) - +rule all: + input: + 'data/rnaseq_aggregation/multiqc.html', -def render_r1_r2(pattern): - return expand(pattern, sample='{sample}', n=c.n) +if is_sra: + include: '../../rules/sra.smk' -def render_r1_only(pattern): - return expand(pattern, sample='{sample}', n=1) -rule targets: +def orig_for_sample(wc): """ - Final targets to create + Given a sample, returns either one or two original fastq files + depending on whether the library was single- or paired-end. """ - input: final_targets - -if c.is_sra: - - # Convert the sampletable to be indexed by the first column, for - # convenience in generating the input/output filenames. 
- _st = c.sampletable.set_index(c.sampletable.columns[0]) - - rule fastq_dump: - output: - fastq=render_r1_r2(c.patterns['sra_fastq']) - log: - render_r1_only(c.patterns['sra_fastq'])[0] + '.log' - params: - is_paired=c.is_paired, - sampletable=_st, - # limit = 100000, # [TEST SETTINGS] - resources: - mem_mb=gb(1), - disk_mb=autobump(gb=1), - runtime=autobump(hours=2) - conda: - '../../wrappers/wrappers/fastq-dump/environment.yaml' - script: - wrapper_for('fastq-dump/wrapper.py') - -if 'orig_filename' in c.sampletable.columns: - - localrules: symlinks, symlink_targets - - # Convert the sampletable to be indexed by the first column, for - # convenience in generating the input/output filenames. - _st = c.sampletable.set_index(c.sampletable.columns[0]) - - def orig_for_sample(wc): - """ - Given a sample, returns either one or two original fastq files - depending on whether the library was single- or paired-end. - """ - if c.is_paired: - return _st.loc[wc.sample, ['orig_filename', 'orig_filename_R2']] - return _st.loc[wc.sample, ['orig_filename']] - - - rule symlinks: - """ - Symlinks files over from original filename - """ - input: - orig_for_sample - output: - render_r1_r2(c.patterns['fastq']) - threads: 1 - resources: - mem_mb=100, - runtime=10, - run: - assert len(output) == len(input), (input, output) - for src, linkname in zip(input, output): - utils.make_relative_symlink(src, linkname) - + if is_paired: + return _st.loc[wc.sample, ['orig_filename', 'orig_filename_R2']] + return _st.loc[wc.sample, ['orig_filename']] - rule symlink_targets: - input: c.targets['fastq'] -# This can be set at the command line with --config strand_check_reads=1000 -config.setdefault('strand_check_reads', 1e5) - -rule sample_strand_check: +rule symlinks: input: - fastq=utils.fill_r1_r2(c.sampletable, c.patterns['fastq']), - index=rules.bowtie2_index.output, - bed12=rules.conversion_bed12.output, + lambda wc: _st.loc[wc.sample, ['orig_filename', 'orig_filename_R2']] if is_paired + else 
_st.loc[wc.sample, ['orig_filename']] output: - strandedness=c.patterns['strand_check']['tsv'], - bam=temporary(c.patterns['strand_check']['bam']), - idx=temporary(c.patterns['strand_check']['bam'] + '.bai'), - fastqs=temporary(render_r1_r2(c.patterns['strand_check']['fastq'])), - log: - c.patterns['strand_check']['tsv'] + '.log' - threads: 6 + expand('data/rnaseq_samples/{sample}/{sample}_R{n}.fastq.gz', sample=SAMPLES, n=n) + threads: 1 resources: - mem_mb=gb(8), - runtime=autobump(hours=2) + mem_mb=100, + runtime=10, run: - prefix = aligners.prefix_from_bowtie2_index(input.index) - nreads = int(config['strand_check_reads']) * 4 - if c.is_paired: - assert len(input.fastq) == 2 - assert len(output.fastqs) == 2 - shell('set +o pipefail; zcat {input.fastq[0]} | head -n {nreads} > {output.fastqs[0]}') - shell('set +o pipefail; zcat {input.fastq[0]} | head -n {nreads} > {output.fastqs[1]}') - fastqs = f'-1 {output.fastqs[0]} -2 {output.fastqs[1]} ' - else: - assert len(input.fastq) == 1 - assert len(output.fastqs) == 1 - shell('set +o pipefail; zcat {input.fastq[0]} | head -n {nreads} > {output.fastqs[0]}') - fastqs = f'-U {output.fastqs[0]} ' - shell( - "bowtie2 " - "-x {prefix} " - "{fastqs} " - '--no-unal ' - "--threads {threads} 2> {log} " - "| samtools view -Sb - " - "| samtools sort - -o {output.bam} " - ) - shell("samtools index {output.bam}") - shell( - 'infer_experiment.py -r {input.bed12} -i {output.bam} > {output} 2> {log}' - ) + assert len(output) == len(input), (input, output) + for src, linkname in zip(input, output): + make_relative_symlink(src, linkname) -rule strand_check: - input: - expand(c.patterns['strand_check']['tsv'], sample=SAMPLES) - output: - html='strand_check/strandedness.html', - filelist=temporary('strand_check/filelist') - log: - 'strand_check/strandedness.log' - resources: - mem_mb=gb(1), - runtime=autobump(hours=2) - run: - with open(output.filelist, 'w') as fout: - for i in input: - fout.write(i + '\n') - shell( - 'multiqc ' - 
'--force ' - '--module rseqc ' - '--file-list {output.filelist} ' - '--filename {output.html} &> {log}' - ) +rule symlink_targets: + input: c.targets['fastq'] + +# This can be set at the command line with --config strand_check_reads=1000 +config.setdefault('strand_check_reads', 1e5) +include: '../../rules/strand_check.smk' rule cutadapt: - """ - Run cutadapt - """ input: - fastq=render_r1_r2(c.patterns['fastq']) + fastq=expand('data/rnaseq_samples/{sample}/{sample}_R{n}.fastq.gz', sample=SAMPLES, n=n) output: - fastq=render_r1_r2(c.patterns['cutadapt']) + fastq=expand('data/rnaseq_samples/{sample}/{sample}_R{n}.cutadapt.fastq.gz', sample=SAMPLES, n=n) log: - render_r1_r2(c.patterns['cutadapt'])[0] + '.log' + 'data/rnaseq_samples/{sample}/{sample}_R1.fastq.gz' threads: 6 resources: mem_mb=gb(2), @@ -222,7 +83,7 @@ rule cutadapt: "--overlap 6 " "--minimum-length 25 " "-a AGATCGGAAGAGCACACGTCTGAACTCCAGTCA " - ) + "-A AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGT " if c.is_paired else "" + ) + "-A AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGT " if is_paired else "" run: if c.is_paired: shell( @@ -245,11 +106,8 @@ rule cutadapt: "&> {log}" ) - +# TODO: rm wrapper rule fastqc: - """ - Run FastQC - """ input: '{sample_dir}/{sample}/{sample}{suffix}' threads: @@ -266,11 +124,14 @@ rule fastqc: if config['aligner'] == 'hisat2': rule hisat2: - """ - Map reads with HISAT2 - """ input: - fastq=utils.fill_r1_r2(c.sampletable, c.patterns['cutadapt']), + # TODO: make sure this works + fastq=( + 'data/rnaseq_samples/{sample}/{sample}_R1.fastq.gz', + 'data/rnaseq_samples/{sample}/{sample}_R2.fastq.gz', + ) if is_paired else ( + 'data/rnaseq_samples/{sample}/{sample}_R1.fastq.gz', + ), index=rules.hisat2_index.output, output: bam=temporary(c.patterns['bam']) @@ -349,7 +210,7 @@ if config['aligner'] == 'star': Align with STAR (1-pass mode) """ input: - fastq=utils.fill_r1_r2(c.sampletable, c.patterns['cutadapt']), + fastq=fill_r1_r2(c.sampletable, c.patterns['cutadapt']), index=rules.star_index.output, 
annotation=REFERENCES + "/annotation.gtf" output: @@ -388,7 +249,7 @@ if config['aligner'] == 'star-twopass': First pass of alignment with STAR to get the junctions """ input: - fastq=utils.fill_r1_r2(c.sampletable, c.patterns['cutadapt']), + fastq=fill_r1_r2(c.sampletable, c.patterns['cutadapt']), index=rules.star_index.output, annotation=REFERENCES + "/annotation.gtf" output: @@ -429,7 +290,7 @@ if config['aligner'] == 'star-twopass': """ input: sjout=expand(c.patterns['bam'].replace('.bam', '.star-pass1.SJ.out.tab'), sample=SAMPLES), - fastq=utils.fill_r1_r2(c.sampletable, c.patterns['cutadapt']), + fastq=fill_r1_r2(c.sampletable, c.patterns['cutadapt']), index=rules.star_index.output, annotation=REFERENCES + "/annotation.gtf" output: @@ -611,10 +472,10 @@ rule rrna_libsizes_table: runtime=autobump(hours=2) run: def rrna_sample(f): - return utils.extract_wildcards(c.patterns['rrna']['libsize'], f)['sample'] + return extract_wildcards(c.patterns['rrna']['libsize'], f)['sample'] def sample(f): - return utils.extract_wildcards(c.patterns['libsizes']['cutadapt'], f)['sample'] + return extract_wildcards(c.patterns['libsizes']['cutadapt'], f)['sample'] def million(f): return float(open(f).read()) / 1e6 @@ -651,27 +512,38 @@ rule rrna_libsizes_table: rule multiqc: - """ - Aggregate various QC stats and logs into a single HTML report with MultiQC - """ - # NOTE: if you add more rules and want MultiQC to pick up the output, then - # add outputs from those rules to the inputs here. 
input: files=( - utils.flatten(c.targets['fastqc']) + - utils.flatten(c.targets['rrna_percentages_yaml']) + - utils.flatten(c.targets['cutadapt']) + - utils.flatten(c.targets['featurecounts']) + - utils.flatten(c.targets['markduplicates']) + - utils.flatten(c.targets['salmon']) + - utils.flatten(c.targets['rseqc']) + - utils.flatten(c.targets['preseq']) + - utils.flatten(c.targets['collectrnaseqmetrics']) + - utils.flatten(c.targets['samtools']) + expand('data/rnaseq_samples/{sample}/{sample}_R{n}.fastq.gz', sample=SAMPLES, n=n), + expand( + 'data/rnaseq_samples/{sample}/fastqc/{sample}_R1{kind}.fastq.gz_fastqc.zip', + sample=SAMPLES, kind=["", ".cutadapt", ".cutadapt.bam"] + ), + expand( + 'data/rnaseq_samples/{sample}/{sample}.cutadapt.markdups{ext}', + sample=SAMPLES, ext=['.bam', '.bam.bai'] + ), + expand( + 'data/rnaseq_samples/{sample}/{sample}.salmon/quant.sf', + sample=SAMPLES + ), + expand('data/rnaseq_samples/{sample}/{sample}.kallisto/abundance.h5', sample=SAMPLES), + expand('data/rnaseq_samples/{sample}/{sample}_preseq_c_curve.txt', sample=SAMPLES), + expand('data/rnaseq_samples/{sample}/rseqc/{sample}_infer_experiment.txt', sample=SAMPLES), + expand('data/rnaseq_samples/{sample}/rseqc/{sample}_read_distribution.txt', sample=SAMPLES), + expand('data/rnaseq_samples/{sample}/{sample}.collectrnaseqmetrics.metrics', sample=SAMPLES), + expand('data/rnaseq_samples/{sample}/{sample}.cutadapt.bam.pos.bigwig', sample=SAMPLES, dir=["pos", "neg"]), + expand('data/rnaseq_samples/{sample}/idxstat_{sample}.txt', sample=SAMPLES), + expand('data/rnaseq_samples/{sample}/{sample}.cutadapt.markdups.bam.flagstat', sample=SAMPLES), + expand('data/rnaseq_samples/{sample}/{sample}.cutadapt.markdups.bam.stats', sample=SAMPLES), + 'data/rnaseq_aggregation/rrna_percentages_table.tsv', + 'data/rnaseq_aggregation/featurecounts.txt', ), config='config/multiqc_config.yaml' - output: c.targets['multiqc'] - log: c.targets['multiqc'][0] + '.log' + output: + 
'data/rnaseq_aggregation/multiqc.html' + log: + 'data/rnaseq_aggregation/multiqc.log' threads: 1 resources: mem_mb=gb(2), @@ -741,13 +613,11 @@ rule collectrnaseqmetrics: # config. java_args='-Xmx20g', # java_args='-Xmx2g', # [TEST SETTINGS -1] - strand_arg = utils.strand_arg_lookup( - c, { + strand_arg={ 'unstranded': 'STRAND=NONE ', 'fr-firststrand': 'STRAND=SECOND_READ_TRANSCRIPTION_STRAND ', 'fr-secondstrand': 'STRAND=FIRST_READ_TRANSCRIPTION_STRAND ', - } - ) + }[config["stranded"] log: c.patterns['collectrnaseqmetrics']['metrics'] + '.log' threads: 1 @@ -792,7 +662,7 @@ rule salmon: Quantify reads coming from transcripts with Salmon """ input: - fastq=utils.fill_r1_r2(c.sampletable, c.patterns['cutadapt']), + fastq=fill_r1_r2(c.sampletable, c.patterns['cutadapt']), index=REFERENCES + "/salmon/versionInfo.json" output: c.patterns['salmon'] @@ -834,20 +704,18 @@ rule kallisto: Quantify reads coming from transcripts with Kallisto """ input: - fastq=utils.fill_r1_r2(c.sampletable, c.patterns['cutadapt']), + fastq=fill_r1_r2(c.sampletable, c.patterns['cutadapt']), index=REFERENCES + "/kallisto/transcripts.idx", output: c.patterns['kallisto'] params: index_dir=os.path.dirname(REFERENCES + "/kallisto/transcripts.idx"), outdir=os.path.dirname(c.patterns['kallisto']), - strand_arg = utils.strand_arg_lookup( - c, { + strand_arg={ 'unstranded': '', 'fr-firststrand': '--rf-stranded', 'fr-secondstrand': '--fr-stranded', - } - ) + }[config["stranded"] log: c.patterns['kallisto'] + '.log' threads: @@ -955,13 +823,11 @@ rule bigwig_neg: log: c.patterns['bigwig']['neg'] + '.log' params: - strand_arg = utils.strand_arg_lookup( - c, { + strand_arg = { 'unstranded': '', 'fr-firststrand': '--filterRNAstrand reverse ', 'fr-secondstrand': '--filterRNAstrand forward ', - } - ) + }[config["stranded"] run: shell( 'bamCoverage ' @@ -989,13 +855,11 @@ rule bigwig_pos: log: c.patterns['bigwig']['pos'] + '.log' params: - strand_arg = utils.strand_arg_lookup( - c, { + strand_arg={ 
'unstranded': '', 'fr-firststrand': '--filterRNAstrand forward ', 'fr-secondstrand': '--filterRNAstrand reverse ', - } - ) + }[config["stranded"] run: shell( 'bamCoverage ' @@ -1021,46 +885,6 @@ def bigwigs_to_merge(wc): sample=neg_labels) return pos_bigwigs + neg_bigwigs -if 'merged_bigwigs' in config: - rule merge_bigwigs: - """ - Merge together bigWigs as specified in the config ("merged_bigwigs" - section). - """ - input: - bigwigs=bigwigs_to_merge, - chromsizes=REFERENCES + "/genome.chromsizes" - output: - c.patterns['merged_bigwig'] - log: - c.patterns['merged_bigwig'] + '.log' - resources: - mem_mb=gb(16), - runtime=autobump(hours=2) - script: - wrapper_for('average-bigwigs/wrapper.py') - - -rule rnaseq_rmarkdown: - """ - Run and render the RMarkdown file that performs differential expression - """ - input: - featurecounts=utils.flatten(c.targets['featurecounts']), - salmon=utils.flatten(c.targets['salmon']), - - # NOTE: the Rmd will likely need heavy editing depending on the project. 
- rmd='downstream/rnaseq.Rmd', - sampletable=config['sampletable'] - output: - 'downstream/rnaseq.html' - log: - 'downstream/rnaseq.log' - shell: - 'Rscript -e ' - '''"rmarkdown::render('{input.rmd}')" ''' - '> {log} 2>&1' - # [TEST_SETTINGS -1] rule flagstat: input: From e3308bf08e897df00120854e01691824b412627d Mon Sep 17 00:00:00 2001 From: Ryan Dale Date: Mon, 6 Jan 2025 19:28:27 -0500 Subject: [PATCH 036/196] back to utils.py --- rules/common.smk => lib/utils.py | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) rename rules/common.smk => lib/utils.py (98%) diff --git a/rules/common.smk b/lib/utils.py similarity index 98% rename from rules/common.smk rename to lib/utils.py index d0401625..f1a97c79 100644 --- a/rules/common.smk +++ b/lib/utils.py @@ -18,6 +18,12 @@ # Small helper functions +def render_r1_r2(pattern): + return expand(pattern, sample='{sample}', n=c.n) + +def render_r1_only(pattern): + return expand(pattern, sample='{sample}', n=1) + def resolve_name(name): """ @@ -30,14 +36,14 @@ def resolve_name(name): parts_copy = parts[:] while parts_copy: try: - module = __import__(".".join(parts_copy)) + module_ = __import__(".".join(parts_copy)) break except ImportError: del parts_copy[-1] if not parts_copy: raise parts = parts[1:] - obj = module + obj = module_ for part in parts: obj = getattr(obj, part) return obj @@ -559,10 +565,10 @@ def detect_layout(sampletable): p = sampletable.iloc[is_pe, 0].to_list() s = sampletable.iloc[[not i for i in is_pe], 0].to_list() if len(p) > len(s): - report = f"SE samples: {s}" + report_ = f"SE samples: {s}" else: - report = f"PE samples: {p}" - raise ValueError(f"Only a single layout (SE or PE) is supported. {report}") + report_ = f"PE samples: {p}" + raise ValueError(f"Only a single layout (SE or PE) is supported. 
{report_}") def fill_patterns(patterns, fill, combination=product): @@ -1184,6 +1190,6 @@ def wrapper_for(path): return 'file:' + os.path.join('../..','wrappers', 'wrappers', path) def detect_sra(sampletable): - return 'Run' in self.sampletable.columns and any(self.sampletable['Run'].str.startswith('SRR')) + return 'Run' in sampletable.columns and any(sampletable['Run'].str.startswith('SRR')) # vim: ft=python From 1b62efc9ab904e2106eeaca4990c57befb597bed Mon Sep 17 00:00:00 2001 From: Ryan Dale Date: Mon, 6 Jan 2025 19:28:44 -0500 Subject: [PATCH 037/196] rm libsizes table from multiqc --- workflows/rnaseq/config/multiqc_config.yaml | 1 - 1 file changed, 1 deletion(-) diff --git a/workflows/rnaseq/config/multiqc_config.yaml b/workflows/rnaseq/config/multiqc_config.yaml index 3e291495..0fe650a7 100644 --- a/workflows/rnaseq/config/multiqc_config.yaml +++ b/workflows/rnaseq/config/multiqc_config.yaml @@ -53,7 +53,6 @@ module_order: - '*.cutadapt.fastq.gz_fastqc.zip' path_filters: - '*.fastq.gz_fastqc.zip' - - libsizes_table - rrna_percentages_table - cutadapt - fastqc: From 16d8489d66e052c11930bacbfb741639a9858565 Mon Sep 17 00:00:00 2001 From: Ryan Dale Date: Mon, 6 Jan 2025 19:30:07 -0500 Subject: [PATCH 038/196] use patterns --- workflows/rnaseq/Snakefile | 354 ++++++++++++++++--------------------- 1 file changed, 156 insertions(+), 198 deletions(-) diff --git a/workflows/rnaseq/Snakefile b/workflows/rnaseq/Snakefile index 84b9bdd4..1c041d22 100644 --- a/workflows/rnaseq/Snakefile +++ b/workflows/rnaseq/Snakefile @@ -1,23 +1,27 @@ +import sys import os import yaml import tempfile import pandas as pd +sys.path.insert(0, os.path.dirname(workflow.snakefile) + "/../..") +from lib import utils +from lib.utils import autobump, gb, hours + + configfile: 'config/config.yaml' include: '../../rules/references.smk' -include: '../../rules/common.smk' REFERENCES = config.get('reference_dir', '../../references') - sampletable = pd.read_table(config["sampletable"], sep="\t") 
-_st = c.sampletable.set_index(c.sampletable.columns[0]) -is_paired = detect_layout(sampletable) == "PE" -is_sra = detect_sra(sampletable) +sampletable = sampletable.set_index(sampletable.columns[0], drop=False) +is_paired = utils.detect_layout(sampletable) == "PE" +is_sra = utils.detect_sra(sampletable) n = ["1", "2"] if is_paired else ["1"] SAMPLES = sampletable.iloc[:, 0].values +patterns = yaml.safe_load(open('config/rnaseq_patterns.yaml')) -# TODO: moved utils.py over to common.smk; not sure if this means that postprocessing will fail or not... wildcard_constraints: n = '[1,2]', @@ -27,28 +31,18 @@ localrules: symlinks, symlink_targets rule all: input: - 'data/rnaseq_aggregation/multiqc.html', + patterns["multiqc"] if is_sra: include: '../../rules/sra.smk' -def orig_for_sample(wc): - """ - Given a sample, returns either one or two original fastq files - depending on whether the library was single- or paired-end. - """ - if is_paired: - return _st.loc[wc.sample, ['orig_filename', 'orig_filename_R2']] - return _st.loc[wc.sample, ['orig_filename']] - - rule symlinks: input: - lambda wc: _st.loc[wc.sample, ['orig_filename', 'orig_filename_R2']] if is_paired - else _st.loc[wc.sample, ['orig_filename']] + lambda wc: sampletable.loc[wc.sample, ['orig_filename', 'orig_filename_R2']] if is_paired + else sampletable.loc[wc.sample, ['orig_filename']] output: - expand('data/rnaseq_samples/{sample}/{sample}_R{n}.fastq.gz', sample=SAMPLES, n=n) + expand(patterns["fastq"], n=n, allow_missing=True) threads: 1 resources: mem_mb=100, @@ -56,23 +50,26 @@ rule symlinks: run: assert len(output) == len(input), (input, output) for src, linkname in zip(input, output): - make_relative_symlink(src, linkname) + utils.make_relative_symlink(src, linkname) rule symlink_targets: - input: c.targets['fastq'] + input: + expand('data/rnaseq_samples/{sample}/{sample}_R{n}.fastq.gz', sample=SAMPLES, n=n) # This can be set at the command line with --config strand_check_reads=1000 
config.setdefault('strand_check_reads', 1e5) -include: '../../rules/strand_check.smk' + +# TODO: re-enable +# include: '../../rules/strand_check.smk' rule cutadapt: input: - fastq=expand('data/rnaseq_samples/{sample}/{sample}_R{n}.fastq.gz', sample=SAMPLES, n=n) + fastq=expand(patterns["fastq"], n=n, allow_missing=True) output: - fastq=expand('data/rnaseq_samples/{sample}/{sample}_R{n}.cutadapt.fastq.gz', sample=SAMPLES, n=n) + fastq=expand(patterns["cutadapt"], n=n, allow_missing=True) log: - 'data/rnaseq_samples/{sample}/{sample}_R1.fastq.gz' + 'data/rnaseq_samples/{sample}/{sample}_cutadapt.fastq.gz.log' threads: 6 resources: mem_mb=gb(2), @@ -85,7 +82,7 @@ rule cutadapt: "-a AGATCGGAAGAGCACACGTCTGAACTCCAGTCA " ) + "-A AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGT " if is_paired else "" run: - if c.is_paired: + if is_paired: shell( "cutadapt " "-o {output[0]} " @@ -119,24 +116,18 @@ rule fastqc: mem_mb=gb(8), runtime=autobump(hours=2) script: - wrapper_for('fastqc/wrapper.py') + utils.wrapper_for('fastqc/wrapper.py') if config['aligner'] == 'hisat2': rule hisat2: input: - # TODO: make sure this works - fastq=( - 'data/rnaseq_samples/{sample}/{sample}_R1.fastq.gz', - 'data/rnaseq_samples/{sample}/{sample}_R2.fastq.gz', - ) if is_paired else ( - 'data/rnaseq_samples/{sample}/{sample}_R1.fastq.gz', - ), + fastq=expand(patterns["cutadapt"], n=n, allow_missing=True), index=rules.hisat2_index.output, output: - bam=temporary(c.patterns['bam']) + bam=temporary(patterns['bam']) log: - c.patterns['bam'] + '.log' + patterns['bam'] + '.log' threads: 6 resources: mem_mb=gb(32), @@ -144,12 +135,10 @@ if config['aligner'] == 'hisat2': params: extra="" run: - prefix = os.path.commonprefix(input.index).rstrip(".") sam = output.bam.replace('.bam', '.sam') - - if c.is_paired: + if is_paired: assert len(input.fastq) == 2 fastqs = '-1 {0} -2 {1} '.format(*input.fastq) else: @@ -172,6 +161,8 @@ if config['aligner'] == 'hisat2': "&& rm {sam}" ) +# TODO: star has lots of rules. 
Better to be in rules/aligner.smk? + if config['aligner'].startswith('star'): # STAR can be run in 1-pass or 2-pass modes. Since we may be running it @@ -210,14 +201,14 @@ if config['aligner'] == 'star': Align with STAR (1-pass mode) """ input: - fastq=fill_r1_r2(c.sampletable, c.patterns['cutadapt']), + fastq=expand(patterns["cutadapt"], n=n, allow_missing=True), index=rules.star_index.output, - annotation=REFERENCES + "/annotation.gtf" + annotation=f"{REFERENCES}/annotation.gtf" output: - bam=temporary(c.patterns['bam']), - sjout=temporary(c.patterns['bam'].replace('.bam', '.star.SJ.out.tab')), + bam=temporary(patterns['bam']), + sjout=temporary(patterns['bam'].replace('.bam', '.star.SJ.out.tab')), log: - c.patterns['bam'].replace('.bam', '.star.bam.log') + patterns['bam'].replace('.bam', '.star.bam.log') threads: 16 resources: mem_mb=gb(64), @@ -249,13 +240,13 @@ if config['aligner'] == 'star-twopass': First pass of alignment with STAR to get the junctions """ input: - fastq=fill_r1_r2(c.sampletable, c.patterns['cutadapt']), + fastq=expand(patterns["cutadapt"], n=n, allow_missing=True), index=rules.star_index.output, - annotation=REFERENCES + "/annotation.gtf" + annotation=f"{REFERENCES}/annotation.gtf" output: - sjout=temporary(c.patterns['bam'].replace('.bam', '.star-pass1.SJ.out.tab')), + sjout=temporary(patterns['bam'].replace('.bam', '.star-pass1.SJ.out.tab')), log: - c.patterns['bam'].replace('.bam', '.star-pass1.bam.log') + patterns['bam'].replace('.bam', '.star-pass1.bam.log') threads: 16 resources: mem_mb=gb(64), @@ -289,15 +280,15 @@ if config['aligner'] == 'star-twopass': samples to get the final BAM """ input: - sjout=expand(c.patterns['bam'].replace('.bam', '.star-pass1.SJ.out.tab'), sample=SAMPLES), - fastq=fill_r1_r2(c.sampletable, c.patterns['cutadapt']), + fastq=expand(patterns["cutadapt"], n=n, allow_missing=True), index=rules.star_index.output, - annotation=REFERENCES + "/annotation.gtf" + annotation=f"{REFERENCES}/annotation.gtf", + 
sjout=expand(patterns['bam'].replace('.bam', '.star-pass1.SJ.out.tab'), sample=SAMPLES), output: - bam=temporary(c.patterns['bam']), - sjout=temporary(c.patterns['bam'].replace('.bam', '.star-pass2.SJ.out.tab')), + bam=temporary(patterns['bam']), + sjout=temporary(patterns['bam'].replace('.bam', '.star-pass2.SJ.out.tab')), log: - c.patterns['bam'].replace('.bam', '.star-pass2.bam.log') + patterns['bam'].replace('.bam', '.star-pass2.bam.log') threads: 16 resources: mem_mb=gb(64), @@ -332,9 +323,9 @@ if config['aligner'] == 'star-twopass': rule rRNA: input: - fastq=render_r1_only(c.patterns['cutadapt']), + fastq=expand(patterns["cutadapt"], n=1, allow_missing=True), # currently only R1 index=multiext( - REFERENCES + "/bowtie2/rrna", + f"{REFERENCES}/bowtie2/rrna", ".1.bt2", ".2.bt2", ".3.bt2", @@ -344,9 +335,9 @@ rule rRNA: ".fa", ), output: - bam=temporary(c.patterns['rrna']['bam']) + bam=temporary(patterns['rrna']['bam']) log: - c.patterns['rrna']['bam'] + '.log' + patterns['rrna']['bam'] + '.log' threads: 6 resources: mem_mb=gb(2), @@ -356,7 +347,6 @@ rule rRNA: '-k 1 ' # NOTE: we only care if >=1 mapped '--no-unal ' # NOTE: suppress unaligned reads ) - run: prefix = os.path.commonprefix(input.index).rstrip(".") sam = output.bam.replace('.bam', '.sam') @@ -366,8 +356,8 @@ rule rRNA: "-x {prefix} " "-U {input.fastq} " "--threads {threads} " - "-S {sam} " "{params.extra} " + "-S {sam} " "> {log} 2>&1" ) @@ -417,13 +407,14 @@ rule bam_index: 'samtools index {input} {output}' +# TODO: split into multiple featurecounts runs, since PE needs to be sorted each time. 
rule featurecounts: """ Count reads in annotations with featureCounts from the subread package """ input: annotation=rules.gtf.output, - bam=c.targets['markduplicates']['bam'] + bam=expand(patterns['markduplicates']['bam'], sample=SAMPLES), output: counts='{sample_dir}/rnaseq_aggregation/featurecounts.txt' log: @@ -442,7 +433,7 @@ rule featurecounts: run: # NOTE: By default, we use -p for paired-end p_arg = '' - if c.is_paired: + if is_paired: p_arg = '-p --countReadPairs ' shell( 'featureCounts ' @@ -461,21 +452,21 @@ rule rrna_libsizes_table: Aggregate rRNA counts into a table """ input: - rrna=c.targets['rrna']['libsize'], - fastq=c.targets['libsizes']['cutadapt'] + rrna=expand(patterns['rrna']['libsize'], sample=SAMPLES), + fastq=expand(patterns['libsizes']['cutadapt'], sample=SAMPLES), output: - json=c.patterns['rrna_percentages_yaml'], - tsv=c.patterns['rrna_percentages_table'] + json=patterns['rrna_percentages_yaml'], + tsv=patterns['rrna_percentages_table'] threads: 1 resources: mem_mb=gb(2), runtime=autobump(hours=2) run: def rrna_sample(f): - return extract_wildcards(c.patterns['rrna']['libsize'], f)['sample'] + return utils.extract_wildcards(patterns['rrna']['libsize'], f)['sample'] def sample(f): - return extract_wildcards(c.patterns['libsizes']['cutadapt'], f)['sample'] + return utils.extract_wildcards(patterns['libsizes']['cutadapt'], f)['sample'] def million(f): return float(open(f).read()) / 1e6 @@ -514,30 +505,21 @@ rule rrna_libsizes_table: rule multiqc: input: files=( - expand('data/rnaseq_samples/{sample}/{sample}_R{n}.fastq.gz', sample=SAMPLES, n=n), - expand( - 'data/rnaseq_samples/{sample}/fastqc/{sample}_R1{kind}.fastq.gz_fastqc.zip', - sample=SAMPLES, kind=["", ".cutadapt", ".cutadapt.bam"] - ), - expand( - 'data/rnaseq_samples/{sample}/{sample}.cutadapt.markdups{ext}', - sample=SAMPLES, ext=['.bam', '.bam.bai'] - ), - expand( - 'data/rnaseq_samples/{sample}/{sample}.salmon/quant.sf', - sample=SAMPLES - ), - 
expand('data/rnaseq_samples/{sample}/{sample}.kallisto/abundance.h5', sample=SAMPLES), - expand('data/rnaseq_samples/{sample}/{sample}_preseq_c_curve.txt', sample=SAMPLES), - expand('data/rnaseq_samples/{sample}/rseqc/{sample}_infer_experiment.txt', sample=SAMPLES), - expand('data/rnaseq_samples/{sample}/rseqc/{sample}_read_distribution.txt', sample=SAMPLES), - expand('data/rnaseq_samples/{sample}/{sample}.collectrnaseqmetrics.metrics', sample=SAMPLES), - expand('data/rnaseq_samples/{sample}/{sample}.cutadapt.bam.pos.bigwig', sample=SAMPLES, dir=["pos", "neg"]), - expand('data/rnaseq_samples/{sample}/idxstat_{sample}.txt', sample=SAMPLES), - expand('data/rnaseq_samples/{sample}/{sample}.cutadapt.markdups.bam.flagstat', sample=SAMPLES), - expand('data/rnaseq_samples/{sample}/{sample}.cutadapt.markdups.bam.stats', sample=SAMPLES), - 'data/rnaseq_aggregation/rrna_percentages_table.tsv', - 'data/rnaseq_aggregation/featurecounts.txt', + expand(patterns["fastqc"]["raw"], sample=SAMPLES), + expand(patterns["fastqc"]["cutadapt"], sample=SAMPLES), + expand(patterns["fastqc"]["bam"], sample=SAMPLES), + expand(patterns["markduplicates"]["bam"], sample=SAMPLES), + expand(patterns["salmon"], sample=SAMPLES), + expand(patterns["kallisto"], sample=SAMPLES), + expand(patterns["preseq"], sample=SAMPLES), + expand(patterns["rseqc"]["infer_experiment"], sample=SAMPLES), + expand(patterns["rseqc"]["read_distribution"], sample=SAMPLES), + expand(patterns["collectrnaseqmetrics"]["metrics"], sample=SAMPLES), + expand(patterns["samtools"]["idxstats"], sample=SAMPLES), + expand(patterns["samtools"]["flagstat"], sample=SAMPLES), + expand(patterns["samtools"]["stats"], sample=SAMPLES), + patterns["rrna_percentages_table"], + patterns["featurecounts"], ), config='config/multiqc_config.yaml' output: @@ -550,8 +532,8 @@ rule multiqc: runtime=autobump(hours=2) run: analysis_directory = set([os.path.dirname(i) for i in input]) - outdir = os.path.dirname(c.targets['multiqc'][0]) - basename = 
os.path.basename(c.targets['multiqc'][0]) + outdir = os.path.dirname(output[0]) + basename = os.path.basename(output[0]) shell( 'LC_ALL=en_US.utf8 LC_LANG=en_US.utf8 ' 'multiqc ' @@ -570,16 +552,13 @@ rule markduplicates: Mark or remove PCR duplicates with Picard MarkDuplicates """ input: - bam=c.patterns['bam'] + bam=patterns['bam'] output: - bam=c.patterns['markduplicates']['bam'], - metrics=c.patterns['markduplicates']['metrics'] + bam=patterns['markduplicates']['bam'], + metrics=patterns['markduplicates']['metrics'], log: - c.patterns['markduplicates']['bam'] + '.log' + patterns['markduplicates']['bam'] + '.log' params: - # NOTE: Be careful with the memory here; make sure you have enough - # and/or it matches the resources you're requesting in the cluster - # config. java_args='-Xmx20g' # java_args='-Xmx2g' # [TEST SETTINGS -1] threads: 1 @@ -603,23 +582,20 @@ rule collectrnaseqmetrics: Calculate various RNA-seq QC metrics with Picarc CollectRnaSeqMetrics """ input: - bam=c.patterns['markduplicates']['bam'], + bam=patterns['markduplicates']['bam'], refflat=rules.conversion_refflat.output, output: - metrics=c.patterns['collectrnaseqmetrics']['metrics'], + metrics=patterns['collectrnaseqmetrics']['metrics'], params: - # NOTE: Be careful with the memory here; make sure you have enough - # and/or it matches the resources you're requesting in the cluster - # config. 
java_args='-Xmx20g', # java_args='-Xmx2g', # [TEST SETTINGS -1] strand_arg={ 'unstranded': 'STRAND=NONE ', 'fr-firststrand': 'STRAND=SECOND_READ_TRANSCRIPTION_STRAND ', 'fr-secondstrand': 'STRAND=FIRST_READ_TRANSCRIPTION_STRAND ', - }[config["stranded"] + }[config["stranded"]] log: - c.patterns['collectrnaseqmetrics']['metrics'] + '.log' + patterns['collectrnaseqmetrics']['metrics'] + '.log' threads: 1 resources: mem_mb=gb(32), @@ -643,9 +619,9 @@ rule preseq: Compute a library complexity curve with preseq """ input: - bam=c.patterns['bam'] + bam=patterns['bam'] output: - c.patterns['preseq'] + patterns['preseq'] threads: 1 resources: mem_mb=gb(1), @@ -662,38 +638,36 @@ rule salmon: Quantify reads coming from transcripts with Salmon """ input: - fastq=fill_r1_r2(c.sampletable, c.patterns['cutadapt']), + fastq=expand(patterns["cutadapt"], n=n, allow_missing=True), index=REFERENCES + "/salmon/versionInfo.json" output: - c.patterns['salmon'] - params: - index_dir=os.path.dirname(REFERENCES + "/salmon/versionInfo.json"), - outdir=os.path.dirname(c.patterns['salmon']) + patterns['salmon'] log: - c.patterns['salmon'] + '.log' + patterns['salmon'] + '.log' + params: + extra=( + "--libType=A " + "--gcBias " + "--seqBias " + "--validateMappings " + ) threads: 6 resources: mem_mb=gb(32), runtime=autobump(hours=2) run: - if c.is_paired: + outdir = os.path.dirname(output[0]) + index_dir = os.path.dirname(input.index) + if is_paired: fastq_arg = f'-1 {input.fastq[0]} -2 {input.fastq[1]} ' else: fastq_arg = f'-r {input.fastq} ' shell( 'salmon quant ' - '--index {params.index_dir} ' - '--output {params.outdir} ' + '--index {index_dir} ' + '--output {outdir} ' '--threads {threads} ' - - # NOTE: --libType=A auto-detects library type. Change if needed. 
- '--libType=A ' - - # NOTE: Docs suggest using --gcBias, --validateMappings, and - # --seqBias is a good idea - '--gcBias ' - '--seqBias ' - '--validateMappings ' + '{params.extra} ' '{fastq_arg} ' '&> {log}' ) @@ -704,43 +678,38 @@ rule kallisto: Quantify reads coming from transcripts with Kallisto """ input: - fastq=fill_r1_r2(c.sampletable, c.patterns['cutadapt']), + fastq=expand(patterns["cutadapt"], n=n, allow_missing=True), index=REFERENCES + "/kallisto/transcripts.idx", output: - c.patterns['kallisto'] + patterns['kallisto'] params: - index_dir=os.path.dirname(REFERENCES + "/kallisto/transcripts.idx"), - outdir=os.path.dirname(c.patterns['kallisto']), strand_arg={ 'unstranded': '', 'fr-firststrand': '--rf-stranded', 'fr-secondstrand': '--fr-stranded', - }[config["stranded"] + }[config["stranded"]], + extra=( + "--bootstrap-samples 100" if is_paired else + "--single --fragment-length 300 --sd 20 --bootstrap-samples 100" + ), log: - c.patterns['kallisto'] + '.log' + patterns['kallisto'] + '.log' threads: 8 resources: mem_mb=gb(32), runtime=autobump(hours=2), run: - if c.is_paired: - se_args = '' - assert len(input.fastq) == 2 - else: - # For single-end, add the experimentally-determined fragment length - # and standard deviation here - se_args = '--single --fragment-length 300 --sd 20 ' - assert len(input.fastq) == 1 + outdir = os.path.dirname(output[0]) shell( 'kallisto quant ' '--index {input.index} ' - '--output-dir {params.outdir} ' + '--output-dir {outdir} ' '--threads {threads} ' '--bootstrap-samples 100 ' '--threads {threads} ' - '{se_args} ' '{params.strand_arg} ' + '{params.extra} ' '{input.fastq} ' '&> {log}' ) @@ -750,30 +719,30 @@ rule rseqc_infer_experiment: Infer strandedness of experiment """ input: - bam=c.patterns['markduplicates']['bam'], + bam=patterns['markduplicates']['bam'], bed12=rules.conversion_bed12.output, output: - txt=c.patterns['rseqc']['infer_experiment'] + txt=patterns['rseqc']['infer_experiment'] log: - 
c.patterns['rseqc']['infer_experiment'] + '.log' + patterns['rseqc']['infer_experiment'] + '.log' resources: mem_mb=gb(2), runtime=autobump(hours=2) - shell: 'infer_experiment.py -r {input.bed12} -i {input.bam} > {output} &> {log}' + rule rseqc_read_distribution: """ read distribution plots """ input: - bam=c.patterns['markduplicates']['bam'], + bam=patterns['markduplicates']['bam'], bed12=rules.conversion_bed12.output, output: - txt=c.patterns['rseqc']['read_distribution'] + txt=patterns['rseqc']['read_distribution'] log: - c.patterns['rseqc']['read_distribution'] + '.log' + patterns['rseqc']['read_distribution'] + '.log' resources: mem_mb=gb(2), runtime=autobump(hours=2) @@ -786,12 +755,12 @@ rule idxstats: Run samtools idxstats on sample bams """ input: - bam=c.patterns['markduplicates']['bam'], - bai=c.patterns['markduplicates']['bam'] + '.bai' + bam=patterns['markduplicates']['bam'], + bai=patterns['markduplicates']['bam'] + '.bai' output: - txt=c.patterns['samtools']['idxstats'] + txt=patterns['samtools']['idxstats'] log: - c.patterns['samtools']['idxstats'] + '.log' + patterns['samtools']['idxstats'] + '.log' resources: mem_mb=gb(16), runtime=autobump(hours=2) @@ -801,40 +770,39 @@ rule idxstats: ) -# Common arguments used for bamCoverage rules below -BAMCOVERAGE_ARGS = ( - '--minMappingQuality 20 ' # excludes multimappers - '--smoothLength 10 ' # smooth signal with specified window - # '--normalizeUsing BPM ' # equivalent to TPM # [TEST SETTINGS] -) - rule bigwig_neg: """ Create a bigwig for negative-strand reads """ input: - bam=c.patterns['markduplicates']['bam'], - bai=c.patterns['markduplicates']['bam'] + '.bai', - output: c.patterns['bigwig']['neg'] + bam=patterns['markduplicates']['bam'], + bai=patterns['markduplicates']['bam'] + '.bai', + output: + patterns['bigwig']['neg'] threads: 8 resources: mem_mb=gb(16), runtime=autobump(hours=2) log: - c.patterns['bigwig']['neg'] + '.log' + patterns['bigwig']['neg'] + '.log' params: strand_arg = { 
'unstranded': '', 'fr-firststrand': '--filterRNAstrand reverse ', 'fr-secondstrand': '--filterRNAstrand forward ', - }[config["stranded"] + }[config["stranded"]], + extra=( + '--minMappingQuality 20 ' + '--smoothLength 10 ' + '--normalizeUsing BPM ' # equivalent to TPM # [TEST SETTINGS] + ), run: shell( 'bamCoverage ' '--bam {input.bam} ' '-o {output} ' '-p {threads} ' - '{BAMCOVERAGE_ARGS} ' + '{params.extra} ' '{params.strand_arg} ' '&> {log}' ) @@ -845,21 +813,27 @@ rule bigwig_pos: Create a bigwig for postive-strand reads. """ input: - bam=c.patterns['markduplicates']['bam'], - bai=c.patterns['markduplicates']['bam'] + '.bai', - output: c.patterns['bigwig']['pos'] + bam=patterns['markduplicates']['bam'], + bai=patterns['markduplicates']['bam'] + '.bai', + output: + patterns['bigwig']['pos'] threads: 8 resources: mem_mb=gb(16), runtime=autobump(hours=2) log: - c.patterns['bigwig']['pos'] + '.log' + patterns['bigwig']['pos'] + '.log' params: strand_arg={ 'unstranded': '', 'fr-firststrand': '--filterRNAstrand forward ', 'fr-secondstrand': '--filterRNAstrand reverse ', - }[config["stranded"] + }[config["stranded"]], + extra=( + '--minMappingQuality 20 ' + '--smoothLength 10 ' + '--normalizeUsing BPM ' # equivalent to TPM # [TEST SETTINGS] + ), run: shell( 'bamCoverage ' @@ -872,43 +846,27 @@ rule bigwig_pos: ) -def bigwigs_to_merge(wc): - chunk = config['merged_bigwigs'][wc.merged_bigwig_label] - neg_labels = chunk.get('neg', []) - pos_labels = chunk.get('pos', []) - pos_bigwigs = expand( - c.patterns['bigwig']['pos'], - sample=pos_labels - ) - neg_bigwigs = expand( - c.patterns['bigwig']['neg'], - sample=neg_labels) - return pos_bigwigs + neg_bigwigs - - rule flagstat: input: - bam=c.patterns['markduplicates']['bam'], - bai=c.patterns['markduplicates']['bam'] + '.bai' + bam=patterns['markduplicates']['bam'], + bai=patterns['markduplicates']['bam'] + '.bai' output: - c.patterns['samtools']['flagstat'] + patterns['samtools']['flagstat'] log: - 
c.patterns['samtools']['flagstat'] + '.log' + patterns['samtools']['flagstat'] + '.log' shell: 'samtools flagstat {input.bam} > {output}' rule samtools_stats: input: - bam=c.patterns['markduplicates']['bam'], - bai=c.patterns['markduplicates']['bam'] + '.bai' + bam=patterns['markduplicates']['bam'], + bai=patterns['markduplicates']['bam'] + '.bai' output: - c.patterns['samtools']['stats'] + patterns['samtools']['stats'] log: - c.patterns['samtools']['stats'] + '.log' + patterns['samtools']['stats'] + '.log' shell: 'samtools stats {input.bam} > {output}' - - # vim: ft=python From d6b512aa4c70d262567ca9729a0b0c76f6ca1a96 Mon Sep 17 00:00:00 2001 From: Ryan Dale Date: Mon, 6 Jan 2025 22:06:57 -0500 Subject: [PATCH 039/196] mv back to workflows/references/Snakefile --- rules/references.smk => workflows/references/Snakefile | 0 workflows/rnaseq/Snakefile | 2 +- 2 files changed, 1 insertion(+), 1 deletion(-) rename rules/references.smk => workflows/references/Snakefile (100%) diff --git a/rules/references.smk b/workflows/references/Snakefile similarity index 100% rename from rules/references.smk rename to workflows/references/Snakefile diff --git a/workflows/rnaseq/Snakefile b/workflows/rnaseq/Snakefile index 1c041d22..f5408301 100644 --- a/workflows/rnaseq/Snakefile +++ b/workflows/rnaseq/Snakefile @@ -11,7 +11,7 @@ from lib.utils import autobump, gb, hours configfile: 'config/config.yaml' -include: '../../rules/references.smk' +include: '../references/Snakefile' REFERENCES = config.get('reference_dir', '../../references') sampletable = pd.read_table(config["sampletable"], sep="\t") From 99f3f6b63dead9a8e4eae73968ca8843dc4a5295 Mon Sep 17 00:00:00 2001 From: Ryan Dale Date: Mon, 6 Jan 2025 22:07:33 -0500 Subject: [PATCH 040/196] fix params for bigwig --- workflows/rnaseq/Snakefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/rnaseq/Snakefile b/workflows/rnaseq/Snakefile index f5408301..62d2788b 100644 --- a/workflows/rnaseq/Snakefile +++ 
b/workflows/rnaseq/Snakefile @@ -840,7 +840,7 @@ rule bigwig_pos: '--bam {input.bam} ' '-o {output} ' '-p {threads} ' - '{BAMCOVERAGE_ARGS} ' + '{params.extra} ' '{params.strand_arg} ' '&> {log}' ) From bf276c3a7af8ef1510fb5f2f9efdc8f190fbd9ad Mon Sep 17 00:00:00 2001 From: Ryan Dale Date: Mon, 6 Jan 2025 22:07:53 -0500 Subject: [PATCH 041/196] use slightly cleaner syntax --- workflows/references/Snakefile | 79 +++++++++++++++++----------------- 1 file changed, 39 insertions(+), 40 deletions(-) diff --git a/workflows/references/Snakefile b/workflows/references/Snakefile index 157eeed9..bf4cf212 100644 --- a/workflows/references/Snakefile +++ b/workflows/references/Snakefile @@ -2,8 +2,7 @@ import os import sys import pandas -HERE = str(Path(workflow.snakefile).parent) -sys.path.insert(0, HERE + "/../..") +sys.path.insert(0, os.path.dirname(workflow.snakefile) + "/../..") from lib.utils import autobump, gb, hours from lib import utils @@ -14,9 +13,9 @@ def default_postprocess(origfn, newfn): rule fasta: output: - temporary(REFERENCES + '/genome.fa.gz') + temporary(f'{REFERENCES}/genome.fa.gz') log: - REFERENCES + "/logs/genome.fa.gz.log" + f"{REFERENCES}/logs/genome.fa.gz.log" run: utils.download_and_postprocess( urls=config['fasta']['url'], @@ -28,9 +27,9 @@ rule fasta: rule gtf: output: - temporary(REFERENCES + '/annotation.gtf.gz') + temporary(f"{REFERENCES}/annotation.gtf.gz") log: - REFERENCES + "/logs/annotation.gtf.gz.log" + f"{REFERENCES}/logs/annotation.gtf.gz.log" run: utils.download_and_postprocess( urls=config['gtf']['url'], @@ -42,9 +41,9 @@ rule gtf: rule rrna: output: - temporary(REFERENCES + '/rrna.fa.gz') + temporary(f"{REFERENCES}/rrna.fa.gz") log: - REFERENCES + "/logs/rrna.fa.gz.log" + f"{REFERENCES}/logs/rrna.fa.gz.log" run: utils.download_and_postprocess( urls=config['rrna']['url'], @@ -56,18 +55,18 @@ rule rrna: rule unzip: input: - REFERENCES + '/{prefix}.gz' + f"{REFERENCES}/{prefix}.gz" output: - REFERENCES + '/{prefix}' + 
f"{REFERENCES}/{prefix}" shell: 'gunzip -c {input} > {output}' rule bowtie2_index: input: - REFERENCES + '/{label}.fa', + f"{REFERENCES}/{label}.fa", output: multiext( - REFERENCES + '/bowtie2/{label}', + f"{REFERENCES}/bowtie2/{label}", ".1.bt2", ".2.bt2", ".3.bt2", @@ -77,7 +76,7 @@ rule bowtie2_index: ".fa", ), log: - REFERENCES + '/logs/bowtie2_{label}.log' + f"{REFERENCES}/logs/bowtie2_{label}.log' resources: runtime=autobump(hours=8), mem_mb=autobump(gb=32), @@ -98,12 +97,12 @@ rule bowtie2_index: rule star_index: input: - fasta=REFERENCES + '/genome.fa', - gtf=REFERENCES + '/annotation.gtf', + fasta=f"{REFERENCES}/genome.fa", + gtf=f"{REFERENCES}/annotation.gtf", output: - REFERENCES + '/star/Genome' + f"{REFERENCES}/star/Genome" log: - REFERENCES + '/logs/star.log' + f"{REFERENCES}/logs/star.log" threads: 8 resources: @@ -139,10 +138,10 @@ rule star_index: rule hisat2_index: input: - REFERENCES + '/genome.fa', + f"{REFERENCES}/genome.fa", output: multiext( - REFERENCES + '/hisat2/genome', + f"{REFERENCES}/hisat2/genome", ".1.ht2", ".2.ht2", ".3.ht2", @@ -154,7 +153,7 @@ rule hisat2_index: ".fa", ) log: - REFERENCES + '/logs/hisat2.log' + f"{REFERENCES}/logs/hisat2.log" resources: runtime=autobump(hours=8), mem_mb=autobump(gb=32), @@ -176,10 +175,10 @@ rule hisat2_index: rule transcriptome_fasta: input: - fasta=REFERENCES + '/genome.fa', - gtf=REFERENCES + '/annotation.gtf', + fasta=f"{REFERENCES}/genome.fa", + gtf=f"{REFERENCES}/annotation.gtf", output: - REFERENCES + '/transcriptome.fa' + f"{REFERENCES}/transcriptome.fa" resources: runtime=hours(1) shell: @@ -188,13 +187,13 @@ rule transcriptome_fasta: rule salmon_index: input: - REFERENCES + '/transcriptome.fa' + f"{REFERENCES}/transcriptome.fa" output: - REFERENCES + '/salmon/versionInfo.json' + f"{REFERENCES}/salmon/versionInfo.json" log: - REFERENCES + '/logs/salmon.log' + f"{REFERENCES}/logs/salmon.log" params: - outdir=REFERENCES + '/salmon' + outdir=f"{REFERENCES}/salmon" resources: mem_mb=gb(32), 
runtime=hours(2) @@ -210,11 +209,11 @@ rule salmon_index: rule kallisto_index: output: - REFERENCES + '/kallisto/transcripts.idx', + f"{REFERENCES}/kallisto/transcripts.idx", input: - REFERENCES + '/genome.fa' + f"{REFERENCES}/genome.fa" log: - REFERENCES + '/logs/kallisto.log' + f"{REFERENCES}/logs/kallisto.log" resources: runtime=hours(2), mem_mb=gb(32), @@ -227,11 +226,11 @@ rule kallisto_index: rule conversion_refflat: input: - REFERENCES + '/annotation.gtf' + f"{REFERENCES}/annotation.gtf" output: - REFERENCES + '/annotation.refflat' + f"{REFERENCES}/annotation.refflat" log: - REFERENCES + '/logs/annotation.refflat.log' + f"{REFERENCES}/logs/annotation.refflat.log" resources: runtime=hours(2), mem_mb=gb(2) @@ -243,9 +242,9 @@ rule conversion_refflat: rule conversion_bed12: input: - REFERENCES + '/annotation.gtf' + f"{REFERENCES}/annotation.gtf" output: - REFERENCES + '/annotation.bed12' + f"{REFERENCES}/annotation.bed12" resources: runtime=hours(2), mem_mb=gb(2) @@ -257,11 +256,11 @@ rule conversion_bed12: rule chromsizes: input: - REFERENCES + '/genome.fa' + f"{REFERENCES}/genome.fa" output: - REFERENCES + '/genome.chromsizes' + f"{REFERENCES}/genome.chromsizes" log: - REFERENCES + '/logs/genome.chromsizes.log' + f"{REFERENCES}/logs/genome.chromsizes.log" params: # NOTE: Be careful with the memory here; make sure you have enough # and/or it matches the resources you're requesting @@ -288,9 +287,9 @@ rule mappings: Creates gzipped TSV mapping between attributes in the GTF. 
""" input: - gtf=REFERENCES + '/annotation.gtf' + gtf=f"{REFERENCES}/annotation.gtf" output: - REFERENCES + '/annotation.mapping.tsv.gz' + f"{REFERENCES}/annotation.mapping.tsv.gz" params: include_featuretypes=lambda wildcards, output: conversion_kwargs[output[0]].get('include_featuretypes', []) resources: From 4328264119571baefb5ce17518670629fe5c6b8a Mon Sep 17 00:00:00 2001 From: Ryan Dale Date: Mon, 6 Jan 2025 22:12:33 -0500 Subject: [PATCH 042/196] always put params directly before run/shell --- workflows/rnaseq/Snakefile | 44 ++++++++++++++++++-------------------- 1 file changed, 21 insertions(+), 23 deletions(-) diff --git a/workflows/rnaseq/Snakefile b/workflows/rnaseq/Snakefile index 62d2788b..5e37daf1 100644 --- a/workflows/rnaseq/Snakefile +++ b/workflows/rnaseq/Snakefile @@ -446,7 +446,7 @@ rule featurecounts: '&> {log}' ) - +# TODO: port some of this over to utils, or maybe script. rule rrna_libsizes_table: """ Aggregate rRNA counts into a table @@ -558,14 +558,14 @@ rule markduplicates: metrics=patterns['markduplicates']['metrics'], log: patterns['markduplicates']['bam'] + '.log' - params: - java_args='-Xmx20g' - # java_args='-Xmx2g' # [TEST SETTINGS -1] threads: 1 resources: mem_mb=gb(32), runtime=autobump(hours=2), disk_mb=autobump(gb=100), + params: + java_args='-Xmx20g' + # java_args='-Xmx2g' # [TEST SETTINGS -1] shell: 'picard ' '{params.java_args} ' @@ -586,6 +586,12 @@ rule collectrnaseqmetrics: refflat=rules.conversion_refflat.output, output: metrics=patterns['collectrnaseqmetrics']['metrics'], + log: + patterns['collectrnaseqmetrics']['metrics'] + '.log' + threads: 1 + resources: + mem_mb=gb(32), + runtime=autobump(hours=2) params: java_args='-Xmx20g', # java_args='-Xmx2g', # [TEST SETTINGS -1] @@ -594,12 +600,6 @@ rule collectrnaseqmetrics: 'fr-firststrand': 'STRAND=SECOND_READ_TRANSCRIPTION_STRAND ', 'fr-secondstrand': 'STRAND=FIRST_READ_TRANSCRIPTION_STRAND ', }[config["stranded"]] - log: - patterns['collectrnaseqmetrics']['metrics'] + 
'.log' - threads: 1 - resources: - mem_mb=gb(32), - runtime=autobump(hours=2) run: shell( 'picard ' @@ -644,6 +644,10 @@ rule salmon: patterns['salmon'] log: patterns['salmon'] + '.log' + threads: 6 + resources: + mem_mb=gb(32), + runtime=autobump(hours=2) params: extra=( "--libType=A " @@ -651,10 +655,6 @@ rule salmon: "--seqBias " "--validateMappings " ) - threads: 6 - resources: - mem_mb=gb(32), - runtime=autobump(hours=2) run: outdir = os.path.dirname(output[0]) index_dir = os.path.dirname(input.index) @@ -682,6 +682,13 @@ rule kallisto: index=REFERENCES + "/kallisto/transcripts.idx", output: patterns['kallisto'] + log: + patterns['kallisto'] + '.log' + threads: + 8 + resources: + mem_mb=gb(32), + runtime=autobump(hours=2), params: strand_arg={ 'unstranded': '', @@ -692,13 +699,6 @@ rule kallisto: "--bootstrap-samples 100" if is_paired else "--single --fragment-length 300 --sd 20 --bootstrap-samples 100" ), - log: - patterns['kallisto'] + '.log' - threads: - 8 - resources: - mem_mb=gb(32), - runtime=autobump(hours=2), run: outdir = os.path.dirname(output[0]) shell( @@ -868,5 +868,3 @@ rule samtools_stats: patterns['samtools']['stats'] + '.log' shell: 'samtools stats {input.bam} > {output}' - -# vim: ft=python From 5134c9e415660a4c77b550a60b378c12f09217c3 Mon Sep 17 00:00:00 2001 From: Ryan Dale Date: Mon, 6 Jan 2025 22:24:27 -0500 Subject: [PATCH 043/196] run snakefmt on references --- workflows/references/Snakefile | 214 +++++++++++++++------------------ 1 file changed, 98 insertions(+), 116 deletions(-) diff --git a/workflows/references/Snakefile b/workflows/references/Snakefile index bf4cf212..d6dcf759 100644 --- a/workflows/references/Snakefile +++ b/workflows/references/Snakefile @@ -6,67 +6,70 @@ sys.path.insert(0, os.path.dirname(workflow.snakefile) + "/../..") from lib.utils import autobump, gb, hours from lib import utils -REFERENCES = config.get('reference_dir', '../../references') +REFERENCES = config.get("reference_dir", "../../references") + def 
default_postprocess(origfn, newfn): shell("mv {origfn} {newfn}") + rule fasta: output: - temporary(f'{REFERENCES}/genome.fa.gz') + temporary(f"{REFERENCES}/genome.fa.gz"), log: - f"{REFERENCES}/logs/genome.fa.gz.log" + f"{REFERENCES}/logs/genome.fa.gz.log", run: utils.download_and_postprocess( - urls=config['fasta']['url'], - postprocess=config['fasta'].get('postprocess', None), + urls=config["fasta"]["url"], + postprocess=config["fasta"].get("postprocess", None), outfile=output[0], - log=log + log=log, ) rule gtf: output: - temporary(f"{REFERENCES}/annotation.gtf.gz") + temporary(f"{REFERENCES}/annotation.gtf.gz"), log: - f"{REFERENCES}/logs/annotation.gtf.gz.log" + f"{REFERENCES}/logs/annotation.gtf.gz.log", run: utils.download_and_postprocess( - urls=config['gtf']['url'], - postprocess=config['gtf'].get('postprocess', None), + urls=config["gtf"]["url"], + postprocess=config["gtf"].get("postprocess", None), outfile=output[0], - log=log + log=log, ) rule rrna: output: - temporary(f"{REFERENCES}/rrna.fa.gz") + temporary(f"{REFERENCES}/rrna.fa.gz"), log: - f"{REFERENCES}/logs/rrna.fa.gz.log" + f"{REFERENCES}/logs/rrna.fa.gz.log", run: utils.download_and_postprocess( - urls=config['rrna']['url'], - postprocess=config['rrna'].get('postprocess', None), + urls=config["rrna"]["url"], + postprocess=config["rrna"].get("postprocess", None), outfile=output[0], - log=log + log=log, ) rule unzip: input: - f"{REFERENCES}/{prefix}.gz" + f"{REFERENCES}/{{prefix}}.gz", output: - f"{REFERENCES}/{prefix}" - shell: 'gunzip -c {input} > {output}' + f"{REFERENCES}/{{prefix}}", + shell: + "gunzip -c {input} > {output}" rule bowtie2_index: input: - f"{REFERENCES}/{label}.fa", + f"{REFERENCES}/{{label}}.fa", output: multiext( - f"{REFERENCES}/bowtie2/{label}", + f"{REFERENCES}/bowtie2/{{label}}", ".1.bt2", ".2.bt2", ".3.bt2", @@ -76,22 +79,15 @@ rule bowtie2_index: ".fa", ), log: - f"{REFERENCES}/logs/bowtie2_{label}.log' + f"{REFERENCES}/logs/bowtie2_{{label}}.log", resources: 
runtime=autobump(hours=8), mem_mb=autobump(gb=32), - disk_mb=autobump(gb=50) - threads: - 8 + disk_mb=autobump(gb=50), + threads: 8 run: index = os.path.commonprefix(output).rstrip(".") - shell( - "bowtie2-build" - " --threads {threads}" - " {input}" - " {index}" - " &> {log}" - ) + shell("bowtie2-build" " --threads {threads}" " {input}" " {index}" " &> {log}") utils.make_relative_symlink(input[0], output[-1]) @@ -100,42 +96,39 @@ rule star_index: fasta=f"{REFERENCES}/genome.fa", gtf=f"{REFERENCES}/annotation.gtf", output: - f"{REFERENCES}/star/Genome" + f"{REFERENCES}/star/Genome", log: - f"{REFERENCES}/logs/star.log" - threads: - 8 + f"{REFERENCES}/logs/star.log", + threads: 8 resources: runtime=autobump(hours=8), - mem_mb=gb(64) + mem_mb=gb(64), run: genomedir = os.path.dirname(output[0]) - shell('rm -r {genomedir}') - shell('mkdir -p {genomedir}') + shell("rm -r {genomedir}") + shell("mkdir -p {genomedir}") shell( - 'STAR ' - '--runMode genomeGenerate ' - '--runThreadN {threads} ' - '--genomeDir {genomedir} ' - '--genomeFastaFiles {input.fasta} ' - + "STAR " + "--runMode genomeGenerate " + "--runThreadN {threads} " + "--genomeDir {genomedir} " + "--genomeFastaFiles {input.fasta} " # NOTE: GTF is optional - '--sjdbGTFfile {input.gtf} ' - + "--sjdbGTFfile {input.gtf} " # NOTE: STAR docs say that 100 should work well. - '--sjdbOverhang 100 ' - + "--sjdbOverhang 100 " # NOTE: for small genomes, may need to scale this down to # min(14, log2(GenomeLength) / 2 - 1) # --genomeSAindexNbases 14 - '&> {log}' + "&> {log}" ) # STAR writes a hard-coded Log.out file to the current working # directory. So put that on the end of the log file for the rule and # then clean up. 
- shell('cat {genomedir}/Log.out >> {log} && rm {genomedir}/Log.out') + shell("cat {genomedir}/Log.out >> {log} && rm {genomedir}/Log.out") shell("ln -s {input.fasta} {genomedir}") + rule hisat2_index: input: f"{REFERENCES}/genome.fa", @@ -151,135 +144,122 @@ rule hisat2_index: ".7.ht2", ".8.ht2", ".fa", - ) + ), log: - f"{REFERENCES}/logs/hisat2.log" + f"{REFERENCES}/logs/hisat2.log", resources: runtime=autobump(hours=8), mem_mb=autobump(gb=32), - disk_mb=autobump(gb=50) - threads: - 8 + disk_mb=autobump(gb=50), + threads: 8 run: index = os.path.commonprefix(output).rstrip(".") - shell( - "hisat2-build" - " --threads {threads}" - " {input}" - " {index}" - " &> {log}" - ) + shell("hisat2-build" " --threads {threads}" " {input}" " {index}" " &> {log}") shell("ln -s {input} {output[-1]}") - rule transcriptome_fasta: input: fasta=f"{REFERENCES}/genome.fa", gtf=f"{REFERENCES}/annotation.gtf", output: - f"{REFERENCES}/transcriptome.fa" + f"{REFERENCES}/transcriptome.fa", resources: - runtime=hours(1) + runtime=hours(1), shell: - 'gffread {input.gtf} -w {output} -g {input.fasta}' + "gffread {input.gtf} -w {output} -g {input.fasta}" rule salmon_index: input: - f"{REFERENCES}/transcriptome.fa" + f"{REFERENCES}/transcriptome.fa", output: - f"{REFERENCES}/salmon/versionInfo.json" + f"{REFERENCES}/salmon/versionInfo.json", log: - f"{REFERENCES}/logs/salmon.log" + f"{REFERENCES}/logs/salmon.log", params: - outdir=f"{REFERENCES}/salmon" + outdir=f"{REFERENCES}/salmon", resources: mem_mb=gb(32), - runtime=hours(2) + runtime=hours(2), run: outdir = os.path.dirname(output[0]) - shell( - 'salmon index ' - '--transcripts {input} ' - '--index {outdir} ' - '&> {log}' - ) + shell("salmon index " "--transcripts {input} " "--index {outdir} " "&> {log}") rule kallisto_index: output: f"{REFERENCES}/kallisto/transcripts.idx", input: - f"{REFERENCES}/genome.fa" + f"{REFERENCES}/genome.fa", log: - f"{REFERENCES}/logs/kallisto.log" + f"{REFERENCES}/logs/kallisto.log", resources: 
runtime=hours(2), mem_mb=gb(32), shell: - 'kallisto index ' - '--index {output} ' - '{input} ' - '&> {log}' + "kallisto index " + "--index {output} " + "{input} " + "&> {log}" rule conversion_refflat: input: - f"{REFERENCES}/annotation.gtf" + f"{REFERENCES}/annotation.gtf", output: - f"{REFERENCES}/annotation.refflat" + f"{REFERENCES}/annotation.refflat", log: - f"{REFERENCES}/logs/annotation.refflat.log" + f"{REFERENCES}/logs/annotation.refflat.log", resources: runtime=hours(2), - mem_mb=gb(2) + mem_mb=gb(2), shell: - 'gtfToGenePred -ignoreGroupsWithoutExons {input} {output}.tmp ' - '''&& awk '{{print $1"\t"$0}}' {output}.tmp > {output} ''' - '&& rm {output}.tmp ' + "gtfToGenePred -ignoreGroupsWithoutExons {input} {output}.tmp " + """&& awk '{{print $1"\t"$0}}' {output}.tmp > {output} """ + "&& rm {output}.tmp " rule conversion_bed12: input: - f"{REFERENCES}/annotation.gtf" + f"{REFERENCES}/annotation.gtf", output: - f"{REFERENCES}/annotation.bed12" + f"{REFERENCES}/annotation.bed12", resources: runtime=hours(2), - mem_mb=gb(2) + mem_mb=gb(2), shell: - 'gtfToGenePred -ignoreGroupsWithoutExons {input} {output}.tmp ' - '&& genePredToBed {output}.tmp {output} ' - '&& rm {output}.tmp' + "gtfToGenePred -ignoreGroupsWithoutExons {input} {output}.tmp " + "&& genePredToBed {output}.tmp {output} " + "&& rm {output}.tmp" rule chromsizes: input: - f"{REFERENCES}/genome.fa" + f"{REFERENCES}/genome.fa", output: - f"{REFERENCES}/genome.chromsizes" + f"{REFERENCES}/genome.chromsizes", log: - f"{REFERENCES}/logs/genome.chromsizes.log" + f"{REFERENCES}/logs/genome.chromsizes.log", params: # NOTE: Be careful with the memory here; make sure you have enough # and/or it matches the resources you're requesting - java_args='-Xmx20g' + java_args="-Xmx20g", # java_args='-Xmx2g' # [TEST SETTINGS -1] resources: mem_mb=gb(24), - runtime=hours(2) + runtime=hours(2), shell: - 'export LC_COLLATE=C; ' - 'rm -f {output}.tmp ' - '&& picard ' - '{params.java_args} ' - 'CreateSequenceDictionary 
R={input} O={output}.tmp &> {log} ' + "export LC_COLLATE=C; " + "rm -f {output}.tmp " + "&& picard " + "{params.java_args} " + "CreateSequenceDictionary R={input} O={output}.tmp &> {log} " '&& grep "^@SQ" {output}.tmp ' - '''| awk '{{print $2, $3}}' ''' + """| awk '{{print $2, $3}}' """ '| sed "s/SN://g;s/ LN:/\\t/g" ' - '| sort -k1,1 > {output} ' - '&& rm -f {output}.tmp ' + "| sort -k1,1 > {output} " + "&& rm -f {output}.tmp " rule mappings: @@ -287,14 +267,16 @@ rule mappings: Creates gzipped TSV mapping between attributes in the GTF. """ input: - gtf=f"{REFERENCES}/annotation.gtf" + gtf=f"{REFERENCES}/annotation.gtf", output: - f"{REFERENCES}/annotation.mapping.tsv.gz" + f"{REFERENCES}/annotation.mapping.tsv.gz", params: - include_featuretypes=lambda wildcards, output: conversion_kwargs[output[0]].get('include_featuretypes', []) + include_featuretypes=lambda wildcards, output: conversion_kwargs[ + output[0] + ].get("include_featuretypes", []), resources: runtime=hours(2), - mem_mb=gb(2) + mem_mb=gb(2), run: import gffutils @@ -314,7 +296,7 @@ rule mappings: continue d = dict(f.attributes) - d['__featuretype__'] = ft + d["__featuretype__"] = ft res.append(d) df = pandas.DataFrame(res) @@ -323,7 +305,7 @@ rule mappings: # include_featuretypes settings, this may take a while. 
df = df.drop_duplicates() - df.to_csv(output[0], sep='\t', index=False, compression='gzip') + df.to_csv(output[0], sep="\t", index=False, compression="gzip") # Restore original setting gffutils.constants.always_return_list = orig_setting From 0b9beec823996cd89c4357b089e34b0b312f6717 Mon Sep 17 00:00:00 2001 From: Ryan Dale Date: Mon, 6 Jan 2025 22:44:39 -0500 Subject: [PATCH 044/196] run snakefmt on rnaseq (and then re-add some comments that caused failure) --- workflows/rnaseq/Snakefile | 690 ++++++++++++++++++++----------------- 1 file changed, 365 insertions(+), 325 deletions(-) diff --git a/workflows/rnaseq/Snakefile b/workflows/rnaseq/Snakefile index 5e37daf1..3b384cd3 100644 --- a/workflows/rnaseq/Snakefile +++ b/workflows/rnaseq/Snakefile @@ -9,40 +9,51 @@ from lib import utils from lib.utils import autobump, gb, hours -configfile: 'config/config.yaml' +configfile: "config/config.yaml" -include: '../references/Snakefile' -REFERENCES = config.get('reference_dir', '../../references') +include: "../references/Snakefile" + + +REFERENCES = config.get("reference_dir", "../../references") sampletable = pd.read_table(config["sampletable"], sep="\t") sampletable = sampletable.set_index(sampletable.columns[0], drop=False) is_paired = utils.detect_layout(sampletable) == "PE" is_sra = utils.detect_sra(sampletable) n = ["1", "2"] if is_paired else ["1"] SAMPLES = sampletable.iloc[:, 0].values -patterns = yaml.safe_load(open('config/rnaseq_patterns.yaml')) +patterns = yaml.safe_load(open("config/rnaseq_patterns.yaml")) wildcard_constraints: - n = '[1,2]', - sample = '|'.join(SAMPLES) + n="[1,2]", + sample="|".join(SAMPLES), + + +localrules: + symlinks, + symlink_targets, -localrules: symlinks, symlink_targets rule all: input: - patterns["multiqc"] + patterns["multiqc"], + if is_sra: - include: '../../rules/sra.smk' + + include: "../../rules/sra.smk" rule symlinks: input: - lambda wc: sampletable.loc[wc.sample, ['orig_filename', 'orig_filename_R2']] if is_paired - else 
sampletable.loc[wc.sample, ['orig_filename']] + lambda wc: ( + sampletable.loc[wc.sample, ["orig_filename", "orig_filename_R2"]] + if is_paired + else sampletable.loc[wc.sample, ["orig_filename"]] + ), output: - expand(patterns["fastq"], n=n, allow_missing=True) + expand(patterns["fastq"], n=n, allow_missing=True), threads: 1 resources: mem_mb=100, @@ -54,33 +65,42 @@ rule symlinks: rule symlink_targets: - input: - expand('data/rnaseq_samples/{sample}/{sample}_R{n}.fastq.gz', sample=SAMPLES, n=n) + input: + expand( + "data/rnaseq_samples/{sample}/{sample}_R{n}.fastq.gz", sample=SAMPLES, n=n + ), + # This can be set at the command line with --config strand_check_reads=1000 -config.setdefault('strand_check_reads', 1e5) +config.setdefault("strand_check_reads", 1e5) # TODO: re-enable # include: '../../rules/strand_check.smk' + rule cutadapt: input: - fastq=expand(patterns["fastq"], n=n, allow_missing=True) + fastq=expand(patterns["fastq"], n=n, allow_missing=True), output: - fastq=expand(patterns["cutadapt"], n=n, allow_missing=True) + fastq=expand(patterns["cutadapt"], n=n, allow_missing=True), log: - 'data/rnaseq_samples/{sample}/{sample}_cutadapt.fastq.gz.log' + "data/rnaseq_samples/{sample}/{sample}_cutadapt.fastq.gz.log", threads: 6 resources: mem_mb=gb(2), - runtime=autobump(hours=2) + runtime=autobump(hours=2), params: extra=( - "--nextseq-trim 20 " - "--overlap 6 " - "--minimum-length 25 " - "-a AGATCGGAAGAGCACACGTCTGAACTCCAGTCA " - ) + "-A AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGT " if is_paired else "" + ( + "--nextseq-trim 20 " + "--overlap 6 " + "--minimum-length 25 " + "-a AGATCGGAAGAGCACACGTCTGAACTCCAGTCA " + ) + + "-A AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGT " + if is_paired + else "" + ), run: if is_paired: shell( @@ -103,53 +123,54 @@ rule cutadapt: "&> {log}" ) + # TODO: rm wrapper rule fastqc: input: - '{sample_dir}/{sample}/{sample}{suffix}' - threads: - 6 + "{sample_dir}/{sample}/{sample}{suffix}", + threads: 6 output: - 
html='{sample_dir}/{sample}/fastqc/{sample}{suffix}_fastqc.html', - zip='{sample_dir}/{sample}/fastqc/{sample}{suffix}_fastqc.zip', + html="{sample_dir}/{sample}/fastqc/{sample}{suffix}_fastqc.html", + zip="{sample_dir}/{sample}/fastqc/{sample}{suffix}_fastqc.zip", resources: mem_mb=gb(8), - runtime=autobump(hours=2) + runtime=autobump(hours=2), script: - utils.wrapper_for('fastqc/wrapper.py') + utils.wrapper_for("fastqc/wrapper.py") -if config['aligner'] == 'hisat2': +if config["aligner"] == "hisat2": + rule hisat2: input: fastq=expand(patterns["cutadapt"], n=n, allow_missing=True), index=rules.hisat2_index.output, output: - bam=temporary(patterns['bam']) + bam=temporary(patterns["bam"]), log: - patterns['bam'] + '.log' + patterns["bam"] + ".log", threads: 6 resources: mem_mb=gb(32), - runtime=autobump(hours=8) + runtime=autobump(hours=8), params: - extra="" + extra="", run: prefix = os.path.commonprefix(input.index).rstrip(".") - sam = output.bam.replace('.bam', '.sam') + sam = output.bam.replace(".bam", ".sam") if is_paired: assert len(input.fastq) == 2 - fastqs = '-1 {0} -2 {1} '.format(*input.fastq) + fastqs = "-1 {0} -2 {1} ".format(*input.fastq) else: assert len(input.fastq) == 1 - fastqs = '-U {0} '.format(input.fastq) + fastqs = "-U {0} ".format(input.fastq) shell( "hisat2 " "-x {prefix} " "{fastqs} " - '--no-unal ' + "--no-unal " "--threads {threads} " "-S {sam} " "> {log} 2>&1" @@ -161,40 +182,42 @@ if config['aligner'] == 'hisat2': "&& rm {sam}" ) + + # TODO: star has lots of rules. Better to be in rules/aligner.smk? -if config['aligner'].startswith('star'): +if config["aligner"].startswith("star"): # STAR can be run in 1-pass or 2-pass modes. Since we may be running it # more than once in almost the same way, we pull out the shell command here # and use it below. 
STAR_CMD = ( - 'STAR ' - '--runThreadN {threads} ' - '--genomeDir {genomedir} ' - '--readFilesIn {input.fastq} ' - '--readFilesCommand zcat ' - '--outFileNamePrefix {prefix} ' - '{params.extra} ' + "STAR " + "--runThreadN {threads} " + "--genomeDir {genomedir} " + "--readFilesIn {input.fastq} " + "--readFilesCommand zcat " + "--outFileNamePrefix {prefix} " + "{params.extra} " ) STAR_PARAMS = ( # NOTE: The STAR docs indicate that the following parameters are # standard options for ENCODE long-RNA-seq pipeline. Comments are from # the STAR docs. - '--outFilterType BySJout ' # reduces number of spurious junctions - '--outFilterMultimapNmax 20 ' # if more than this many multimappers, consider unmapped - '--alignSJoverhangMin 8 ' # min overhang for unannotated junctions - '--alignSJDBoverhangMin 1 ' # min overhang for annotated junctions - '--outFilterMismatchNmax 999 ' # max mismatches per pair - '--outFilterMismatchNoverReadLmax 0.04 ' # max mismatches per pair relative to read length - '--alignIntronMin 20 ' # min intron length - '--alignIntronMax 1000000 ' # max intron length - '--alignMatesGapMax 1000000 ' # max distance between mates - '--outSAMunmapped None ' # do not report aligned reads in output + "--outFilterType BySJout " # reduces number of spurious junctions + "--outFilterMultimapNmax 20 " # if more than this many multimappers, consider unmapped + "--alignSJoverhangMin 8 " # min overhang for unannotated junctions + "--alignSJDBoverhangMin 1 " # min overhang for annotated junctions + "--outFilterMismatchNmax 999 " # max mismatches per pair + "--outFilterMismatchNoverReadLmax 0.04 " # max mismatches per pair relative to read length + "--alignIntronMin 20 " # min intron length + "--alignIntronMax 1000000 " # max intron length + "--alignMatesGapMax 1000000 " # max distance between mates + "--outSAMunmapped None " # do not report aligned reads in output ) - logfile_extensions = ['Log.progress.out', 'Log.out', 'Log.final.out', 'Log.std.out'] + logfile_extensions 
= ["Log.progress.out", "Log.out", "Log.final.out", "Log.std.out"] -if config['aligner'] == 'star': +if config["aligner"] == "star": rule star: """ @@ -203,37 +226,39 @@ if config['aligner'] == 'star': input: fastq=expand(patterns["cutadapt"], n=n, allow_missing=True), index=rules.star_index.output, - annotation=f"{REFERENCES}/annotation.gtf" + annotation=f"{REFERENCES}/annotation.gtf", output: - bam=temporary(patterns['bam']), - sjout=temporary(patterns['bam'].replace('.bam', '.star.SJ.out.tab')), + bam=temporary(patterns["bam"]), + sjout=temporary(patterns["bam"].replace(".bam", ".star.SJ.out.tab")), log: - patterns['bam'].replace('.bam', '.star.bam.log') + patterns["bam"].replace(".bam", ".star.bam.log"), threads: 16 resources: mem_mb=gb(64), - runtime=autobump(hours=8) + runtime=autobump(hours=8), params: - extra=STAR_PARAMS - + extra=STAR_PARAMS, run: genomedir = os.path.dirname(input.index[0]) outdir = os.path.dirname(output[0]) - prefix = output.bam.replace('.bam', '.star.') + prefix = output.bam.replace(".bam", ".star.") shell( - STAR_CMD + ( - '--outSAMtype BAM SortedByCoordinate ' - '--outStd BAM_SortedByCoordinate > {output.bam} ' - '2> {log} ' + STAR_CMD + + ( + "--outSAMtype BAM SortedByCoordinate " + "--outStd BAM_SortedByCoordinate > {output.bam} " + "2> {log} " ) ) # move various hard-coded log files to log directory - logfiles = expand(prefix + '{ext}', ext=logfile_extensions) - shell('mkdir -p {outdir}/star_logs ' - '&& mv {logfiles} {outdir}/star_logs') + logfiles = expand(prefix + "{ext}", ext=logfile_extensions) + shell( + "mkdir -p {outdir}/star_logs " "&& mv {logfiles} {outdir}/star_logs" + ) -if config['aligner'] == 'star-twopass': + +if config["aligner"] == "star-twopass": rule star_pass1: """ @@ -242,37 +267,38 @@ if config['aligner'] == 'star-twopass': input: fastq=expand(patterns["cutadapt"], n=n, allow_missing=True), index=rules.star_index.output, - annotation=f"{REFERENCES}/annotation.gtf" + annotation=f"{REFERENCES}/annotation.gtf", 
output: - sjout=temporary(patterns['bam'].replace('.bam', '.star-pass1.SJ.out.tab')), + sjout=temporary(patterns["bam"].replace(".bam", ".star-pass1.SJ.out.tab")), log: - patterns['bam'].replace('.bam', '.star-pass1.bam.log') + patterns["bam"].replace(".bam", ".star-pass1.bam.log"), threads: 16 resources: mem_mb=gb(64), - runtime=autobump(hours=8) + runtime=autobump(hours=8), params: - extra=STAR_PARAMS + extra=STAR_PARAMS, run: genomedir = os.path.dirname(input.index[0]) outdir = os.path.dirname(output[0]) - prefix = output.sjout.replace('SJ.out.tab', '') + prefix = output.sjout.replace("SJ.out.tab", "") shell( - STAR_CMD + - ( + STAR_CMD + + ( # In this first pass, we don't actually care about the # alignment -- just the detected junctions. So we output # the SAM to /dev/null. - '--outStd SAM > /dev/null ' - '2> {log} ' + "--outStd SAM > /dev/null " + "2> {log} " ) ) # move various hard-coded log files to log directory - logfiles = expand(prefix + '{ext}', ext=logfile_extensions) - shell('mkdir -p {outdir}/star-pass1_logs ' - '&& mv {logfiles} {outdir}/star-pass1_logs') - + logfiles = expand(prefix + "{ext}", ext=logfile_extensions) + shell( + "mkdir -p {outdir}/star-pass1_logs " + "&& mv {logfiles} {outdir}/star-pass1_logs" + ) rule star_pass2: """ @@ -283,47 +309,52 @@ if config['aligner'] == 'star-twopass': fastq=expand(patterns["cutadapt"], n=n, allow_missing=True), index=rules.star_index.output, annotation=f"{REFERENCES}/annotation.gtf", - sjout=expand(patterns['bam'].replace('.bam', '.star-pass1.SJ.out.tab'), sample=SAMPLES), + sjout=expand( + patterns["bam"].replace(".bam", ".star-pass1.SJ.out.tab"), + sample=SAMPLES, + ), output: - bam=temporary(patterns['bam']), - sjout=temporary(patterns['bam'].replace('.bam', '.star-pass2.SJ.out.tab')), + bam=temporary(patterns["bam"]), + sjout=temporary(patterns["bam"].replace(".bam", ".star-pass2.SJ.out.tab")), log: - patterns['bam'].replace('.bam', '.star-pass2.bam.log') + patterns["bam"].replace(".bam", 
".star-pass2.bam.log"), threads: 16 resources: mem_mb=gb(64), - runtime=autobump(hours=8) + runtime=autobump(hours=8), params: - extra=STAR_PARAMS + extra=STAR_PARAMS, run: genomedir = os.path.dirname(input.index[0]) outdir = os.path.dirname(output[0]) - prefix = output.bam.replace('.bam', '.star-pass2.') + prefix = output.bam.replace(".bam", ".star-pass2.") shell( - STAR_CMD + ( + STAR_CMD + + ( # In contrast to pass 1, we will be keeping these BAMs -- # so sort them - '--outSAMtype BAM SortedByCoordinate ' - + "--outSAMtype BAM SortedByCoordinate " # Splice junction databases from all samples in the first # pass. - '--sjdbFileChrStartEnd {input.sjout} ' - '--outStd BAM_SortedByCoordinate > {output.bam} ' - '2> {log} ' + "--sjdbFileChrStartEnd {input.sjout} " + "--outStd BAM_SortedByCoordinate > {output.bam} " + "2> {log} " ) ) # move various hard-coded log files to log directory - logfiles = expand(prefix + '{ext}', ext=logfile_extensions) - shell('mkdir -p {outdir}/star-pass2_logs ' - '&& mv {logfiles} {outdir}/star-pass2_logs') + logfiles = expand(prefix + "{ext}", ext=logfile_extensions) + shell( + "mkdir -p {outdir}/star-pass2_logs " + "&& mv {logfiles} {outdir}/star-pass2_logs" + ) - shell('rm -r {prefix}_STARgenome') + shell("rm -r {prefix}_STARgenome") rule rRNA: input: - fastq=expand(patterns["cutadapt"], n=1, allow_missing=True), # currently only R1 + fastq=expand(patterns["cutadapt"], n=1, allow_missing=True), index=multiext( f"{REFERENCES}/bowtie2/rrna", ".1.bt2", @@ -335,21 +366,21 @@ rule rRNA: ".fa", ), output: - bam=temporary(patterns['rrna']['bam']) + bam=temporary(patterns["rrna"]["bam"]), log: - patterns['rrna']['bam'] + '.log' + patterns["rrna"]["bam"] + ".log", threads: 6 resources: mem_mb=gb(2), - runtime=autobump(hours=2) + runtime=autobump(hours=2), params: extra=( - '-k 1 ' # NOTE: we only care if >=1 mapped - '--no-unal ' # NOTE: suppress unaligned reads - ) + "-k 1 " + "--no-unal " + ), run: prefix = 
os.path.commonprefix(input.index).rstrip(".") - sam = output.bam.replace('.bam', '.sam') + sam = output.bam.replace(".bam", ".sam") shell( "bowtie2 " @@ -370,41 +401,41 @@ rule rRNA: rule fastq_count: input: - fastq='{sample_dir}/{sample}/{sample}{suffix}.fastq.gz' + fastq="{sample_dir}/{sample}/{sample}{suffix}.fastq.gz", output: - '{sample_dir}/{sample}/{sample}{suffix}.fastq.gz.libsize' + "{sample_dir}/{sample}/{sample}{suffix}.fastq.gz.libsize", threads: 1 resources: mem_mb=gb(1), - runtime=autobump(hours=2) + runtime=autobump(hours=2), shell: - 'zcat {input} | echo $((`wc -l`/4)) > {output}' + "zcat {input} | echo $((`wc -l`/4)) > {output}" rule bam_count: input: - bam='{sample_dir}/{sample}/{suffix}.bam' + bam="{sample_dir}/{sample}/{suffix}.bam", output: - '{sample_dir}/{sample}/{suffix}.bam.libsize' + "{sample_dir}/{sample}/{suffix}.bam.libsize", threads: 1 resources: mem_mb=gb(2), - runtime=autobump(hours=2) + runtime=autobump(hours=2), shell: - 'samtools view -c {input} > {output}' + "samtools view -c {input} > {output}" rule bam_index: input: - bam='{prefix}.bam' + bam="{prefix}.bam", output: - bai='{prefix}.bam.bai' + bai="{prefix}.bam.bai", threads: 1 resources: mem_mb=gb(2), - runtime=autobump(hours=2) + runtime=autobump(hours=2), shell: - 'samtools index {input} {output}' + "samtools index {input} {output}" # TODO: split into multiple featurecounts runs, since PE needs to be sorted each time. 
@@ -414,91 +445,101 @@ rule featurecounts: """ input: annotation=rules.gtf.output, - bam=expand(patterns['markduplicates']['bam'], sample=SAMPLES), + bam=expand(patterns["markduplicates"]["bam"], sample=SAMPLES), output: - counts='{sample_dir}/rnaseq_aggregation/featurecounts.txt' + counts="{sample_dir}/rnaseq_aggregation/featurecounts.txt", log: - '{sample_dir}/rnaseq_aggregation/featurecounts.txt.log' + "{sample_dir}/rnaseq_aggregation/featurecounts.txt.log", threads: 8 resources: mem_mb=gb(16), - runtime=autobump(hours=2) + runtime=autobump(hours=2), params: strand_arg={ - 'unstranded': '-s0 ', - 'fr-firststrand': '-s2 ', - 'fr-secondstrand': '-s1 ', - }[config["stranded"]], - extra="" + "unstranded": "-s0 ", + "fr-firststrand": "-s2 ", + "fr-secondstrand": "-s1 ", + }[config["stranded"]], + extra="", run: # NOTE: By default, we use -p for paired-end - p_arg = '' + p_arg = "" if is_paired: - p_arg = '-p --countReadPairs ' + p_arg = "-p --countReadPairs " shell( - 'featureCounts ' - '{params.strand_arg} ' - '{p_arg} ' - '-T {threads} ' - '-a {input.annotation} ' - '-o {output.counts} ' - '{input.bam} ' - '&> {log}' + "featureCounts " + "{params.strand_arg} " + "{p_arg} " + "-T {threads} " + "-a {input.annotation} " + "-o {output.counts} " + "{input.bam} " + "&> {log}" ) -# TODO: port some of this over to utils, or maybe script. + +# # TODO: port some of this over to utils, or maybe script. 
rule rrna_libsizes_table: """ Aggregate rRNA counts into a table """ input: - rrna=expand(patterns['rrna']['libsize'], sample=SAMPLES), - fastq=expand(patterns['libsizes']['cutadapt'], sample=SAMPLES), + rrna=expand(patterns["rrna"]["libsize"], sample=SAMPLES), + fastq=expand(patterns["libsizes"]["cutadapt"], sample=SAMPLES), output: - json=patterns['rrna_percentages_yaml'], - tsv=patterns['rrna_percentages_table'] + json=patterns["rrna_percentages_yaml"], + tsv=patterns["rrna_percentages_table"], threads: 1 resources: mem_mb=gb(2), - runtime=autobump(hours=2) + runtime=autobump(hours=2), run: def rrna_sample(f): - return utils.extract_wildcards(patterns['rrna']['libsize'], f)['sample'] + return utils.extract_wildcards(patterns["rrna"]["libsize"], f)["sample"] + def sample(f): - return utils.extract_wildcards(patterns['libsizes']['cutadapt'], f)['sample'] + return utils.extract_wildcards(patterns["libsizes"]["cutadapt"], f)[ + "sample" + ] + def million(f): return float(open(f).read()) / 1e6 + rrna = sorted(input.rrna, key=rrna_sample) fastq = sorted(input.fastq, key=sample) samples = list(map(rrna_sample, rrna)) rrna_m = list(map(million, rrna)) fastq_m = list(map(million, fastq)) - df = pd.DataFrame(dict( - sample=samples, - million_reads_rRNA=rrna_m, - million_reads_fastq=fastq_m, - )) - df = df.set_index('sample') - df['rRNA_percentage'] = df.million_reads_rRNA / df.million_reads_fastq * 100 + df = pd.DataFrame( + dict( + sample=samples, + million_reads_rRNA=rrna_m, + million_reads_fastq=fastq_m, + ) + ) + df = df.set_index("sample") + df["rRNA_percentage"] = df.million_reads_rRNA / df.million_reads_fastq * 100 - df[['million_reads_fastq', 'million_reads_rRNA', 'rRNA_percentage']].to_csv(output.tsv, sep='\t') + df[["million_reads_fastq", "million_reads_rRNA", "rRNA_percentage"]].to_csv( + output.tsv, sep="\t" + ) y = { - 'id': 'rrna_percentages_table', - 'section_name': 'rRNA content', - 'description': 'Amount of reads mapping to rRNA sequence', - 'plot_type': 
'table', - 'pconfig': { - 'id': 'rrna_percentages_table_table', - 'title': 'rRNA content table', - 'min': 0 + "id": "rrna_percentages_table", + "section_name": "rRNA content", + "description": "Amount of reads mapping to rRNA sequence", + "plot_type": "table", + "pconfig": { + "id": "rrna_percentages_table_table", + "title": "rRNA content table", + "min": 0, }, - 'data': yaml.load(df.transpose().to_json(), Loader=yaml.FullLoader), + "data": yaml.load(df.transpose().to_json(), Loader=yaml.FullLoader), } - with open(output.json, 'w') as fout: + with open(output.json, "w") as fout: yaml.dump(y, fout, default_flow_style=False) @@ -521,29 +562,29 @@ rule multiqc: patterns["rrna_percentages_table"], patterns["featurecounts"], ), - config='config/multiqc_config.yaml' + config="config/multiqc_config.yaml", output: - 'data/rnaseq_aggregation/multiqc.html' + "data/rnaseq_aggregation/multiqc.html", log: - 'data/rnaseq_aggregation/multiqc.log' + "data/rnaseq_aggregation/multiqc.log", threads: 1 resources: mem_mb=gb(2), - runtime=autobump(hours=2) + runtime=autobump(hours=2), run: analysis_directory = set([os.path.dirname(i) for i in input]) outdir = os.path.dirname(output[0]) basename = os.path.basename(output[0]) shell( - 'LC_ALL=en_US.utf8 LC_LANG=en_US.utf8 ' - 'multiqc ' - '--quiet ' - '--outdir {outdir} ' - '--force ' - '--filename {basename} ' - '--config {input.config} ' - '{analysis_directory} ' - '&> {log} ' + "LC_ALL=en_US.utf8 LC_LANG=en_US.utf8 " + "multiqc " + "--quiet " + "--outdir {outdir} " + "--force " + "--filename {basename} " + "--config {input.config} " + "{analysis_directory} " + "&> {log} " ) @@ -552,29 +593,29 @@ rule markduplicates: Mark or remove PCR duplicates with Picard MarkDuplicates """ input: - bam=patterns['bam'] + bam=patterns["bam"], output: - bam=patterns['markduplicates']['bam'], - metrics=patterns['markduplicates']['metrics'], + bam=patterns["markduplicates"]["bam"], + metrics=patterns["markduplicates"]["metrics"], log: - 
patterns['markduplicates']['bam'] + '.log' + patterns["markduplicates"]["bam"] + ".log", threads: 1 resources: mem_mb=gb(32), runtime=autobump(hours=2), disk_mb=autobump(gb=100), params: - java_args='-Xmx20g' + java_args="-Xmx20g", # java_args='-Xmx2g' # [TEST SETTINGS -1] shell: - 'picard ' - '{params.java_args} ' - 'MarkDuplicates ' - 'INPUT={input.bam} ' - 'OUTPUT={output.bam} ' - 'METRICS_FILE={output.metrics} ' - 'VALIDATION_STRINGENCY=LENIENT ' - '&> {log}' + "picard " + "{params.java_args} " + "MarkDuplicates " + "INPUT={input.bam} " + "OUTPUT={output.bam} " + "METRICS_FILE={output.metrics} " + "VALIDATION_STRINGENCY=LENIENT " + "&> {log}" rule collectrnaseqmetrics: @@ -582,35 +623,35 @@ rule collectrnaseqmetrics: Calculate various RNA-seq QC metrics with Picarc CollectRnaSeqMetrics """ input: - bam=patterns['markduplicates']['bam'], + bam=patterns["markduplicates"]["bam"], refflat=rules.conversion_refflat.output, output: - metrics=patterns['collectrnaseqmetrics']['metrics'], + metrics=patterns["collectrnaseqmetrics"]["metrics"], log: - patterns['collectrnaseqmetrics']['metrics'] + '.log' + patterns["collectrnaseqmetrics"]["metrics"] + ".log", threads: 1 resources: mem_mb=gb(32), - runtime=autobump(hours=2) + runtime=autobump(hours=2), params: - java_args='-Xmx20g', + java_args="-Xmx20g", # java_args='-Xmx2g', # [TEST SETTINGS -1] strand_arg={ - 'unstranded': 'STRAND=NONE ', - 'fr-firststrand': 'STRAND=SECOND_READ_TRANSCRIPTION_STRAND ', - 'fr-secondstrand': 'STRAND=FIRST_READ_TRANSCRIPTION_STRAND ', - }[config["stranded"]] + "unstranded": "STRAND=NONE ", + "fr-firststrand": "STRAND=SECOND_READ_TRANSCRIPTION_STRAND ", + "fr-secondstrand": "STRAND=FIRST_READ_TRANSCRIPTION_STRAND ", + }[config["stranded"]], run: shell( - 'picard ' - '{params.java_args} ' - 'CollectRnaSeqMetrics ' - '{params.strand_arg} ' - 'VALIDATION_STRINGENCY=LENIENT ' - 'REF_FLAT={input.refflat} ' - 'INPUT={input.bam} ' - 'OUTPUT={output.metrics} ' - '&> {log}' + "picard " + 
"{params.java_args} " + "CollectRnaSeqMetrics " + "{params.strand_arg} " + "VALIDATION_STRINGENCY=LENIENT " + "REF_FLAT={input.refflat} " + "INPUT={input.bam} " + "OUTPUT={output.metrics} " + "&> {log}" ) @@ -619,18 +660,18 @@ rule preseq: Compute a library complexity curve with preseq """ input: - bam=patterns['bam'] + bam=patterns["bam"], output: - patterns['preseq'] + patterns["preseq"], threads: 1 resources: mem_mb=gb(1), - runtime=autobump(hours=2) + runtime=autobump(hours=2), shell: - 'preseq ' - 'c_curve ' - '-B {input} ' - '-o {output} ' + "preseq " + "c_curve " + "-B {input} " + "-o {output} " rule salmon: @@ -639,37 +680,37 @@ rule salmon: """ input: fastq=expand(patterns["cutadapt"], n=n, allow_missing=True), - index=REFERENCES + "/salmon/versionInfo.json" + index=REFERENCES + "/salmon/versionInfo.json", output: - patterns['salmon'] + patterns["salmon"], log: - patterns['salmon'] + '.log' + patterns["salmon"] + ".log", threads: 6 resources: mem_mb=gb(32), - runtime=autobump(hours=2) + runtime=autobump(hours=2), params: extra=( "--libType=A " "--gcBias " "--seqBias " "--validateMappings " - ) + ), run: outdir = os.path.dirname(output[0]) index_dir = os.path.dirname(input.index) if is_paired: - fastq_arg = f'-1 {input.fastq[0]} -2 {input.fastq[1]} ' + fastq_arg = f"-1 {input.fastq[0]} -2 {input.fastq[1]} " else: - fastq_arg = f'-r {input.fastq} ' + fastq_arg = f"-r {input.fastq} " shell( - 'salmon quant ' - '--index {index_dir} ' - '--output {outdir} ' - '--threads {threads} ' - '{params.extra} ' - '{fastq_arg} ' - '&> {log}' + "salmon quant " + "--index {index_dir} " + "--output {outdir} " + "--threads {threads} " + "{params.extra} " + "{fastq_arg} " + "&> {log}" ) @@ -681,55 +722,56 @@ rule kallisto: fastq=expand(patterns["cutadapt"], n=n, allow_missing=True), index=REFERENCES + "/kallisto/transcripts.idx", output: - patterns['kallisto'] + patterns["kallisto"], log: - patterns['kallisto'] + '.log' - threads: - 8 + patterns["kallisto"] + ".log", + 
threads: 8 resources: mem_mb=gb(32), runtime=autobump(hours=2), params: strand_arg={ - 'unstranded': '', - 'fr-firststrand': '--rf-stranded', - 'fr-secondstrand': '--fr-stranded', - }[config["stranded"]], + "unstranded": "", + "fr-firststrand": "--rf-stranded", + "fr-secondstrand": "--fr-stranded", + }[config["stranded"]], extra=( - "--bootstrap-samples 100" if is_paired else - "--single --fragment-length 300 --sd 20 --bootstrap-samples 100" + "--bootstrap-samples 100" + if is_paired + else "--single --fragment-length 300 --sd 20 --bootstrap-samples 100" ), run: outdir = os.path.dirname(output[0]) shell( - 'kallisto quant ' - '--index {input.index} ' - '--output-dir {outdir} ' - '--threads {threads} ' - '--bootstrap-samples 100 ' - '--threads {threads} ' - '{params.strand_arg} ' - '{params.extra} ' - '{input.fastq} ' - '&> {log}' + "kallisto quant " + "--index {input.index} " + "--output-dir {outdir} " + "--threads {threads} " + "--bootstrap-samples 100 " + "--threads {threads} " + "{params.strand_arg} " + "{params.extra} " + "{input.fastq} " + "&> {log}" ) + rule rseqc_infer_experiment: """ Infer strandedness of experiment """ input: - bam=patterns['markduplicates']['bam'], + bam=patterns["markduplicates"]["bam"], bed12=rules.conversion_bed12.output, output: - txt=patterns['rseqc']['infer_experiment'] + txt=patterns["rseqc"]["infer_experiment"], log: - patterns['rseqc']['infer_experiment'] + '.log' + patterns["rseqc"]["infer_experiment"] + ".log", resources: mem_mb=gb(2), - runtime=autobump(hours=2) + runtime=autobump(hours=2), shell: - 'infer_experiment.py -r {input.bed12} -i {input.bam} > {output} &> {log}' + "infer_experiment.py -r {input.bed12} -i {input.bam} > {output} &> {log}" rule rseqc_read_distribution: @@ -737,17 +779,17 @@ rule rseqc_read_distribution: read distribution plots """ input: - bam=patterns['markduplicates']['bam'], + bam=patterns["markduplicates"]["bam"], bed12=rules.conversion_bed12.output, output: - 
txt=patterns['rseqc']['read_distribution'] + txt=patterns["rseqc"]["read_distribution"], log: - patterns['rseqc']['read_distribution'] + '.log' + patterns["rseqc"]["read_distribution"] + ".log", resources: mem_mb=gb(2), - runtime=autobump(hours=2) + runtime=autobump(hours=2), shell: - 'read_distribution.py -i {input.bam} -r {input.bed12} > {output} &> {log}' + "read_distribution.py -i {input.bam} -r {input.bed12} > {output} &> {log}" rule idxstats: @@ -755,19 +797,17 @@ rule idxstats: Run samtools idxstats on sample bams """ input: - bam=patterns['markduplicates']['bam'], - bai=patterns['markduplicates']['bam'] + '.bai' + bam=patterns["markduplicates"]["bam"], + bai=patterns["markduplicates"]["bam"] + ".bai", output: - txt=patterns['samtools']['idxstats'] - log: - patterns['samtools']['idxstats'] + '.log' + txt=patterns["samtools"]["idxstats"], + log: + patterns["samtools"]["idxstats"] + ".log", resources: mem_mb=gb(16), - runtime=autobump(hours=2) + runtime=autobump(hours=2), run: - shell( - 'samtools idxstats {input.bam} 2> {log} 1> {output.txt}' - ) + shell("samtools idxstats {input.bam} 2> {log} 1> {output.txt}") rule bigwig_neg: @@ -775,36 +815,36 @@ rule bigwig_neg: Create a bigwig for negative-strand reads """ input: - bam=patterns['markduplicates']['bam'], - bai=patterns['markduplicates']['bam'] + '.bai', + bam=patterns["markduplicates"]["bam"], + bai=patterns["markduplicates"]["bam"] + ".bai", output: - patterns['bigwig']['neg'] + patterns["bigwig"]["neg"], threads: 8 resources: mem_mb=gb(16), - runtime=autobump(hours=2) + runtime=autobump(hours=2), log: - patterns['bigwig']['neg'] + '.log' + patterns["bigwig"]["neg"] + ".log", params: - strand_arg = { - 'unstranded': '', - 'fr-firststrand': '--filterRNAstrand reverse ', - 'fr-secondstrand': '--filterRNAstrand forward ', - }[config["stranded"]], + strand_arg={ + "unstranded": "", + "fr-firststrand": "--filterRNAstrand reverse ", + "fr-secondstrand": "--filterRNAstrand forward ", + }[config["stranded"]], 
extra=( - '--minMappingQuality 20 ' - '--smoothLength 10 ' - '--normalizeUsing BPM ' # equivalent to TPM # [TEST SETTINGS] + "--minMappingQuality 20 " + "--smoothLength 10 " + "--normalizeUsing BPM " # [TEST SETTINGS] ), run: shell( - 'bamCoverage ' - '--bam {input.bam} ' - '-o {output} ' - '-p {threads} ' - '{params.extra} ' - '{params.strand_arg} ' - '&> {log}' + "bamCoverage " + "--bam {input.bam} " + "-o {output} " + "-p {threads} " + "{params.extra} " + "{params.strand_arg} " + "&> {log}" ) @@ -813,58 +853,58 @@ rule bigwig_pos: Create a bigwig for postive-strand reads. """ input: - bam=patterns['markduplicates']['bam'], - bai=patterns['markduplicates']['bam'] + '.bai', + bam=patterns["markduplicates"]["bam"], + bai=patterns["markduplicates"]["bam"] + ".bai", output: - patterns['bigwig']['pos'] + patterns["bigwig"]["pos"], threads: 8 resources: mem_mb=gb(16), - runtime=autobump(hours=2) + runtime=autobump(hours=2), log: - patterns['bigwig']['pos'] + '.log' + patterns["bigwig"]["pos"] + ".log", params: strand_arg={ - 'unstranded': '', - 'fr-firststrand': '--filterRNAstrand forward ', - 'fr-secondstrand': '--filterRNAstrand reverse ', - }[config["stranded"]], + "unstranded": "", + "fr-firststrand": "--filterRNAstrand forward ", + "fr-secondstrand": "--filterRNAstrand reverse ", + }[config["stranded"]], extra=( - '--minMappingQuality 20 ' - '--smoothLength 10 ' - '--normalizeUsing BPM ' # equivalent to TPM # [TEST SETTINGS] + "--minMappingQuality 20 " + "--smoothLength 10 " + "--normalizeUsing BPM " # [TEST SETTINGS] ), run: shell( - 'bamCoverage ' - '--bam {input.bam} ' - '-o {output} ' - '-p {threads} ' - '{params.extra} ' - '{params.strand_arg} ' - '&> {log}' + "bamCoverage " + "--bam {input.bam} " + "-o {output} " + "-p {threads} " + "{params.extra} " + "{params.strand_arg} " + "&> {log}" ) rule flagstat: input: - bam=patterns['markduplicates']['bam'], - bai=patterns['markduplicates']['bam'] + '.bai' + bam=patterns["markduplicates"]["bam"], + 
bai=patterns["markduplicates"]["bam"] + ".bai", output: - patterns['samtools']['flagstat'] + patterns["samtools"]["flagstat"], log: - patterns['samtools']['flagstat'] + '.log' + patterns["samtools"]["flagstat"] + ".log", shell: - 'samtools flagstat {input.bam} > {output}' + "samtools flagstat {input.bam} > {output}" rule samtools_stats: input: - bam=patterns['markduplicates']['bam'], - bai=patterns['markduplicates']['bam'] + '.bai' + bam=patterns["markduplicates"]["bam"], + bai=patterns["markduplicates"]["bam"] + ".bai", output: - patterns['samtools']['stats'] + patterns["samtools"]["stats"], log: - patterns['samtools']['stats'] + '.log' + patterns["samtools"]["stats"] + ".log", shell: - 'samtools stats {input.bam} > {output}' + "samtools stats {input.bam} > {output}" From 6968e4fd3a3d4deb7e3d41bd3e9508ffe1416db7 Mon Sep 17 00:00:00 2001 From: Ryan Dale Date: Fri, 10 Jan 2025 23:08:58 -0500 Subject: [PATCH 045/196] move wrappers to scripts --- wrappers/wrappers/epic2/wrapper.py => scripts/epic2.py | 0 .../macs2/callpeak/wrapper.py => scripts/macs2_callpeak.py | 0 .../merge_and_dedup/wrapper.py => scripts/merge_and_dedup.py | 0 3 files changed, 0 insertions(+), 0 deletions(-) rename wrappers/wrappers/epic2/wrapper.py => scripts/epic2.py (100%) rename wrappers/wrappers/macs2/callpeak/wrapper.py => scripts/macs2_callpeak.py (100%) rename wrappers/wrappers/combos/merge_and_dedup/wrapper.py => scripts/merge_and_dedup.py (100%) diff --git a/wrappers/wrappers/epic2/wrapper.py b/scripts/epic2.py similarity index 100% rename from wrappers/wrappers/epic2/wrapper.py rename to scripts/epic2.py diff --git a/wrappers/wrappers/macs2/callpeak/wrapper.py b/scripts/macs2_callpeak.py similarity index 100% rename from wrappers/wrappers/macs2/callpeak/wrapper.py rename to scripts/macs2_callpeak.py diff --git a/wrappers/wrappers/combos/merge_and_dedup/wrapper.py b/scripts/merge_and_dedup.py similarity index 100% rename from wrappers/wrappers/combos/merge_and_dedup/wrapper.py rename to 
scripts/merge_and_dedup.py From 8013417fa746715ebb2d9b14e517bbb5ac771995 Mon Sep 17 00:00:00 2001 From: Ryan Dale Date: Fri, 10 Jan 2025 23:09:30 -0500 Subject: [PATCH 046/196] overhaul and simplify preprocessor --- ci/preprocessor.py | 120 +++++++++++++-------------------------------- 1 file changed, 35 insertions(+), 85 deletions(-) diff --git a/ci/preprocessor.py b/ci/preprocessor.py index 042bee33..6bf05361 100644 --- a/ci/preprocessor.py +++ b/ci/preprocessor.py @@ -7,54 +7,16 @@ in production. Rather than require users edit files to remove those test-specific patterns, here we keep the test settings commented out and only un-comment when running tests. - -First, we look for any line that matches "# [test settings]" (case insensitive, -with optional surrounding spacing) and an optional signed integer. Any of these -would work: - - >>> assert matches('# [test settings]') - >>> assert matches('#[test settings]') - >>> assert matches('# [ test settings ]') - >>> assert matches('# [ test settings -1]') - >>> assert matches('# [ test settings +2]') - >>> assert matches('# [ TEST SETTINGS +2]') - >>> assert matches('# [ TeSt SeTTiNgS +2 ]') - -If a lines does not match, output it as-is. - -If a line matches, then uncomment it. Specifically, remove the first "#" in the -line; if it was followed by exactly one space, then remove that too. - -If a line matches and a signed integer was provided, then consider it -a relative location, and then comment-out the referred-to line. Example: - - >>> preprocess(''' - ... use this for production - ... # use this for tests # [test settings -1] - ... '''.splitlines(True)) - - # use this for production - use this for tests # [test settings -1] - - -If the matched special string creates the first "#" in the line, then do -nothing to that line but still respect the relative locations. Useful for just -commenting out nearby lines for tests: - - >>> preprocess(''' - ... # [TEST SETTINGS +1] - ... 
comment out for testing'''.splitlines(True)) - - # [TEST SETTINGS +1] - # comment out for testing """ + import re -regexp = re.compile(r'#\s?\[\s?test settings\s?(?P[-+]*\d)?\s*\]') +regexp = re.compile(r"#\s?\[\s?(enable|disable) for test\s?\]") -def matches(line): - return regexp.search(line.lower()) is not None + +def is_commented(line): + return line.strip().startswith("#") def comment_line(line): @@ -66,87 +28,75 @@ def comment_line(line): """ x = [] for i, character in enumerate(line): - if character == ' ': + if character == " ": x.append(character) else: break - x.append('# ') + x.append("# ") x.extend(line[i:]) - return ''.join(x) + return "".join(x) def uncomment_line(line): """ Removes the first instance of "#" from a line; if it was followed by - exactly one space then remove that too. + exactly one space then remove that too . . . UNLESS the *only* comment is the + special character that triggers this behavior, in which case we do nothing. >>> assert uncomment_line('# asdf') == 'asdf' >>> assert uncomment_line('#asdf') == 'asdf' >>> assert uncomment_line('# asdf # but this should be kept') == 'asdf # but this should be kept' >>> assert uncomment_line('# asdf') == ' asdf' >>> assert uncomment_line(' # asdf') == ' asdf' + >>> assert uncomment_line('do nothing') == 'do nothing' + >>> assert uncomment_line('do nothing # [disable for test]') == 'do nothing # [disable for test]') + >>> assert uncomment_line('#uncomment # [disable for test]') == 'uncomment # [disable for test]') """ - first = line.find('#') + first = line.find("#") - # If the first comment is the one that flag the line, then do nothing. + # If the first comment is the one that flagged the line, then do nothing. 
m = regexp.search(line.lower()) if m: if m.start() == first: return line - if line[first + 1] == ' ' and line[first + 2] != ' ': - pattern = '# ' + if line[first + 1] == " " and line[first + 2] != " ": + pattern = "# " else: - pattern = '#' - return line.replace(pattern, '', 1) + pattern = "#" + return line.replace(pattern, "", 1) def preprocess(lines): + result = [] if isinstance(lines, str): lines = [lines] - # These lists will keep track of whether a line should be changed. We need to - # create them ahead of time so that we can use relative indexing from line N to - # modify the state of lines N-1 or N+1 - uncomment = [False for i in range(len(lines))] - comment = [False for i in range(len(lines))] - - for i, line in enumerate(lines): + for line in lines: m = regexp.search(line.lower()) - if m: - # There as at least a "[ test settings ]", so remove comment - uncomment[i] = True - - # Figure out if there was also a relative location to uncomment, - # and keep track of it in the `comment` list. - rel = m.group('rel') - if rel is not None: - rel = int(rel) - comment[i + rel] = True + if not m: + result.append(line) + continue - result = [] - for (c, u, line) in zip(comment, uncomment, lines): - # E.g., in this situation, unclear what should happen: - # - # # [test settings] - # # [test settings -1] - # - if c and u: - raise ValueError("Line {0} is trying to be both commented and uncommented".format(line)) - if c: - result.append(comment_line(line)) - elif u: + action = m.group(1) + if action == "enable" and is_commented(line): result.append(uncomment_line(line)) + elif action == "disable" and not is_commented(line): + result.append(comment_line(line)) else: - result.append(line) - print(''.join(result)) + raise ValueError(f"Inconsistent commenting and action:\n{line}") + + print("".join(result)) if __name__ == "__main__": import argparse + ap = argparse.ArgumentParser(usage=__doc__) - ap.add_argument('infile', help='Input file to modify. 
Modified file printed to stdout.') + ap.add_argument( + "infile", help="Input file to modify. Modified file printed to stdout." + ) args = ap.parse_args() lines = open(args.infile).readlines() preprocess(lines) From 595eddf83c6cd784ce632b84ff71b2352607a473 Mon Sep 17 00:00:00 2001 From: Ryan Dale Date: Fri, 10 Jan 2025 23:09:52 -0500 Subject: [PATCH 047/196] add bed_to_bigbed as script --- scripts/bed_to_bigbed.py | 56 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 56 insertions(+) create mode 100644 scripts/bed_to_bigbed.py diff --git a/scripts/bed_to_bigbed.py b/scripts/bed_to_bigbed.py new file mode 100644 index 00000000..13ab5444 --- /dev/null +++ b/scripts/bed_to_bigbed.py @@ -0,0 +1,56 @@ +import sys +import os +import numpy as np +import pandas as pd +from snakemake.shell import shell + +sys.path.insert(0, os.path.dirname(__file__) + "/..") +from lib import chipseq + +# Based on the filename, identify the algorithm; +# Based on the contents, identify the format. +algorithm = os.path.basename(os.path.dirname(snakemake.input.bed)) +kind = chipseq.detect_peak_format(snakemake.input.bed) + +# bedToBigBed doesn't handle zero-size files +if os.stat(snakemake.input.bed).st_size == 0: + shell("touch {output}") + +# Note that autoSql filenames are relative to the workdir of the snakefile +# calling this script. 
+elif kind == 'narrowPeak': + _as = '../../include/autosql/bigNarrowPeak.as' + _type = 'bed6+4' + names=[ + 'chrom', 'chromStart', 'chromEnd', 'name', 'score', + 'strand', 'signalValue', 'pValue', 'qValue', 'peak'] +elif kind == 'broadPeak': + _as = '../../include/autosql/bigBroadPeak.as' + _type = 'bed6+3' + names=[ + 'chrom', 'chromStart', 'chromEnd', 'name', 'score', + 'strand', 'signalValue', 'pValue', 'qValue'] +elif kind == 'epic2Input': + _as = f'../../include/autosql/{kind}Peak.as' + _type = 'bed6+4' + names=[ + 'chrom', 'chromStart', 'chromEnd', 'pValue', 'score', + 'strand', 'ChIPCount', 'InputCount', 'FDR', 'log2FoldChange'] +elif kind == 'epic2NoInput': + _as = f'../../include/autosql/{kind}Peak.as' + _type = 'bed6' + names=[ + 'chrom', 'chromStart', 'chromEnd', 'ChIPCount', 'score', + 'strand'] +else: + raise ValueError("Unhandled format for {0}".format(input.bed)) + +df = pd.read_table(snakemake.input.bed, index_col=False, names=names) +df['score'] = df['score'] - df['score'].min() +df['score'] = (df['score'] / df['score'].max()) * 1000 +df['score'] = df['score'].replace([np.inf, -np.inf], np.nan).fillna(0) +df['score'] = df['score'].astype(int) +df.to_csv(snakemake.output[0] + '.tmp', sep='\t', index=False, header=False) + +shell('bedToBigBed -as={_as} -type={_type} {snakemake.output}.tmp {snakemake.input.chromsizes} {snakemake.output} &> {snakemake.log}') +shell('rm {snakemake.output}.tmp') From 95cefeacf068d69f6799e5d56a86f9f6ac68d04b Mon Sep 17 00:00:00 2001 From: Ryan Dale Date: Fri, 10 Jan 2025 23:10:08 -0500 Subject: [PATCH 048/196] add peakcallers to requirements.txt --- include/requirements.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/include/requirements.txt b/include/requirements.txt index fd8df8be..a2b21ee3 100644 --- a/include/requirements.txt +++ b/include/requirements.txt @@ -4,6 +4,7 @@ bowtie bowtie2 cutadapt>=3.0 deeptools +epic2 fastq-screen fastqc font-ttf-dejavu-sans-mono @@ -13,6 +14,7 @@ hisat2 intervalstats ipython 
kallisto +macs2 multiqc pandas pandoc From c35776379d1c933e013c28b5ec2bbb37e5cd1950 Mon Sep 17 00:00:00 2001 From: Ryan Dale Date: Fri, 10 Jan 2025 23:10:44 -0500 Subject: [PATCH 049/196] clean up log handling for epic2 --- scripts/epic2.py | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/scripts/epic2.py b/scripts/epic2.py index ee66e766..6ac30bdb 100644 --- a/scripts/epic2.py +++ b/scripts/epic2.py @@ -2,8 +2,6 @@ import glob from snakemake import shell -log = snakemake.log_fmt_shell() -logfile = None extra = snakemake.params.get('extra', '') outdir, basebed = os.path.split(snakemake.output.bed) @@ -11,21 +9,17 @@ extra = snakemake.params.block.get('extra', '') # `-c` has to be skipped if no control is provided -# if os.path.isfile(snakemake.input.control): if len(snakemake.input.control) > 0: arguments = '-c {snakemake.input.control} ' else: arguments = '' -# Add `--guess-bampe` if input dataset is paired-end -if snakemake.params.is_paired: - arguments += '--guess-bampe ' shell( 'epic2 ' + arguments + extra + '-t {snakemake.input.ip} ' - '--chromsizes {snakemake.input.chromsizes} | ' - 'sort -k1,1 -k2,2n > {label}.tmp.bed ' + '--chromsizes {snakemake.input.chromsizes} 2> {snakemake.log} | ' + 'sort -k1,1 -k2,2n > {label}.tmp.bed' ) # Fix the output file so that it doesn't have negative numbers and so it fits From 52ac28aa49d6f310e66517ece8e7143c39f46ee8 Mon Sep 17 00:00:00 2001 From: Ryan Dale Date: Fri, 10 Jan 2025 23:11:21 -0500 Subject: [PATCH 050/196] test settings overhaul --- workflows/rnaseq/Snakefile | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/workflows/rnaseq/Snakefile b/workflows/rnaseq/Snakefile index 3b384cd3..84682e9d 100644 --- a/workflows/rnaseq/Snakefile +++ b/workflows/rnaseq/Snakefile @@ -605,8 +605,8 @@ rule markduplicates: runtime=autobump(hours=2), disk_mb=autobump(gb=100), params: - java_args="-Xmx20g", - # java_args='-Xmx2g' # [TEST SETTINGS -1] + java_args="-Xmx20g", # 
[disable for test] + # java_args='-Xmx2g' # [enable for test] shell: "picard " "{params.java_args} " @@ -634,8 +634,8 @@ rule collectrnaseqmetrics: mem_mb=gb(32), runtime=autobump(hours=2), params: - java_args="-Xmx20g", - # java_args='-Xmx2g', # [TEST SETTINGS -1] + java_args="-Xmx20g", # [disable for test] + # java_args='-Xmx2g', # [enable for test] strand_arg={ "unstranded": "STRAND=NONE ", "fr-firststrand": "STRAND=SECOND_READ_TRANSCRIPTION_STRAND ", @@ -834,7 +834,7 @@ rule bigwig_neg: extra=( "--minMappingQuality 20 " "--smoothLength 10 " - "--normalizeUsing BPM " # [TEST SETTINGS] + "--normalizeUsing BPM " # [disable for test] ), run: shell( @@ -872,7 +872,7 @@ rule bigwig_pos: extra=( "--minMappingQuality 20 " "--smoothLength 10 " - "--normalizeUsing BPM " # [TEST SETTINGS] + "--normalizeUsing BPM " # [disable for test] ), run: shell( From 9024aa6d5c8983916fd4601a2ba4144e937e4fc4 Mon Sep 17 00:00:00 2001 From: Ryan Dale Date: Fri, 10 Jan 2025 23:11:29 -0500 Subject: [PATCH 051/196] comment sampletable --- workflows/rnaseq/Snakefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/rnaseq/Snakefile b/workflows/rnaseq/Snakefile index 84682e9d..a94f4639 100644 --- a/workflows/rnaseq/Snakefile +++ b/workflows/rnaseq/Snakefile @@ -16,7 +16,7 @@ include: "../references/Snakefile" REFERENCES = config.get("reference_dir", "../../references") -sampletable = pd.read_table(config["sampletable"], sep="\t") +sampletable = pd.read_table(config["sampletable"], sep="\t", comment="#") sampletable = sampletable.set_index(sampletable.columns[0], drop=False) is_paired = utils.detect_layout(sampletable) == "PE" is_sra = utils.detect_sra(sampletable) From 2483b98ab569ea278a9cd5dc835332bf47f582db Mon Sep 17 00:00:00 2001 From: Ryan Dale Date: Fri, 10 Jan 2025 23:11:42 -0500 Subject: [PATCH 052/196] various rnaseq fixes --- workflows/rnaseq/Snakefile | 40 ++++++++++++++++++++++++++++---------- 1 file changed, 30 insertions(+), 10 deletions(-) diff --git 
a/workflows/rnaseq/Snakefile b/workflows/rnaseq/Snakefile index a94f4639..a328d887 100644 --- a/workflows/rnaseq/Snakefile +++ b/workflows/rnaseq/Snakefile @@ -1,7 +1,6 @@ import sys import os import yaml -import tempfile import pandas as pd sys.path.insert(0, os.path.dirname(workflow.snakefile) + "/../..") @@ -38,6 +37,7 @@ localrules: rule all: input: patterns["multiqc"], + patterns["bigwig"], if is_sra: @@ -124,19 +124,38 @@ rule cutadapt: ) -# TODO: rm wrapper rule fastqc: input: - "{sample_dir}/{sample}/{sample}{suffix}", - threads: 6 + '{sample_dir}/{sample}/{sample}{suffix}' + threads: + 1 output: - html="{sample_dir}/{sample}/fastqc/{sample}{suffix}_fastqc.html", - zip="{sample_dir}/{sample}/fastqc/{sample}{suffix}_fastqc.zip", + html='{sample_dir}/{sample}/fastqc/{sample}{suffix}_fastqc.html', + zip='{sample_dir}/{sample}/fastqc/{sample}{suffix}_fastqc.zip', resources: mem_mb=gb(8), - runtime=autobump(hours=2), - script: - utils.wrapper_for("fastqc/wrapper.py") + runtime=autobump(hours=2) + log: + '{sample_dir}/{sample}/fastqc/{sample}{suffix}_fastqc.log', + run: + outdir = os.path.dirname(output.html) or "." 
+ shell( + 'fastqc ' + '--noextract ' + '--quiet ' + '--outdir {outdir} ' + '{input} ' + '{log} ' + ) + outfile = os.path.basename(input[0]) + for s in ['.fastq', '.fq', '.gz', '.bam']: + outfile = outfile.replace(s, '') + out_zip = os.path.join(outdir, outfile + '_fastqc.zip') + if not os.path.abspath(out_zip) == os.path.abspath(output.zip): + shell('mv {out_zip} {output.zip}') + out_html = os.path.join(outdir, outfile + '_fastqc.html') + if not os.path.abspath(out_html) == os.path.abspath(output.html): + shell('mv {out_html} {output.html}') if config["aligner"] == "hisat2": @@ -149,7 +168,7 @@ if config["aligner"] == "hisat2": bam=temporary(patterns["bam"]), log: patterns["bam"] + ".log", - threads: 6 + threads: 16 resources: mem_mb=gb(32), runtime=autobump(hours=8), @@ -173,6 +192,7 @@ if config["aligner"] == "hisat2": "--no-unal " "--threads {threads} " "-S {sam} " + "{params.extra} " "> {log} 2>&1" ) From 227646c8433d0eecf91b4f0b1237be64677117c0 Mon Sep 17 00:00:00 2001 From: Ryan Dale Date: Fri, 10 Jan 2025 23:12:11 -0500 Subject: [PATCH 053/196] chipseq overhaul and simplification --- lib/chipseq.py | 36 +- workflows/chipseq/Snakefile | 971 +++++++----------- .../chipseq/config/chipseq_patterns.yaml | 10 +- workflows/chipseq/config/config.yaml | 51 - 4 files changed, 413 insertions(+), 655 deletions(-) diff --git a/lib/chipseq.py b/lib/chipseq.py index 887bb9f9..62608ed8 100644 --- a/lib/chipseq.py +++ b/lib/chipseq.py @@ -1,9 +1,11 @@ +from snakemake.io import expand + """ Helpers for ChIP-seq. 
""" # Example config for reference -# __example_config__ = { +# { # 'peak_calling': { # [ # { @@ -24,7 +26,32 @@ # ] # } # } +# +# This needs to be expanded out to the following patterns: +# +# [ +# 'data/chipseq_peaks/macs2/rep1/peaks.bigbed', +# 'data/chipseq_peaks/macs2/rep2/peaks.bigbed', +# ] +# +# Which in turn needs these bams: +# +# [ +# expand(patterns['merged_techreps'], label=['input_1', 'ip_1']), +# expand(patterns['merged_techreps'], label=['input_2', 'ip_2']), +# +# +def add_bams_to_peak_calling(config): + d = peak_calling_dict(config) + for key, block in d.items(): + peak_calling_run, algorithm = key + block['ip_bams'] = expand('data/chipseq_merged/{label}/{label}.cutadapt.unique.nodups.merged.bam', label=block['ip']) + block['control_bams'] = expand('data/chipseq_merged/{label}/{label}.cutadapt.unique.nodups.merged.bam', label=block['control']) + block['bed'] = f"data/chipseq_peaks/{algorithm}/{peak_calling_run}/peaks.bed" + block['bigbed'] = f"data/chipseq_peaks/{algorithm}/{peak_calling_run}/peaks.bigbed" + d[key] = block + return d def peak_calling_dict(config, algorithm=None): """ @@ -60,11 +87,6 @@ def peak_calling_dict(config, algorithm=None): if key in d: raise ValueError("peak calling run '{0}' already defined".format(key)) - # If metadata key has been provided, then use that to populate the - # block as default values. - metadata = config['references'][config['organism']][config['aligner']['tag']].get('metadata', {}) - block.update(metadata) - d[key] = block return d @@ -139,7 +161,7 @@ def merged_input_for_ip(sampletable, merged_ip): ... input1 input s2cell-1 s2cell-input-1 ... input3 input s2cell-2 s2cell-input-3 ... input9 input s2cell-1 s2cell-input-1'''), - ... sep='\s+') + ... 
sep='\\s+') >>> merged_input_for_ip(df, 's2cell-gaf-1') diff --git a/workflows/chipseq/Snakefile b/workflows/chipseq/Snakefile index 2b5fc485..24b09dec 100644 --- a/workflows/chipseq/Snakefile +++ b/workflows/chipseq/Snakefile @@ -1,181 +1,110 @@ import sys import os -from textwrap import dedent import yaml -import tempfile import pandas as pd -import numpy as np -import pybedtools -HERE = str(Path(workflow.snakefile).parent) -sys.path.insert(0, HERE + "/../..") -from lib import common, utils, helpers, aligners, chipseq -from lib.patterns_targets import ChIPSeqConfig +sys.path.insert(0, os.path.dirname(workflow.snakefile) + "/../..") +from lib import utils +from lib import chipseq from lib.utils import autobump, gb, hours -# ---------------------------------------------------------------------------- -# -# Search for the string "NOTE:" to look for points of configuration that might -# be helpful for your experiment. -# -# ---------------------------------------------------------------------------- -if not workflow.overwrite_configfiles: - configfile: 'config/config.yaml' +configfile: "config/config.yaml" -config = common.load_config(config) -include: '../references/Snakefile' +include: "../references/Snakefile" -# Verify configuration of config and sampletable files -helpers.preflight(config) -c = ChIPSeqConfig( - config, - config.get('patterns', 'config/chipseq_patterns.yaml') -) +REFERENCES = config.get("reference_dir", "../../references") +sampletable = pd.read_table(config["sampletable"], sep="\t", comment="#") +sampletable = sampletable.set_index(sampletable.columns[0], drop=False) +is_paired = utils.detect_layout(sampletable) == "PE" +is_sra = utils.detect_sra(sampletable) +n = ["1", "2"] if is_paired else ["1"] +SAMPLES = sampletable.iloc[:, 0].values +patterns = yaml.safe_load(open("config/chipseq_patterns.yaml"))["patterns_by_sample"] +peaks = chipseq.add_bams_to_peak_calling(config) -SAMPLES = c.sampletable.iloc[:, 0].values wildcard_constraints: - n = 
'[1,2]', - sample = '|'.join(SAMPLES) + n="[1,2]", + sample="|".join(SAMPLES), +localrules: + symlinks, + symlink_targets, -def wrapper_for(path): - return 'file:' + os.path.join('../..','wrappers', 'wrappers', path) - - -# ---------------------------------------------------------------------------- -# RULES -# ---------------------------------------------------------------------------- - - -# See "patterns and targets" in the documentation for what's going on here. -final_targets = utils.flatten(( - c.targets['bam'], - utils.flatten(c.targets['fastqc']), - [c.targets['fastq_screen']], - [c.targets['multiqc']], - utils.flatten(c.targets['markduplicates']), - utils.flatten(c.targets['bigwig']), - utils.flatten(c.targets['peaks']), - utils.flatten(c.targets['merged_techreps']), - utils.flatten(c.targets['fingerprint']), - utils.flatten(c.targets['bigbed']), - utils.flatten(c.targets['multibigwigsummary']), - utils.flatten(c.targets['plotcorrelation']), -)) - -if config.get('merged_bigwigs', None): - final_targets.extend(utils.flatten(c.targets['merged_bigwig'])) - - -def render_r1_r2(pattern): - return expand(pattern, sample='{sample}', n=c.n) - -def render_r1_only(pattern): - return expand(pattern, sample='{sample}', n=1) rule targets: - """ - Final targets to create - """ - input: final_targets - - -if 'orig_filename' in c.sampletable.columns: - - localrules: symlinks - - # Convert the sampletable to be indexed by the first column, for - # convenience in generating the input/output filenames. - _st = c.sampletable.set_index(c.sampletable.columns[0]) - - def orig_for_sample(wc): - """ - Given a sample, returns either one or two original fastq files - depending on whether the library was single- or paired-end. 
- """ - if c.is_paired: - return _st.loc[wc.sample, ['orig_filename', 'orig_filename_R2']] - return _st.loc[wc.sample, ['orig_filename']] + input: + patterns["multiqc"], + expand(patterns["bigwig"], label=sampletable.label), + [v["bed"] for k, v in peaks.items()], - rule symlinks: - """ - Symlinks files over from original filename - """ - input: - orig_for_sample - output: - render_r1_r2(c.patterns['fastq']) - threads: 1 - resources: - mem_mb=gb(1), - runtime=10, - run: - assert len(output) == len(input), (input, output) - for src, linkname in zip(input, output): - utils.make_relative_symlink(src, linkname) +if is_sra: + include: "../../rules/sra.smk" - rule symlink_targets: - input: c.targets['fastq'] +rule symlinks: + input: + lambda wc: ( + sampletable.loc[wc.sample, ["orig_filename", "orig_filename_R2"]] + if is_paired + else sampletable.loc[wc.sample, ["orig_filename"]] + ), + output: + expand(patterns["fastq"], n=n, allow_missing=True), + threads: 1 + resources: + mem_mb=100, + runtime=10, + run: + assert len(output) == len(input), (input, output) + for src, linkname in zip(input, output): + utils.make_relative_symlink(src, linkname) -if 'Run' in c.sampletable.columns and sum(c.sampletable['Run'].str.startswith('SRR')) > 0: - # Convert the sampletable to be indexed by the first column, for - # convenience in generating the input/output filenames. 
- _st = c.sampletable.set_index(c.sampletable.columns[0]) +rule symlink_targets: + input: + expand( + "data/rnaseq_samples/{sample}/{sample}_R{n}.fastq.gz", sample=SAMPLES, n=n + ), - rule fastq_dump: - output: - fastq=render_r1_r2(c.patterns['fastq']) - log: - render_r1_only(c.patterns['fastq'])[0] + '.log' - params: - is_paired=c.is_paired, - sampletable=_st, - # limit = 100000, # [TEST SETTINGS] - resources: - mem_mb=autobump(gb=8), - runtime=autobump(hours=2) - conda: - '../../wrappers/wrappers/fastq-dump/environment.yaml' - script: - wrapper_for('fastq-dump/wrapper.py') rule cutadapt: - """ - Run cutadapt - """ input: - fastq=render_r1_r2(c.patterns['fastq']) + fastq=expand(patterns["fastq"], n=n, allow_missing=True), output: - fastq=render_r1_r2(c.patterns['cutadapt']) - resources: - mem_mb=gb(2), - runtime=autobump(hours=2) + fastq=expand(patterns["cutadapt"], n=n, allow_missing=True), log: - render_r1_r2(c.patterns['cutadapt'])[0] + '.log' + "data/chipseq_samples/{sample}/{sample}_cutadapt.fastq.gz.log", threads: 6 + resources: + mem_mb=gb(2), + runtime=autobump(hours=2), + params: + extra=( + ( + "--nextseq-trim 20 " + "--overlap 6 " + "--minimum-length 25 " + "-a AGATCGGAAGAGCACACGTCTGAACTCCAGTCA " + ) + + "-A AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGT " + if is_paired + else "" + ), run: - - # NOTE: Change cutadapt params here - if c.is_paired: + if is_paired: shell( "cutadapt " "-o {output[0]} " "-p {output[1]} " - "-a AGATCGGAAGAGCACACGTCTGAACTCCAGTCA " - "-A AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGT " - '--nextseq-trim 20 ' - "--overlap 6 " - '-j {threads} ' - '--minimum-length 25 ' + "-j {threads} " + "{params.extra} " "{input.fastq[0]} " "{input.fastq[1]} " "&> {log}" @@ -184,67 +113,85 @@ rule cutadapt: shell( "cutadapt " "-o {output[0]} " - "-a AGATCGGAAGAGCACACGTCTGAACTCCAGTCA " - '--nextseq-trim 20 ' - "--overlap 6 " - '-j {threads} ' - '--minimum-length 25 ' + "-j {threads} " + "{params.extra} " "{input.fastq[0]} " "&> {log}" ) rule fastqc: - """ - Run FastQC - 
""" input: - '{sample_dir}/{sample}/{sample}{suffix}' - threads: - 6 + "{sample_dir}/{sample}/{sample}{suffix}", + threads: 1 output: - html='{sample_dir}/{sample}/fastqc/{sample}{suffix}_fastqc.html', - zip='{sample_dir}/{sample}/fastqc/{sample}{suffix}_fastqc.zip', + html="{sample_dir}/{sample}/fastqc/{sample}{suffix}_fastqc.html", + zip="{sample_dir}/{sample}/fastqc/{sample}{suffix}_fastqc.zip", resources: - mem_mb=gb(2), - runtime=autobump(hours=2) - script: - wrapper_for('fastqc/wrapper.py') + mem_mb=gb(8), + runtime=autobump(hours=2), + log: + "{sample_dir}/{sample}/fastqc/{sample}{suffix}_fastqc.log", + run: + outdir = os.path.dirname(output.html) or "." + shell( + "fastqc " + "--noextract " + "--quiet " + "--outdir {outdir} " + "{input} " + "&> {log} " + ) + outfile = os.path.basename(input[0]) + for s in [".fastq", ".fq", ".gz", ".bam"]: + outfile = outfile.replace(s, "") + out_zip = os.path.join(outdir, outfile + "_fastqc.zip") + if not os.path.abspath(out_zip) == os.path.abspath(output.zip): + shell("mv {out_zip} {output.zip}") + out_html = os.path.join(outdir, outfile + "_fastqc.html") + if not os.path.abspath(out_html) == os.path.abspath(output.html): + shell("mv {out_html} {output.html}") rule bowtie2: - """ - Map reads with Bowtie2 - """ input: - fastq=common.fill_r1_r2(c.sampletable, c.patterns['cutadapt']), - index=[c.refdict[c.organism][config['aligner']['tag']]['bowtie2']] + fastq=expand(patterns["cutadapt"], n=n, allow_missing=True), + index=multiext( + f"{REFERENCES}/bowtie2/genome", + ".1.bt2", + ".2.bt2", + ".3.bt2", + ".4.bt2", + ".rev.1.bt2", + ".rev.2.bt2", + ".fa", + ), output: - bam=c.patterns['bam'] + bam=temporary(patterns["bam"]), log: - c.patterns['bam'] + '.log' + patterns["bam"] + ".log", threads: 16 resources: mem_mb=gb(32), - runtime=autobump(hours=2) + runtime=autobump(hours=2), + params: + extra="", run: - prefix = aligners.prefix_from_bowtie2_index(input.index) - sam = output.bam.replace('.bam', '.sam') - - if c.is_paired: - 
assert len(input.fastq) == 2 - fastqs = '-1 {0} -2 {1} '.format(*input.fastq) - else: - assert len(input.fastq) == 1 - fastqs = '-U {0} '.format(input.fastq) - + prefix = os.path.commonprefix(input.index).rstrip(".") + sam = output.bam.replace(".bam", ".sam") + fastqs = ( + f"-1 {input.fastq[0]} -2 {input.fastq[1]}" + if is_paired + else f"-U {input.fastq}" + ) shell( "bowtie2 " "-x {prefix} " "{fastqs} " - '--no-unal ' # NOTE: suppress unaligned reads + "--no-unal " "--threads {threads} " "-S {sam} " + "{params.extra} " "> {log} 2>&1" ) @@ -256,271 +203,164 @@ rule bowtie2: rule unique: - """ - Remove multimappers - """ input: - c.patterns['bam'] + patterns["bam"], output: - c.patterns['unique'] + patterns["unique"], threads: 1 resources: mem_mb=gb(1), - runtime=autobump(hours=2) - shell: + runtime=autobump(hours=2), + params: # NOTE: the quality score chosen here should reflect the scores output # by the aligner used. For example, STAR uses 255 as max mapping # quality. - 'samtools view -b -q 20 {input} > {output}' + extra="-q 20", + shell: + "samtools view -b {params.extra} {input} > {output}" rule fastq_count: - """ - Count reads in a FASTQ file - """ input: - fastq='{sample_dir}/{sample}/{sample}{suffix}.fastq.gz' + fastq="{sample_dir}/{sample}/{sample}{suffix}.fastq.gz", output: - '{sample_dir}/{sample}/{sample}{suffix}.fastq.gz.libsize' + "{sample_dir}/{sample}/{sample}{suffix}.fastq.gz.libsize", threads: 1 resources: mem_mb=gb(1), - runtime=autobump(hours=2) + runtime=autobump(hours=2), shell: - 'zcat {input} | echo $((`wc -l`/4)) > {output}' + "zcat {input} | echo $((`wc -l`/4)) > {output}" rule bam_count: - """ - Count reads in a BAM file - """ input: - bam='{sample_dir}/{sample}/{suffix}.bam' + bam="{sample_dir}/{sample}/{suffix}.bam", output: - '{sample_dir}/{sample}/{suffix}.bam.libsize' + "{sample_dir}/{sample}/{suffix}.bam.libsize", threads: 1 resources: mem_mb=gb(2), - runtime=autobump(hours=2) + runtime=autobump(hours=2), shell: - 'samtools view -c 
{input} > {output}' + "samtools view -c {input} > {output}" rule bam_index: - """ - Index a BAM - """ input: - bam='{prefix}.bam' + bam="{prefix}.bam", output: - bai='{prefix}.bam.bai' + bai="{prefix}.bam.bai", threads: 1 resources: mem_mb=gb(2), - runtime=autobump(hours=2) + runtime=autobump(hours=2), shell: - 'samtools index {input} {output}' - - -def fastq_screen_references(): - """ - Returns the Bowtie2 indexes for the configured references from the - `fastq_screen:` section of the config - """ - refs = {} - for i in config['fastq_screen']: - refs[i['label']] = c.refdict[i['organism']][i['tag']]['bowtie2'] - return refs - - -rule fastq_screen: - """ - Run fastq_screen to look for contamination from other genomes - """ - input: - **fastq_screen_references(), - fastq=render_r1_only(rules.cutadapt.output.fastq), - output: - txt=c.patterns['fastq_screen'] - log: - c.patterns['fastq_screen'] + '.log' - threads: 6 - resources: - mem_mb=autobump(gb=4), - runtime=autobump(hours=2) - params: subset=100000 - script: - wrapper_for('fastq_screen/wrapper.py') - - -multiqc_inputs = [ - utils.flatten(c.targets['fastqc']) + - utils.flatten(c.targets['cutadapt']) + - utils.flatten(c.targets['bam']) + - utils.flatten(c.targets['markduplicates']) + - utils.flatten(c.targets['fingerprint']) + - utils.flatten(c.targets['peaks']) + - utils.flatten(c.targets['fastq_screen']) + - utils.flatten(c.targets['plotcorrelation']) -] - -if c.is_paired: - multiqc_inputs.extend(utils.flatten(c.targets['collectinsertsizemetrics']['metrics'])) - -rule multiqc: - """ - Aggregate various QC stats and logs into a single HTML report with MultiQC - """ - # NOTE: if you add more rules and want MultiQC to pick up the output, best - # to add outputs from those rules to the inputs here. 
- input: - files=multiqc_inputs, - config='config/multiqc_config.yaml' - output: - c.targets['multiqc'] - log: - c.targets['multiqc'][0] + '.log' - threads: 1 - resources: - mem_mb=gb(2), - runtime=autobump(hours=2) - run: - analysis_directory = set([os.path.dirname(i) for i in input]) - outdir = os.path.dirname(c.targets['multiqc'][0]) - basename = os.path.basename(c.targets['multiqc'][0]) - shell( - 'LC_ALL=en_US.utf8 LC_LANG=en_US.utf8 ' - 'multiqc ' - '--quiet ' - '--outdir {outdir} ' - '--force ' - '--filename {basename} ' - '--config {input.config} ' - '{analysis_directory} ' - '&> {log} ' - ) + "samtools index {input} {output}" rule markduplicates: - """ - Mark or remove PCR duplicates with Picard MarkDuplicates - """ input: - bam=c.patterns['unique'] + bam=patterns["unique"], output: - bam=c.patterns['markduplicates']['bam'], - metrics=c.patterns['markduplicates']['metrics'] + bam=patterns["markduplicates"]["bam"], + metrics=patterns["markduplicates"]["metrics"], log: - c.patterns['markduplicates']['bam'] + '.log' + patterns["markduplicates"]["bam"] + ".log", threads: 1 resources: mem_mb=gb(32), runtime=autobump(hours=2), - disk_mb=gb(100) + disk_mb=gb(100), params: - # NOTE: Be careful with the memory here; make sure you have enough - # and/or it matches the resources you're requesting in the cluster - # config. 
- java_args='-Xmx20g' - # java_args='-Xmx2g' # [TEST SETTINGS -1] + java_args="-Xmx20g", # [disable for test] + # java_args='-Xmx2g' # [enable for test] shell: - 'picard ' - '{params.java_args} ' - 'MarkDuplicates ' - 'INPUT={input.bam} ' - 'OUTPUT={output.bam} ' - 'REMOVE_DUPLICATES=true ' - 'METRICS_FILE={output.metrics} ' - 'VALIDATION_STRINGENCY=LENIENT ' - '&> {log}' + "picard " + "{params.java_args} " + "MarkDuplicates " + "INPUT={input.bam} " + "OUTPUT={output.bam} " + "REMOVE_DUPLICATES=true " + "METRICS_FILE={output.metrics} " + "VALIDATION_STRINGENCY=LENIENT " + "&> {log}" rule merge_techreps: - """ - Technical replicates are merged and then re-deduped. - - If there's only one technical replicate, its unique, nodups bam is simply - symlinked. - """ input: lambda wc: expand( - c.patterns['markduplicates']['bam'], - sample=common.get_techreps(c.sampletable, wc.label), - ) + patterns["markduplicates"]["bam"], + sample=utils.get_techreps(sampletable, wc.label), + ), output: - bam=c.patterns['merged_techreps'], - metrics=c.patterns['merged_techreps'] + '.metrics' + bam=patterns["merged_techreps"], + metrics=patterns["merged_techreps"] + ".metrics", log: - c.patterns['merged_techreps'] + '.log' + patterns["merged_techreps"] + ".log", threads: 1 resources: mem_mb=gb(32), runtime=autobump(hours=2), disk_mb=gb(100), params: - # NOTE: Be careful with the memory here; make sure you have enough - # and/or it matches the resources you're requesting in the cluster - # config. 
- java_args='-Xmx32g' - # java_args='-Xmx2g' # [TEST SETTINGS -1] + java_args="-Xmx32g", # [disable for test] + # java_args='-Xmx2g' # [enable for test] script: - wrapper_for('combos/merge_and_dedup/wrapper.py') + "../../scripts/merge_and_dedup.py" + + +if is_paired: -if c.is_paired: rule collectinsertsizemetrics: input: - bam=c.patterns['markduplicates']['bam'], + bam=patterns["markduplicates"]["bam"], output: - pdf=c.patterns['collectinsertsizemetrics']['pdf'], - metrics=c.patterns['collectinsertsizemetrics']['metrics'] + pdf=patterns["collectinsertsizemetrics"]["pdf"], + metrics=patterns["collectinsertsizemetrics"]["metrics"], log: - c.patterns['collectinsertsizemetrics']['metrics'] + '.log' + patterns["collectinsertsizemetrics"]["metrics"] + ".log", threads: 1 resources: mem_mb=gb(32), - runtime=autobump(hours=2) + runtime=autobump(hours=2), params: - java_args='-Xmx20g' - # java_args='-Xmx2g' # [TEST SETTINGS -1] + java_args="-Xmx20g", # [disable for test] + # java_args='-Xmx2g' # [enable for test] shell: - 'picard ' - '{params.java_args} ' - 'CollectInsertSizeMetrics ' - 'I={input.bam} ' - 'O={output.metrics} ' - 'H={output.pdf} ' - '&> {log} ' + "picard " + "{params.java_args} " + "CollectInsertSizeMetrics " + "I={input.bam} " + "O={output.metrics} " + "H={output.pdf} " + "&> {log} " -rule bigwig: - """ - Create a bigwig. - See note below about normalizing! 
- """ +rule bigwig: input: - bam=c.patterns['merged_techreps'], - bai=c.patterns['merged_techreps'] + '.bai', + bam=patterns["merged_techreps"], + bai=patterns["merged_techreps"] + ".bai", output: - c.patterns['bigwig'] + patterns["bigwig"], log: - c.patterns['bigwig'] + '.log' + patterns["bigwig"] + ".log", threads: 1 resources: mem_mb=gb(16), - runtime=autobump(hours=2) + runtime=autobump(hours=2), shell: - 'bamCoverage ' - '--bam {input.bam} ' - '-o {output} ' - '-p {threads} ' - '--minMappingQuality 20 ' - '--ignoreDuplicates ' + "bamCoverage " + "--bam {input.bam} " + "-o {output} " + "-p {threads} " + "--minMappingQuality 20 " + "--ignoreDuplicates " # Can't use the CPM normalization for testing due to <1000 reads total # in example data; keep uncommented when running in production - # [TEST SETTINGS +1] - '--normalizeUsing CPM ' - '--extendReads 300 ' - '&> {log}' + "--normalizeUsing CPM " # [disable for test] + "--extendReads 300 " + "&> {log}" rule fingerprint: @@ -531,175 +371,114 @@ rule fingerprint: Note: uses the merged techreps. 
""" input: - bams=lambda wc: expand(c.patterns['merged_techreps'], label=wc.ip_label), - control=lambda wc: expand(c.patterns['merged_techreps'], label=chipseq.merged_input_for_ip(c.sampletable, wc.ip_label)), - bais=lambda wc: expand(c.patterns['merged_techreps'] + '.bai', label=wc.ip_label), - control_bais=lambda wc: expand(c.patterns['merged_techreps'] + '.bai', label=chipseq.merged_input_for_ip(c.sampletable, wc.ip_label)), + bams=lambda wc: expand(patterns["merged_techreps"], label=wc.ip_label), + control=lambda wc: expand( + patterns["merged_techreps"], + label=chipseq.merged_input_for_ip(sampletable, wc.ip_label), + ), + bais=lambda wc: expand(patterns["merged_techreps"] + ".bai", label=wc.ip_label), + control_bais=lambda wc: expand( + patterns["merged_techreps"] + ".bai", + label=chipseq.merged_input_for_ip(sampletable, wc.ip_label), + ), output: - plot=c.patterns['fingerprint']['plot'], - raw_counts=c.patterns['fingerprint']['raw_counts'], - metrics=c.patterns['fingerprint']['metrics'] + plot=patterns["fingerprint"]["plot"], + raw_counts=patterns["fingerprint"]["raw_counts"], + metrics=patterns["fingerprint"]["metrics"], threads: 8 - log: c.patterns['fingerprint']['metrics'] + '.log' + log: + patterns["fingerprint"]["metrics"] + ".log", threads: 1 resources: mem_mb=gb(32), - runtime=autobump(hours=2) + runtime=autobump(hours=2), run: if len(input.control) == 0: jsdsample_arg = "" else: - jsdsample_arg = '--JSDsample ' + str(input.control) + jsdsample_arg = "--JSDsample " + str(input.control) shell( - 'plotFingerprint ' '--bamfiles {input.bams} ' - '-p {threads} ' + "plotFingerprint " + "--bamfiles {input.bams} " + "-p {threads} " # The JSDsample argument is disabled for testing as it dramatically # increases the run time. 
- # [TEST SETTINGS +1] - '{jsdsample_arg} ' - '--smartLabels ' - '--extendReads=300 ' - '--skipZeros ' - '--outQualityMetrics {output.metrics} ' - '--outRawCounts {output.raw_counts} ' - '--plotFile {output.plot} ' + "{jsdsample_arg} " # [disable for test] + "--smartLabels " + "--extendReads=300 " + "--skipZeros " + "--outQualityMetrics {output.metrics} " + "--outRawCounts {output.raw_counts} " + "--plotFile {output.plot} " # Default is 500k; use fewer to speed up testing: - # '--numberOfSamples 50 ' # [TEST SETTINGS ] - '&> {log} ' + # '--numberOfSamples 50 ' # [enable for test] + "&> {log} " '&& sed -i "s/NA/0.0/g" {output.metrics} ' ) -rule sicer: - """ - Run the SICER peak caller - """ - input: - ip=lambda wc: - expand( - c.patterns['merged_techreps'], - label=chipseq.samples_for_run(config, wc.sicer_run, 'sicer', 'ip'), - ), - control=lambda wc: - expand( - c.patterns['merged_techreps'], - label=chipseq.samples_for_run(config, wc.sicer_run, 'sicer', 'control'), - ), - chromsizes=refdict[c.organism][config['aligner']['tag']]['chromsizes'], - output: - bed=c.patterns['peaks']['sicer'] - log: - c.patterns['peaks']['sicer'] + '.log' - resources: - mem_mb=gb(16), - runtime=autobump(hours=2) - params: - block=lambda wc: chipseq.block_for_run(config, wc.sicer_run, 'sicer') - wrapper: - wrapper_for('sicer') rule macs2: """ Run the macs2 peak caller """ input: - ip=lambda wc: - expand( - c.patterns['merged_techreps'], - label=chipseq.samples_for_run(config, wc.macs2_run, 'macs2', 'ip'), - ), - control=lambda wc: - expand( - c.patterns['merged_techreps'], - label=chipseq.samples_for_run(config, wc.macs2_run, 'macs2', 'control'), - ), - chromsizes=refdict[c.organism][config['aligner']['tag']]['chromsizes'], + ip=lambda wc: expand( + patterns["merged_techreps"], + label=chipseq.samples_for_run(config, wc.macs2_run, "macs2", "ip"), + ), + control=lambda wc: expand( + patterns["merged_techreps"], + label=chipseq.samples_for_run(config, wc.macs2_run, "macs2", "control"), + 
), + chromsizes=rules.chromsizes.output, output: - bed=c.patterns['peaks']['macs2'] + bed=patterns["peaks"]["macs2"], resources: mem_mb=gb(16), - runtime=autobump(hours=2) + runtime=autobump(hours=2), log: - c.patterns['peaks']['macs2'] + '.log' + patterns["peaks"]["macs2"] + ".log", params: - block=lambda wc: chipseq.block_for_run(config, wc.macs2_run, 'macs2') - wrapper: - wrapper_for('macs2/callpeak') + block=lambda wc: chipseq.block_for_run(config, wc.macs2_run, "macs2"), + script: + "../../scripts/macs2_callpeak.py" + -# Epic2 peak caller -# See https://github.com/biocore-ntnu/epic2 rule epic2: """ Run the epic2 peak caller """ input: - ip=lambda wc: - expand( - c.patterns['merged_techreps'], - label=chipseq.samples_for_run(config, wc.epic2_run, 'epic2', 'ip'), - ), - control=lambda wc: - expand( - c.patterns['merged_techreps'], - label=chipseq.samples_for_run(config, wc.epic2_run, 'epic2', 'control'), - ), - bai=lambda wc: # epic2 requires both .bam and .bam.bai (bam index) files (.bam.bai is not explicitly) - expand( - c.patterns['merged_techreps'] + '.bai', - label=chipseq.samples_for_run(config, wc.epic2_run, 'epic2', 'ip'), - ), - chromsizes=refdict[c.organism][config['aligner']['tag']]['chromsizes'] + ip=lambda wc: expand( + patterns["merged_techreps"], + label=chipseq.samples_for_run(config, wc.epic2_run, "epic2", "ip"), + ), + control=lambda wc: expand( + patterns["merged_techreps"], + label=chipseq.samples_for_run(config, wc.epic2_run, "epic2", "control"), + ), + bai=lambda wc: expand( + patterns["merged_techreps"] + ".bai", + label=chipseq.samples_for_run(config, wc.epic2_run, "epic2", "ip"), + ) + + expand( + patterns["merged_techreps"] + ".bai", + label=chipseq.samples_for_run(config, wc.epic2_run, "epic2", "control"), + ), + chromsizes=rules.chromsizes.output, output: - bed=c.patterns['peaks']['epic2'] + bed=patterns["peaks"]["epic2"], resources: mem_mb=gb(16), - runtime=autobump(hours=2) - log: - c.patterns['peaks']['epic2'] + '.log' - params: - 
block=lambda wc: chipseq.block_for_run(config, wc.epic2_run, 'epic2'), - is_paired=c.is_paired - wrapper: - wrapper_for('epic2') - - -rule spp: - """ - Run the SPP peak caller - """ - input: - ip=lambda wc: - expand( - c.patterns['merged_techreps'], - label=chipseq.samples_for_run(config, wc.spp_run, 'spp', 'ip'), - ), - control=lambda wc: - expand( - c.patterns['merged_techreps'], - label=chipseq.samples_for_run(config, wc.spp_run, 'spp', 'control'), - ), - chromsizes=refdict[c.organism][config['aligner']['tag']]['chromsizes'], - output: - bed=c.patterns['peaks']['spp'], - enrichment_estimates=c.patterns['peaks']['spp'] + '.est.wig', - smoothed_enrichment_mle=c.patterns['peaks']['spp'] + '.mle.wig', - rdata=c.patterns['peaks']['spp'] + '.RData' + runtime=autobump(hours=2), log: - c.patterns['peaks']['spp'] + '.log' - resources: - mem_mb=gb(16), - runtime=autobump(hours=2) + patterns["peaks"]["epic2"] + ".log", params: - block=lambda wc: chipseq.block_for_run(config, wc.spp_run, 'spp'), - keep_tempfiles=False, - # NOTE: Be careful with the memory here; make sure you have enough - # and/or it matches the resources you're requesting in the cluster - # config. - java_args='-Xmx24g', - # java_args='-Xmx2g', # [TEST SETTINGS -1] - threads: 2 - wrapper: - wrapper_for('spp') + block=lambda wc: chipseq.block_for_run(config, wc.epic2_run, "epic2"), + is_paired=is_paired, + script: + "../../scripts/epic2.py" rule bed_to_bigbed: @@ -707,59 +486,17 @@ rule bed_to_bigbed: Convert BED to bigBed """ input: - bed='{prefix}.bed', - chromsizes=refdict[c.organism][config['aligner']['tag']]['chromsizes'] - output: '{prefix}.bigbed' + bed="{prefix}.bed", + chromsizes=rules.chromsizes.output, + output: + "{prefix}.bigbed", resources: mem_mb=gb(2), - runtime=autobump(hours=2) - log: '{prefix}.bigbed.log' - run: - # Based on the filename, identify the algorithm. Based on the contents, - # identify the format. 
- algorithm = os.path.basename(os.path.dirname(input.bed)) - kind = chipseq.detect_peak_format(input.bed) - - # bedToBigBed doesn't handle zero-size files - # bigbed is not created from epic2-generated peaks - if os.stat(input.bed).st_size == 0: - shell("touch {output}") - elif kind == 'narrowPeak': - _as = '../../include/autosql/bigNarrowPeak.as' - _type = 'bed6+4' - names=[ - 'chrom', 'chromStart', 'chromEnd', 'name', 'score', - 'strand', 'signalValue', 'pValue', 'qValue', 'peak'] - elif kind == 'broadPeak': - _as = '../../include/autosql/bigBroadPeak.as' - _type = 'bed6+3' - names=[ - 'chrom', 'chromStart', 'chromEnd', 'name', 'score', - 'strand', 'signalValue', 'pValue', 'qValue'] - elif kind == 'epic2Input': - _as = f'../../include/autosql/{kind}Peak.as' - _type = 'bed6+4' - names=[ - 'chrom', 'chromStart', 'chromEnd', 'pValue', 'score', - 'strand', 'ChIPCount', 'InputCount', 'FDR', 'log2FoldChange'] - elif kind == 'epic2NoInput': - _as = f'../../include/autosql/{kind}Peak.as' - _type = 'bed6' - names=[ - 'chrom', 'chromStart', 'chromEnd', 'ChIPCount', 'score', - 'strand'] - else: - raise ValueError("Unhandled format for {0}".format(input.bed)) - - df = pd.read_table(input.bed, index_col=False, names=names) - df['score'] = df['score'] - df['score'].min() - df['score'] = (df['score'] / df['score'].max()) * 1000 - df['score'] = df['score'].replace([np.inf, -np.inf], np.nan).fillna(0) - df['score'] = df['score'].astype(int) - df.to_csv(output[0] + '.tmp', sep='\t', index=False, header=False) - - shell('bedToBigBed -as={_as} -type={_type} {output}.tmp {input.chromsizes} {output} &> {log}') - shell('rm {output}.tmp') + runtime=autobump(hours=2), + log: + "{prefix}.bigbed.log", + script: + "../../scripts/bed_to_bigbed.py" rule multibigwigsummary: @@ -767,25 +504,25 @@ rule multibigwigsummary: Summarize the bigWigs across genomic bins """ input: - c.targets['bigwig'] + expand(patterns["bigwig"], label=sampletable.label), output: - 
npz=c.targets['multibigwigsummary']['npz'], - tab=c.targets['multibigwigsummary']['tab'] + npz=patterns["multibigwigsummary"]["npz"], + tab=patterns["multibigwigsummary"]["tab"], threads: 16 resources: mem_mb=gb(16), - runtime=autobump(hours=2) + runtime=autobump(hours=2), run: # from the input files, figure out the sample name. - labels = ' '.join([i.split('/')[-2] for i in input]) + labels = " ".join([i.split("/")[-2] for i in input]) shell( - 'multiBigwigSummary ' - 'bins ' - '-b {input} ' - '--labels {labels} ' - '--numberOfProcessors {threads} ' - '-out {output.npz} ' - '--outRawCounts {output.tab}' + "multiBigwigSummary " + "bins " + "-b {input} " + "--labels {labels} " + "--numberOfProcessors {threads} " + "-out {output.npz} " + "--outRawCounts {output.tab}" ) @@ -794,22 +531,21 @@ rule plotcorrelation: Plot a heatmap of correlations across all samples """ input: - c.targets['multibigwigsummary']['npz'] + patterns["multibigwigsummary"]["npz"], output: - heatmap=c.targets['plotcorrelation']['heatmap'], - tab=c.targets['plotcorrelation']['tab'] + heatmap=patterns["plotcorrelation"]["heatmap"], + tab=patterns["plotcorrelation"]["tab"], resources: mem_mb=gb(2), - runtime=autobump(hours=2) + runtime=autobump(hours=2), shell: - 'plotCorrelation ' - '--corData {input} ' - '--corMethod spearman ' - '--whatToPlot heatmap ' - '--plotFile {output.heatmap} ' - '--colorMap Reds ' - '--outFileCorMatrix {output.tab}' - + "plotCorrelation " + "--corData {input} " + "--corMethod spearman " + "--whatToPlot heatmap " + "--plotFile {output.heatmap} " + "--colorMap Reds " + "--outFileCorMatrix {output.tab}" # NOTE: if you're expecting negative correlation, try a divergent # colormap and setting the min/max to ensure that the colomap is # centered on zero: @@ -817,45 +553,88 @@ rule plotcorrelation: # '--zMin -1 ' # '--zMax 1 ' -if 'merged_bigwigs' in config: - rule merge_bigwigs: - """ - Merge together bigWigs as specified in the config ("merged_bigwigs" - section). 
- """ - input: - bigwigs=lambda wc: expand( - c.patterns['bigwig'], - label=config['merged_bigwigs'][wc.merged_bigwig_label], - ), - chromsizes=refdict[c.organism][config['aligner']['tag']]['chromsizes'], - output: - c.patterns['merged_bigwig'] - resources: - mem_mb=gb(16), - runtime=autobump(hours=2) - log: - c.patterns['merged_bigwig'] + '.log' - script: - wrapper_for('average-bigwigs/wrapper.py') rule idxstats: - """ - Run samtools idxstats on sample bams - """ input: - bam=c.patterns['markduplicates']['bam'], - bai=c.patterns['markduplicates']['bam'] + '.bai' + bam=patterns["markduplicates"]["bam"], + bai=patterns["markduplicates"]["bam"] + ".bai", output: - txt=c.patterns['samtools']['idxstats'] + txt=patterns["samtools"]["idxstats"], resources: mem_mb=gb(16), - runtime=autobump(hours=2) - log: - c.patterns['samtools']['idxstats'] + '.log' + runtime=autobump(hours=2), + log: + patterns["samtools"]["idxstats"] + ".log", + shell: + "samtools idxstats {input.bam} 2> {log} 1> {output.txt}" + + +rule flagstat: + input: + bam=patterns["markduplicates"]["bam"], + bai=patterns["markduplicates"]["bam"] + ".bai", + output: + patterns["samtools"]["flagstat"], + log: + patterns["samtools"]["flagstat"] + ".log", + shell: + "samtools flagstat {input.bam} > {output}" + + +rule samtools_stats: + input: + bam=patterns["markduplicates"]["bam"], + bai=patterns["markduplicates"]["bam"] + ".bai", + output: + patterns["samtools"]["stats"], + log: + patterns["samtools"]["stats"] + ".log", + shell: + "samtools stats {input.bam} > {output}" + + +rule multiqc: + input: + expand(patterns["bam"], sample=SAMPLES), + expand(patterns["fastqc"]["raw"], sample=SAMPLES), + expand(patterns["fastqc"]["cutadapt"], sample=SAMPLES), + expand(patterns["fastqc"]["bam"], sample=SAMPLES), + expand(patterns["bigwig"], label=sampletable.label), + expand(patterns["samtools"]["idxstats"], sample=SAMPLES), + expand(patterns["samtools"]["flagstat"], sample=SAMPLES), + expand(patterns["samtools"]["stats"], 
sample=SAMPLES), + expand(patterns["merged_techreps"], label=sampletable.label), + expand( + patterns["fingerprint"]["metrics"], + ip_label=sampletable.loc[sampletable.antibody != "input", "label"], + ), + expand(patterns["collectinsertsizemetrics"], sample=SAMPLES) + if is_paired + else [], + [v["bigbed"] for v in peaks.values()], + patterns["multibigwigsummary"]["tab"], + patterns["plotcorrelation"]["tab"], + config="config/multiqc_config.yaml", + output: + patterns["multiqc"], + log: + patterns["multiqc"] + ".log", + threads: 1 + resources: + mem_mb=gb(2), + runtime=autobump(hours=2), run: + analysis_directory = "data" + outdir = os.path.dirname(output[0]) + basename = os.path.basename(output[0]) shell( - 'samtools idxstats {input.bam} 2> {log} 1> {output.txt}' + "LC_ALL=en_US.utf8 LC_LANG=en_US.utf8 " + "multiqc " + "--quiet " + "--outdir {outdir} " + "--force " + "--filename {basename} " + "--config {input.config} " + "{analysis_directory} " + "&> {log} " ) - -# vim: ft=python diff --git a/workflows/chipseq/config/chipseq_patterns.yaml b/workflows/chipseq/config/chipseq_patterns.yaml index 3e44107a..90b511c9 100644 --- a/workflows/chipseq/config/chipseq_patterns.yaml +++ b/workflows/chipseq/config/chipseq_patterns.yaml @@ -49,7 +49,15 @@ patterns_by_sample: metrics: 'data/chipseq_samples/{sample}/{sample}.cutadapt.unique.nodups.bam.collectinsertsizemetrics.metrics' samtools: - idxstats: 'data/rnaseq_samples/{sample}/idxstat_{sample}.txt' + idxstats: 'data/rnaseq_samples/{sample}/samtools_idxstats_{sample}.txt' + flagstat: 'data/rnaseq_samples/{sample}/samtools_flagstat_{sample}.txt' + stats: 'data/rnaseq_samples/{sample}/samtools_stats_{sample}.txt' + + peaks: + macs2: 'data/chipseq_peaks/macs2/{macs2_run}/peaks.bed' + spp: 'data/chipseq_peaks/spp/{spp_run}/peaks.bed' + sicer: 'data/chipseq_peaks/sicer/{sicer_run}/peaks.bed' + epic2: 'data/chipseq_peaks/epic2/{epic2_run}/peaks.bed' patterns_by_peaks: peaks: diff --git a/workflows/chipseq/config/config.yaml 
b/workflows/chipseq/config/config.yaml index 591fe13b..a8d10142 100644 --- a/workflows/chipseq/config/config.yaml +++ b/workflows/chipseq/config/config.yaml @@ -39,33 +39,10 @@ chipseq: # merging step of the workflow merges and de-dupes appropriately so that the # peak callers only see BAMs with all duplicates removed. # - # The "extra" block is used to pass extra information to the peak-caller in - # a run-specific manner. Check the wrapper README for details on this. For - # example, the macs2 wrapper passes `extra` verbatim to the command line, but - # the spp wrapper handles things differently. - # # Each wrapper is built to accept either single or multiple BAMs and output # at least a BED file of peaks. # peak_calling: - - label: gaf-embryo-sicer - algorithm: sicer - ip: - - gaf-embryo-1 - control: - - input-embryo-1 - redundancy_threshold: 1 - window_size: 200 - fragment_size: 150 - # optional user-specified override mappable genome proportion if - # specified here, SICER will use this value instead of the value specific - # to the genome build if NOT specified here, SICER will use the - # mappability value for your genome build - effective_genome_fraction: 0.75 - genome_build: dm6 - gap_size: 600 - fdr: 0.01 - - label: gaf-embryo-1 algorithm: macs2 @@ -80,23 +57,6 @@ chipseq: effective_genome_count: 7e7 extra: '--nomodel --extsize 147' - - label: gaf-embryo-1 - algorithm: spp - ip: - - gaf-embryo-1 - control: - - input-embryo-1 - extra: - fdr: 0.3 - zthr: 4 - - - label: gaf-embryo-1-defaults - algorithm: spp - ip: - - gaf-embryo-1 - control: - - input-embryo-1 - - label: gaf-wingdisc-pooled algorithm: macs2 ip: @@ -107,17 +67,6 @@ chipseq: - input-wingdisc-2 extra: '--nomodel --extsize 147' - - label: gaf-wingdisc-pooled - algorithm: spp - ip: - - gaf-wingdisc-1 - - gaf-wingdisc-2 - control: - - input-wingdisc-1 - # - input-wingdisc-2 - extra: - fdr: 0.5 - zthr: 4 - label: gaf-wingdisc-pooled-1 algorithm: epic2 From 413620746f8687399a12b800be4fca3076ca5406 
Mon Sep 17 00:00:00 2001 From: Ryan Dale Date: Sun, 12 Jan 2025 11:29:59 -0500 Subject: [PATCH 054/196] clean up some tests --- lib/patterns_targets.py | 256 ------------------ .../{star_1pass.tsv => hisat2.tsv} | 0 test/test_configs/override.yaml | 14 - 3 files changed, 270 deletions(-) delete mode 100644 lib/patterns_targets.py rename test/test_configs/{star_1pass.tsv => hisat2.tsv} (100%) delete mode 100644 test/test_configs/override.yaml diff --git a/lib/patterns_targets.py b/lib/patterns_targets.py deleted file mode 100644 index 08fedb26..00000000 --- a/lib/patterns_targets.py +++ /dev/null @@ -1,256 +0,0 @@ -""" -This module handles the reading and filling-in of patterns. It can be used from -within Snakefiles or in downstream (figure-making) scripts. -""" - -import os -import collections -import yaml -from . import utils -from . import chipseq -from snakemake.io import expand - -HERE = os.path.abspath(os.path.dirname(__file__)) - -# Note: when adding support for new peak callers, add them here. -PEAK_CALLERS = ['macs2', 'spp', 'sicer', 'epic2'] - - -def update_recursive(d, u): - """ - Update dictionary `d` with items in dictionary `u`, recursively - """ - for k, v in u.items(): - if isinstance(v, collections.abc.Mapping): - d[k] = update_recursive(d.get(k, {}), v) - else: - d[k] = v - return d - - -class SeqConfig(object): - def __init__(self, config, patterns, workdir=None): - """ - This class takes care of common tasks related to config and patterns - files (reading the sampletable, etc) but is intended to be subclassed. - - Parameters - ---------- - config : str or dict - - patterns : str - Path to patterns YAML file - - workdir : str - Config, patterns, and all paths in `config` should be interpreted - as relative to `workdir` - """ - self.path = None - self.workdir = '.' 
- if workdir is not None: - config = os.path.join(workdir, config) - patterns = os.path.join(workdir, patterns) - self.workdir = workdir - - self.config = config - - stranded = self.config.get('stranded', None) - self.stranded = None - if stranded: - if stranded in ('unstranded'): - self.stranded = 'unstranded' - elif stranded in ('fr-firststrand', 'ISR', 'SR', 'reverse'): - self.stranded = 'fr-firststrand' - elif stranded in ('fr-secondstrand', 'ISF', 'SF', 'forward'): - self.stranded = 'fr-secondstrand' - - # Read the config file and extract all sort of useful bits. This mostly - # uses the `common` module to handle the details. - self.samples, self.sampletable = utils.get_sampletable(self.config) - self.patterns = yaml.load(open(patterns), Loader=yaml.FullLoader) - self.is_paired = utils.detect_layout(self.sampletable) == 'PE' - if self.is_paired: - self.n = [1, 2] - else: - self.n = [1] - if 'Run' in self.sampletable.columns and sum(self.sampletable['Run'].str.startswith('SRR')) > 0: - self.is_sra = True - else: - self.is_sra = False - - ##########################utils.preflight(self.config) - -class RNASeqConfig(SeqConfig): - def __init__(self, config, patterns, workdir=None): - """ - Config object specific to RNA-seq workflows. - - Fills in patterns to create targets by handling the by-sample and - by-aggregate sections separately. 
- - Parameters - ---------- - - config : dict - - patterns : str - Path to patterns YAML file - - workdir : str - Config, patterns, and all paths in `config` should be interpreted - as relative to `workdir` - """ - SeqConfig.__init__(self, config, patterns, workdir) - - self.fill = dict(sample=self.samples, n=self.n) - self.patterns_by_aggregation = self.patterns.pop('patterns_by_aggregate', None) - self.targets = utils.fill_patterns(self.patterns, self.fill) - - # If the sampletable is from an sra metadata table, then we need to set the value of - # 'orig_filename' for each of the samples to where the fastq was downloaded - if self.is_sra: - self.sampletable['orig_filename'] = expand(self.patterns["sra_fastq"], sample=self.samples, n=1) - if self.is_paired: - self.sampletable['orig_filename_R2'] = expand(self.patterns["sra_fastq"], sample=self.samples, n=2) - - # Then the aggregation - if self.patterns_by_aggregation is not None and 'merged_bigwigs' in self.config: - self.fill_by_aggregation = dict( - merged_bigwig_label=self.config['merged_bigwigs'].keys(), - ) - self.targets_by_aggregation = utils.fill_patterns( - self.patterns_by_aggregation, - self.fill_by_aggregation - ) - self.targets.update(self.targets_by_aggregation) - self.patterns.update(self.patterns_by_aggregation) - - #########################utils.rnaseq_preflight(self) - - -class ChIPSeqConfig(SeqConfig): - def __init__(self, config, patterns, workdir=None): - """ - Config object specific to ChIP-seq workflows. - - Fills in patterns to create targets by handling the by-sample, by-peak, - and by-aggregate sections separately. 
- - Parameters - ---------- - - config : dict - - patterns : str - Path to patterns YAML file - - workdir : str - Config, patterns, and all paths in `config` should be interpreted - as relative to `workdir` - """ - SeqConfig.__init__(self, config, patterns, workdir) - - self.targets = {} - - # For ChIP-seq, the structure of the patterns is quite different for - # samples than it is for peaks. For example, the peaks do not have any - # sample info in the filenames but aggregate possibly many different samples - # - # So construct them separately, and then later update self.patterns and - # self.targets. - # - # The averaged bigwigs are also aggregated, but in a different way. - # They will be handled separately. - # - # First, the samples... - self.patterns_by_sample = self.patterns['patterns_by_sample'] - self.fill_by_sample = dict( - n=self.n, - sample=self.samples.values, - label=self.sampletable.label.values, - ip_label=self.sampletable.label[ - self.sampletable.antibody != 'input'].values - ) - self.targets_by_sample = utils.fill_patterns( - self.patterns_by_sample, self.fill_by_sample) - - self.targets.update(self.targets_by_sample) - self.patterns.update(self.patterns_by_sample) - - # Then the aggregation... - self.patterns_by_aggregation = self.patterns.pop('patterns_by_aggregate', None) - if self.patterns_by_aggregation is not None and 'merged_bigwigs' in self.config: - self.fill_by_aggregation = dict( - merged_bigwig_label=self.config['merged_bigwigs'].keys(), - ) - self.targets_by_aggregation = utils.fill_patterns( - self.patterns_by_aggregation, - self.fill_by_aggregation - ) - self.targets.update(self.targets_by_aggregation) - self.patterns.update(self.patterns_by_aggregation) - - # Then the peaks... - # - - self.patterns_by_peaks = self.patterns['patterns_by_peaks'] - self.targets_for_peaks = {} - - # We need to fill in just those peak-calling runs that are specified - # for each peak-caller. 
For reference, here's an example - # `patterns_by_peaks` from the YAML: - # - # peaks: - # macs2: '{peak_calling}/macs2/{macs2_run}/peaks.bed' - # spp: '{peak_calling}/spp/{spp_run}/peaks.bed' - # bigbed: - # macs2: '{peak_calling}/macs2/{macs2_run}/peaks.bigbed' - # spp: '{peak_calling}/spp/{spp_run}/peaks.bigbed' - - - # Also note that the snakefile's all rule uses - # utils.flatten(c.targets['peaks']), but in the case where no - # peak-calling runs are specified these should be initialized, - # otherwise we'll get a KeyError. - self.targets['peaks'] = [] - self.targets['bigbed'] = [] - - for pc in PEAK_CALLERS: - # Extract out just the subset of `patterns_by_peaks` for this - # peak-caller e.g., from the example above, if pc='macs2' this - # would only be: - # - # peaks: - # macs2: '{peak_calling}/macs2/{macs2_run}/peaks.bed' - # bigbed: - # macs2: '{peak_calling}/macs2/{macs2_run}/peaks.bigbed' - # - _peak_patterns = { - k: {pc: v[pc]} for k, v in self.patterns_by_peaks.items() - } - - - # Fix for issue #166, which was caused by commit 8a211122: - # - # If no runs for the peak-caller are configured, this will be - # empty and we should continue on. - peaks_to_fill = list(chipseq.peak_calling_dict(self.config, algorithm=pc).keys()) - - if not peaks_to_fill: - continue - - _fill = {pc + '_run': peaks_to_fill} - - # The trick here is the recursive updating of targets_for_peaks. - # We're adding the filled-in runs of each peak caller to the - # targets as they're built. 
- update_recursive( - self.targets_for_peaks, - utils.fill_patterns(_peak_patterns, _fill) - ) - - - self.targets.update(self.targets_for_peaks) - self.patterns.update(self.patterns_by_peaks) - - utils.chipseq_preflight(self) diff --git a/test/test_configs/star_1pass.tsv b/test/test_configs/hisat2.tsv similarity index 100% rename from test/test_configs/star_1pass.tsv rename to test/test_configs/hisat2.tsv diff --git a/test/test_configs/override.yaml b/test/test_configs/override.yaml deleted file mode 100644 index bd05a925..00000000 --- a/test/test_configs/override.yaml +++ /dev/null @@ -1,14 +0,0 @@ -# Due to the way Snakemake recursively merges config items, we need to -# recursively reset this dictonary to override the default one in order to -# allow arbitrary other sample names. -# -# Use it like this -# -# snakemake --configfile ../../test/override.yaml --config sampletable=/path/to/tsv -# -merged_bigwigs: - control_pos: - pos: [] - treatment_all: - pos: [] - neg: [] From d7bb4924773b29cd987ee6fb45c0e44831387aae Mon Sep 17 00:00:00 2001 From: Ryan Dale Date: Sun, 12 Jan 2025 11:30:18 -0500 Subject: [PATCH 055/196] convert rrna table to script --- scripts/rrna_libsizes_table.py | 58 ++++++++++++++++++++++++++++++++++ 1 file changed, 58 insertions(+) create mode 100644 scripts/rrna_libsizes_table.py diff --git a/scripts/rrna_libsizes_table.py b/scripts/rrna_libsizes_table.py new file mode 100644 index 00000000..f71d48bc --- /dev/null +++ b/scripts/rrna_libsizes_table.py @@ -0,0 +1,58 @@ +""" +Prepares a TSV and JSON file for multiqc to pick up and display as a sortable +table +""" +import sys +import os +import pandas as pd +import yaml + +sys.path.insert(0, os.path.dirname(__file__) + "/..") +from lib import utils + + +def rrna_sample(f): + return utils.extract_wildcards(snakemake.config["patterns"]["rrna"]["libsize"], f)["sample"] + + +def sample(f): + return utils.extract_wildcards(snakemake.config["patterns"]["libsizes"]["cutadapt"], f)["sample"] + + +def 
million(f): + return float(open(f).read()) / 1e6 + + +rrna = sorted(snakemake.input.rrna, key=rrna_sample) +fastq = sorted(snakemake.input.fastq, key=sample) +samples = list(map(rrna_sample, rrna)) +rrna_m = list(map(million, rrna)) +fastq_m = list(map(million, fastq)) + +df = pd.DataFrame( + dict( + sample=samples, + million_reads_rRNA=rrna_m, + million_reads_fastq=fastq_m, + ) +) +df = df.set_index("sample") +df["rRNA_percentage"] = df.million_reads_rRNA / df.million_reads_fastq * 100 + +df[["million_reads_fastq", "million_reads_rRNA", "rRNA_percentage"]].to_csv( + snakemake.output.tsv, sep="\t" +) +y = { + "id": "rrna_percentages_table", + "section_name": "rRNA content", + "description": "Amount of reads mapping to rRNA sequence", + "plot_type": "table", + "pconfig": { + "id": "rrna_percentages_table_table", + "title": "rRNA content table", + "min": 0, + }, + "data": yaml.load(df.transpose().to_json(), Loader=yaml.FullLoader), +} +with open(snakemake.output.json, "w") as fout: + yaml.dump(y, fout, default_flow_style=False) From 66f5a11b4d9028a099d74e5efa8eb23c4aa41842 Mon Sep 17 00:00:00 2001 From: Ryan Dale Date: Sun, 12 Jan 2025 11:30:36 -0500 Subject: [PATCH 056/196] fix test on preprocessor --- ci/preprocessor.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ci/preprocessor.py b/ci/preprocessor.py index 6bf05361..1cd7e5da 100644 --- a/ci/preprocessor.py +++ b/ci/preprocessor.py @@ -49,8 +49,8 @@ def uncomment_line(line): >>> assert uncomment_line('# asdf') == ' asdf' >>> assert uncomment_line(' # asdf') == ' asdf' >>> assert uncomment_line('do nothing') == 'do nothing' - >>> assert uncomment_line('do nothing # [disable for test]') == 'do nothing # [disable for test]') - >>> assert uncomment_line('#uncomment # [disable for test]') == 'uncomment # [disable for test]') + >>> assert uncomment_line('do nothing # [disable for test]') == 'do nothing # [disable for test]' + >>> assert uncomment_line('#uncomment # [disable for test]') == 
'uncomment # [disable for test]' """ first = line.find("#") From bfdbf5e874b8b6a19d4ac6083db776d446a0c3db Mon Sep 17 00:00:00 2001 From: Ryan Dale Date: Sun, 12 Jan 2025 11:30:49 -0500 Subject: [PATCH 057/196] updated env yaml --- env.yml | 62 ++++++++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 48 insertions(+), 14 deletions(-) diff --git a/env.yml b/env.yml index 02f0f695..9bbc8a71 100644 --- a/env.yml +++ b/env.yml @@ -34,7 +34,7 @@ dependencies: - cairo=1.18.2 - certifi=2024.12.14 - cffi=1.17.1 - - charset-normalizer=3.4.0 + - charset-normalizer=3.4.1 - click=8.1.8 - coin-or-cbc=2.10.12 - coin-or-cgl=0.60.9 @@ -60,6 +60,7 @@ dependencies: - docutils=0.21.2 - dpath=2.2.0 - eido=0.2.4 + - epic2=0.0.52 - et_xmlfile=2.0.0 - exceptiongroup=1.2.2 - execnet=2.1.1 @@ -82,8 +83,8 @@ dependencies: - gffutils=0.13 - gfortran_impl_linux-64=14.2.0 - giflib=5.2.2 - - gitdb=4.0.11 - - gitpython=3.1.43 + - gitdb=4.0.12 + - gitpython=3.1.44 - graphite2=1.3.13 - gsl=1.16 - gxx_impl_linux-64=14.2.0 @@ -102,7 +103,7 @@ dependencies: - imagesize=1.4.1 - immutables=0.21 - importlib-metadata=8.5.0 - - importlib_resources=6.4.5 + - importlib_resources=6.5.2 - iniconfig=2.0.0 - intervalstats=1.01 - ipython=8.31.0 @@ -133,7 +134,7 @@ dependencies: - libcups=2.3.3 - libcurl=8.11.1 - libdeflate=1.23 - - libedit=3.1.20191231 + - libedit=3.1.20240808 - libev=4.33 - libexpat=2.6.4 - libffi=3.4.2 @@ -158,7 +159,7 @@ dependencies: - libnsl=2.0.1 - libopenblas=0.3.28 - libopenssl-static=3.4.0 - - libpng=1.6.44 + - libpng=1.6.45 - libsanitizer=14.2.0 - libsqlite=3.47.2 - libssh2=1.11.1 @@ -174,6 +175,7 @@ dependencies: - libzlib=1.3.1 - logmuse=0.2.8 - logomaker=0.8 + - macs2=2.2.9.1 - make=4.4.1 - markdown=3.6 - markdown-it-py=3.0.0 @@ -187,6 +189,7 @@ dependencies: - mysql-connector-c=6.1.11 - natsort=8.4.0 - nbformat=5.10.4 + - ncbi-vdb=3.1.1 - ncurses=6.5 - networkx=3.4.2 - nspr=4.36 @@ -197,6 +200,7 @@ dependencies: - openjpeg=2.5.3 - openpyxl=3.1.5 - openssl=3.4.0 + - 
ossuuid=1.6.2 - packaging=24.2 - pandas=2.2.3 - pandoc=3.6.1 @@ -208,14 +212,44 @@ dependencies: - pephubclient=0.4.4 - peppy=0.40.7 - perl=5.32.1 + - perl-alien-build=2.84 + - perl-alien-libxml2=0.17 + - perl-business-isbn=3.007 + - perl-business-isbn-data=20210112.006 + - perl-capture-tiny=0.48 + - perl-carp=1.50 + - perl-constant=1.33 + - perl-exporter=5.74 + - perl-extutils-makemaker=7.70 + - perl-ffi-checklib=0.28 + - perl-file-chdir=0.1011 + - perl-file-path=2.18 + - perl-file-temp=0.2304 + - perl-file-which=1.24 - perl-gd=2.56 - perl-gdgraph=1.54 - perl-gdtextutil=0.86 + - perl-importer=0.026 + - perl-parent=0.243 + - perl-path-tiny=0.124 + - perl-pathtools=3.75 + - perl-scope-guard=0.21 + - perl-sub-info=0.002 + - perl-term-table=0.024 + - perl-test-fatal=0.016 + - perl-test-warnings=0.031 + - perl-test2-suite=0.000163 + - perl-try-tiny=0.31 + - perl-uri=5.17 + - perl-xml-libxml=2.0210 + - perl-xml-namespacesupport=1.12 + - perl-xml-sax=1.02 + - perl-xml-sax-base=1.09 - pexpect=4.9.0 - picard=2.27.5 - pickleshare=0.7.5 - pigz=2.8 - - pillow=11.0.0 + - pillow=11.1.0 - pip=24.3.1 - pixman=0.44.2 - pkgutil-resolve-name=1.3.10 @@ -238,20 +272,20 @@ dependencies: - pydantic=2.10.4 - pydantic-core=2.27.2 - pyfaidx=0.8.1.3 - - pygments=2.18.0 + - pygments=2.19.1 - pyparsing=3.2.1 - pysam=0.22.1 - pysocks=1.7.1 - pytest=8.3.4 - pytest-xdist=3.6.1 - - python=3.12.8 + - python=3.11.11 - python-dateutil=2.9.0.post0 - python-fastjsonschema=2.21.1 - python-isal=1.7.1 - python-kaleido=0.2.1 - python-tzdata=2024.2 - python-zlib-ng=0.5.1 - - python_abi=3.12 + - python_abi=3.11 - pytz=2024.1 - pyvcf3=1.0.3 - pyyaml=6.0.2 @@ -267,7 +301,7 @@ dependencies: - rseqc=5.0.4 - salmon=1.10.3 - samtools=1.21 - - scipy=1.14.1 + - scipy=1.15.0 - seaborn=0.13.2 - seaborn-base=0.13.2 - sed=4.8 @@ -279,12 +313,12 @@ dependencies: - slack_sdk=3.34.0 - smart_open=7.1.0 - smmap=5.0.0 - - snakemake=8.26.0 + - snakemake=8.27.0 - snakemake-interface-common=1.17.4 - 
snakemake-interface-executor-plugins=9.3.3 - snakemake-interface-report-plugins=1.1.0 - snakemake-interface-storage-plugins=3.3.0 - - snakemake-minimal=8.26.0 + - snakemake-minimal=8.27.0 - snowballstemmer=2.2.0 - soupsieve=2.5 - spectra=0.0.11 @@ -296,7 +330,7 @@ dependencies: - sphinxcontrib-qthelp=2.0.0 - sphinxcontrib-serializinghtml=1.1.10 - sqlite=3.47.2 - - sra-tools=2.9.6 + - sra-tools=3.1.1 - stack_data=0.6.3 - star=2.7.11b - statsmodels=0.14.4 From a466da0471854632f07d7bde57c5f5b92210cbd2 Mon Sep 17 00:00:00 2001 From: Ryan Dale Date: Sun, 12 Jan 2025 11:30:59 -0500 Subject: [PATCH 058/196] fix import --- lib/postprocess/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/postprocess/utils.py b/lib/postprocess/utils.py index 16010e14..f8fc64a6 100644 --- a/lib/postprocess/utils.py +++ b/lib/postprocess/utils.py @@ -9,7 +9,7 @@ here = os.path.dirname(os.path.abspath(__file__)) sys.path.insert(0, os.path.join(here, "../../lib")) -from common import openfile +from utils import openfile From eb6892529023c1a5df7478971f4074c55baff84c Mon Sep 17 00:00:00 2001 From: Ryan Dale Date: Sun, 12 Jan 2025 11:31:17 -0500 Subject: [PATCH 059/196] fix strand check --- rules/strand_check.smk | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) diff --git a/rules/strand_check.smk b/rules/strand_check.smk index 625ba3e2..4c936a77 100644 --- a/rules/strand_check.smk +++ b/rules/strand_check.smk @@ -1,14 +1,13 @@ - rule sample_strand_check: input: - fastq=fill_r1_r2(c.sampletable, c.patterns['fastq']), - index=rules.bowtie2_index.output, + fastq=expand(patterns["fastq"], n=n, allow_missing=True), + index=expand(rules.bowtie2_index.output, label="genome"), bed12=rules.conversion_bed12.output, output: strandedness='strand_check/{sample}/{sample}.strandedness', bam=temporary('strand_check/{sample}/{sample}.strandedness.bam'), bai=temporary('strand_check/{sample}/{sample}.strandedness.bam.bai'), - 
fastqs=temporary(expand('strand_check/{sample}/{sample}_R{n}.strandedness.fastq', sample=SAMPLES, n=n)), + fastqs=temporary(expand('strand_check/{sample}/{sample}_R{n}.strandedness.fastq', n=n, allow_missing=True)), log: 'strand_check/{sample}/{sample}.strandedness.log' threads: 6 @@ -16,17 +15,13 @@ rule sample_strand_check: mem_mb=gb(8), runtime=autobump(hours=2) run: - prefix = aligners.prefix_from_bowtie2_index(input.index) - nreads = int(config['strand_check_reads']) * 4 - if c.is_paired: - assert len(input.fastq) == 2 - assert len(output.fastqs) == 2 + prefix = os.path.commonprefix(input.index).rstrip(".") + nreads = int(1e5 * 4) + if is_paired: shell('set +o pipefail; zcat {input.fastq[0]} | head -n {nreads} > {output.fastqs[0]}') shell('set +o pipefail; zcat {input.fastq[0]} | head -n {nreads} > {output.fastqs[1]}') fastqs = f'-1 {output.fastqs[0]} -2 {output.fastqs[1]} ' else: - assert len(input.fastq) == 1 - assert len(output.fastqs) == 1 shell('set +o pipefail; zcat {input.fastq[0]} | head -n {nreads} > {output.fastqs[0]}') fastqs = f'-U {output.fastqs[0]} ' shell( From 8f33026592306e59b91f5812e4a86e9d1a86a732 Mon Sep 17 00:00:00 2001 From: Ryan Dale Date: Sun, 12 Jan 2025 11:33:52 -0500 Subject: [PATCH 060/196] split featurecounts --- workflows/rnaseq/Snakefile | 182 +++++++------------ workflows/rnaseq/config/rnaseq_patterns.yaml | 4 +- 2 files changed, 71 insertions(+), 115 deletions(-) diff --git a/workflows/rnaseq/Snakefile b/workflows/rnaseq/Snakefile index a328d887..29732d47 100644 --- a/workflows/rnaseq/Snakefile +++ b/workflows/rnaseq/Snakefile @@ -18,10 +18,10 @@ REFERENCES = config.get("reference_dir", "../../references") sampletable = pd.read_table(config["sampletable"], sep="\t", comment="#") sampletable = sampletable.set_index(sampletable.columns[0], drop=False) is_paired = utils.detect_layout(sampletable) == "PE" -is_sra = utils.detect_sra(sampletable) n = ["1", "2"] if is_paired else ["1"] -SAMPLES = sampletable.iloc[:, 0].values +SAMPLES 
= sampletable.index patterns = yaml.safe_load(open("config/rnaseq_patterns.yaml")) +config["patterns"] = patterns wildcard_constraints: @@ -37,12 +37,43 @@ localrules: rule all: input: patterns["multiqc"], - patterns["bigwig"], + expand(patterns["bigwig"]["pos"], sample=SAMPLES), + expand(patterns["bigwig"]["neg"], sample=SAMPLES), +# Optionally run ``snakemake strand_check`` to do a preliminary run evaluating strandedness. +include: '../../rules/strand_check.smk' -if is_sra: +if utils.detect_sra(sampletable): + sampletable['orig_filename'] = expand( + 'original_data/sra_samples/{sample}/{sample}_R{n}.fastq.gz', sample=SAMPLES, n=1) + + if is_paired: + sampletable['orig_filename_R2'] = expand( + 'original_data/sra_samples/{sample}/{sample}_R{n}.fastq.gz', sample=SAMPLES, n=2) + + rule fastq_dump: + output: + fastq=expand('original_data/sra_samples/{sample}/{sample}_R{n}.fastq.gz', n=n, allow_missing=True) + log: + 'original_data/sra_samples/{sample}/{sample}.fastq.gz.log' + params: + is_paired=is_paired, + # extra="-X 100000", # [enable for test] + resources: + mem_mb=gb(1), + disk_mb=autobump(gb=1), + runtime=autobump(hours=2) + run: + srr = sampletable.loc[wildcards.sample, "Run"] + extra = params.get("extra", "") + if is_paired: + shell("fastq-dump {srr} --gzip --split-files {extra} &> {log}") + shell("mv {srr}_1.fastq.gz {output[0]}") + shell("mv {srr}_2.fastq.gz {output[1]}") + else: + shell("fastq-dump {srr} -Z {extra} 2> {log} | gzip -c > {output[0]}.tmp") + shell("mv {output[0]}.tmp {output[0]}") - include: "../../rules/sra.smk" rule symlinks: @@ -71,13 +102,6 @@ rule symlink_targets: ), -# This can be set at the command line with --config strand_check_reads=1000 -config.setdefault("strand_check_reads", 1e5) - -# TODO: re-enable -# include: '../../rules/strand_check.smk' - - rule cutadapt: input: fastq=expand(patterns["fastq"], n=n, allow_missing=True), @@ -145,7 +169,7 @@ rule fastqc: '--quiet ' '--outdir {outdir} ' '{input} ' - '{log} ' + '2> {log} ' ) 
outfile = os.path.basename(input[0]) for s in ['.fastq', '.fq', '.gz', '.bam']: @@ -203,9 +227,6 @@ if config["aligner"] == "hisat2": ) - -# TODO: star has lots of rules. Better to be in rules/aligner.smk? - if config["aligner"].startswith("star"): # STAR can be run in 1-pass or 2-pass modes. Since we may be running it @@ -240,9 +261,7 @@ if config["aligner"].startswith("star"): if config["aligner"] == "star": rule star: - """ - Align with STAR (1-pass mode) - """ + "Align with STAR (1-pass mode)" input: fastq=expand(patterns["cutadapt"], n=n, allow_missing=True), index=rules.star_index.output, @@ -281,9 +300,7 @@ if config["aligner"] == "star": if config["aligner"] == "star-twopass": rule star_pass1: - """ - First pass of alignment with STAR to get the junctions - """ + "First pass of alignment with STAR to get the junctions" input: fastq=expand(patterns["cutadapt"], n=n, allow_missing=True), index=rules.star_index.output, @@ -458,18 +475,14 @@ rule bam_index: "samtools index {input} {output}" -# TODO: split into multiple featurecounts runs, since PE needs to be sorted each time. 
rule featurecounts: - """ - Count reads in annotations with featureCounts from the subread package - """ input: annotation=rules.gtf.output, - bam=expand(patterns["markduplicates"]["bam"], sample=SAMPLES), + bam=patterns["markduplicates"]["bam"], output: - counts="{sample_dir}/rnaseq_aggregation/featurecounts.txt", + patterns["featurecounts"]["per_sample"] log: - "{sample_dir}/rnaseq_aggregation/featurecounts.txt.log", + patterns["featurecounts"]["per_sample"] + ".log" threads: 8 resources: mem_mb=gb(16), @@ -482,7 +495,6 @@ rule featurecounts: }[config["stranded"]], extra="", run: - # NOTE: By default, we use -p for paired-end p_arg = "" if is_paired: p_arg = "-p --countReadPairs " @@ -492,17 +504,34 @@ rule featurecounts: "{p_arg} " "-T {threads} " "-a {input.annotation} " - "-o {output.counts} " + "-o {output} " "{input.bam} " "&> {log}" ) +rule aggregate_featurecounts: + input: + expand(patterns["featurecounts"]["per_sample"], sample=SAMPLES) + output: + patterns["featurecounts"]["aggregated"] + log: + patterns["featurecounts"]["aggregated"] + ".log" + threads: + 1 + resources: + mem_mb=gb(8), + runtime=autobump(hours=1) + run: + for i, file in enumerate(input): + df = pd.read_csv(file, sep="\t", comment="#") + df = df.set_index('Geneid', drop=False) + if i == 0: + final = df + continue + final[df.columns[-1]] = df[df.columns[-1]] + final.to_csv(output[0], sep="\t", index=False) -# # TODO: port some of this over to utils, or maybe script. 
rule rrna_libsizes_table: - """ - Aggregate rRNA counts into a table - """ input: rrna=expand(patterns["rrna"]["libsize"], sample=SAMPLES), fastq=expand(patterns["libsizes"]["cutadapt"], sample=SAMPLES), @@ -513,54 +542,8 @@ rule rrna_libsizes_table: resources: mem_mb=gb(2), runtime=autobump(hours=2), - run: - def rrna_sample(f): - return utils.extract_wildcards(patterns["rrna"]["libsize"], f)["sample"] - - - def sample(f): - return utils.extract_wildcards(patterns["libsizes"]["cutadapt"], f)[ - "sample" - ] - - - def million(f): - return float(open(f).read()) / 1e6 - - - rrna = sorted(input.rrna, key=rrna_sample) - fastq = sorted(input.fastq, key=sample) - samples = list(map(rrna_sample, rrna)) - rrna_m = list(map(million, rrna)) - fastq_m = list(map(million, fastq)) - - df = pd.DataFrame( - dict( - sample=samples, - million_reads_rRNA=rrna_m, - million_reads_fastq=fastq_m, - ) - ) - df = df.set_index("sample") - df["rRNA_percentage"] = df.million_reads_rRNA / df.million_reads_fastq * 100 - - df[["million_reads_fastq", "million_reads_rRNA", "rRNA_percentage"]].to_csv( - output.tsv, sep="\t" - ) - y = { - "id": "rrna_percentages_table", - "section_name": "rRNA content", - "description": "Amount of reads mapping to rRNA sequence", - "plot_type": "table", - "pconfig": { - "id": "rrna_percentages_table_table", - "title": "rRNA content table", - "min": 0, - }, - "data": yaml.load(df.transpose().to_json(), Loader=yaml.FullLoader), - } - with open(output.json, "w") as fout: - yaml.dump(y, fout, default_flow_style=False) + script: + "../../scripts/rrna_libsizes_table.py" rule multiqc: @@ -579,8 +562,9 @@ rule multiqc: expand(patterns["samtools"]["idxstats"], sample=SAMPLES), expand(patterns["samtools"]["flagstat"], sample=SAMPLES), expand(patterns["samtools"]["stats"], sample=SAMPLES), + patterns["rrna_percentages_table"], - patterns["featurecounts"], + patterns["featurecounts"]["aggregated"], ), config="config/multiqc_config.yaml", output: @@ -609,9 +593,6 @@ rule 
multiqc: rule markduplicates: - """ - Mark or remove PCR duplicates with Picard MarkDuplicates - """ input: bam=patterns["bam"], output: @@ -639,9 +620,6 @@ rule markduplicates: rule collectrnaseqmetrics: - """ - Calculate various RNA-seq QC metrics with Picarc CollectRnaSeqMetrics - """ input: bam=patterns["markduplicates"]["bam"], refflat=rules.conversion_refflat.output, @@ -676,9 +654,6 @@ rule collectrnaseqmetrics: rule preseq: - """ - Compute a library complexity curve with preseq - """ input: bam=patterns["bam"], output: @@ -695,9 +670,6 @@ rule preseq: rule salmon: - """ - Quantify reads coming from transcripts with Salmon - """ input: fastq=expand(patterns["cutadapt"], n=n, allow_missing=True), index=REFERENCES + "/salmon/versionInfo.json", @@ -735,9 +707,6 @@ rule salmon: rule kallisto: - """ - Quantify reads coming from transcripts with Kallisto - """ input: fastq=expand(patterns["cutadapt"], n=n, allow_missing=True), index=REFERENCES + "/kallisto/transcripts.idx", @@ -777,9 +746,6 @@ rule kallisto: rule rseqc_infer_experiment: - """ - Infer strandedness of experiment - """ input: bam=patterns["markduplicates"]["bam"], bed12=rules.conversion_bed12.output, @@ -795,9 +761,6 @@ rule rseqc_infer_experiment: rule rseqc_read_distribution: - """ - read distribution plots - """ input: bam=patterns["markduplicates"]["bam"], bed12=rules.conversion_bed12.output, @@ -813,9 +776,6 @@ rule rseqc_read_distribution: rule idxstats: - """ - Run samtools idxstats on sample bams - """ input: bam=patterns["markduplicates"]["bam"], bai=patterns["markduplicates"]["bam"] + ".bai", @@ -831,9 +791,6 @@ rule idxstats: rule bigwig_neg: - """ - Create a bigwig for negative-strand reads - """ input: bam=patterns["markduplicates"]["bam"], bai=patterns["markduplicates"]["bam"] + ".bai", @@ -869,9 +826,6 @@ rule bigwig_neg: rule bigwig_pos: - """ - Create a bigwig for postive-strand reads. 
- """ input: bam=patterns["markduplicates"]["bam"], bai=patterns["markduplicates"]["bam"] + ".bai", diff --git a/workflows/rnaseq/config/rnaseq_patterns.yaml b/workflows/rnaseq/config/rnaseq_patterns.yaml index 92b2a534..35681125 100644 --- a/workflows/rnaseq/config/rnaseq_patterns.yaml +++ b/workflows/rnaseq/config/rnaseq_patterns.yaml @@ -15,7 +15,9 @@ libsizes: cutadapt: 'data/rnaseq_samples/{sample}/{sample}_R1.cutadapt.fastq.gz.libsize' bam: 'data/rnaseq_samples/{sample}/{sample}.cutadapt.bam.libsize' fastq_screen: 'data/rnaseq_samples/{sample}/{sample}.cutadapt.screen.txt' -featurecounts: 'data/rnaseq_aggregation/featurecounts.txt' +featurecounts: + per_sample: 'data/rnaseq_samples/{sample}/{sample}_featurecounts.txt' + aggregated: 'data/rnaseq_aggregation/featurecounts.txt' libsizes_table: 'data/rnaseq_aggregation/libsizes_table.tsv' libsizes_yaml: 'data/rnaseq_aggregation/libsizes_table_mqc.yaml' rrna_percentages_table: 'data/rnaseq_aggregation/rrna_percentages_table.tsv' From 39209ce270e5d5259a323c75c88cb4411b313819 Mon Sep 17 00:00:00 2001 From: Ryan Dale Date: Sun, 12 Jan 2025 11:35:14 -0500 Subject: [PATCH 061/196] all sorts of fixes and cleanup --- .circleci/config.yml | 34 +++++--------- lib/utils.py | 2 +- rules/sra.smk | 2 +- test/lcdb-wf-test | 7 ++- test/test_configs/hisat2.tsv | 4 +- test/test_configs/star_override_1pass.yaml | 10 ---- test/test_configs/star_override_2pass.yaml | 10 ---- test/test_configs/test_rnaseq_config.yaml | 54 ++++++++-------------- test/workflow_test_params.yaml | 18 ++++---- 9 files changed, 48 insertions(+), 93 deletions(-) delete mode 100644 test/test_configs/star_override_1pass.yaml delete mode 100644 test/test_configs/star_override_2pass.yaml diff --git a/.circleci/config.yml b/.circleci/config.yml index 02b27915..16e5b5f0 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -220,16 +220,6 @@ variables: --until bed_to_bigbed fi - # -------------------------------------------------------------------------- 
- # Standard references workflow. - references-step: &references-step - run: - name: references workflow - command: | - source /opt/miniforge/etc/profile.d/conda.sh - conda activate $LCDBWF_ENV - $DEPLOY/test/lcdb-wf-test references --run-workflow --configfile=config/config.yaml -j2 -p -k --orig $ORIG - # -------------------------------------------------------------------------- # Standard RNA-seq workflow rnaseq-step: &rnaseq-step @@ -408,14 +398,14 @@ jobs: # - *get-data # - *colocalization-step - references: - <<: *defaults - steps: - - checkout - - *restore_cache - - *set-path - - *get-data - - *references-step + # references: + # <<: *defaults + # steps: + # - checkout + # - *restore_cache + # - *set-path + # - *get-data + # - *references-step build-docs: <<: *defaults @@ -489,10 +479,10 @@ workflows: requires: - initial-setup - pytest - - references: - requires: - - initial-setup - - pytest + # - references: + # requires: + # - initial-setup + # - pytest # - colocalization: # requires: # - initial-setup diff --git a/lib/utils.py b/lib/utils.py index f1a97c79..0e5cc9e2 100644 --- a/lib/utils.py +++ b/lib/utils.py @@ -1137,7 +1137,7 @@ def check_urls(config, verbose=False): too-many-connection issues """ failures = [] - urls = list(set(utils.flatten(pluck(config, "url")))) + urls = list(set(flatten(pluck(config, "url")))) for url in urls: if url.startswith("file://"): continue diff --git a/rules/sra.smk b/rules/sra.smk index 861b5098..2992f503 100644 --- a/rules/sra.smk +++ b/rules/sra.smk @@ -14,7 +14,7 @@ rule fastq_dump: params: is_paired=is_paired, sampletable=_st, - # extra="-X 100000", # [TEST SETTINGS] + # extra="-X 100000", # [enable for test] resources: mem_mb=gb(1), disk_mb=autobump(gb=1), diff --git a/test/lcdb-wf-test b/test/lcdb-wf-test index df59b24c..21f6978c 100755 --- a/test/lcdb-wf-test +++ b/test/lcdb-wf-test @@ -142,9 +142,12 @@ class Runner(object): %(prog)s rnaseq --run-workflow --strandedness-pe %(prog)s rnaseq --run-workflow 
--strandedness-se %(prog)s rnaseq --run-workflow --star-2pass - %(prog)s rnaseq --run-workflow --star-1pass + %(prog)s rnaseq --run-workflow --hisat2 %(prog)s rnaseq --run-workflow --pe + # Since there are a lot of parameters here, see + # "workflow_test_params.yaml" for how they are managed. + """, formatter_class=argparse.RawDescriptionHelpFormatter ) @@ -328,7 +331,7 @@ class Runner(object): if args.url_check: print_header("url check") sys.path.insert(0, str(TOPLEVEL)) - from lib.common import check_all_urls_found + from lib.utils import check_all_urls_found check_all_urls_found() diff --git a/test/test_configs/hisat2.tsv b/test/test_configs/hisat2.tsv index 3c73275e..df6746ce 100644 --- a/test/test_configs/hisat2.tsv +++ b/test/test_configs/hisat2.tsv @@ -1,3 +1,3 @@ samplename group layout orig_filename -sample1-star-1pass control SE data/example_data/rnaseq_sample1PE_1.fq.gz -sample2-star-1pass control SE data/example_data/rnaseq_sample2.fq.gz +sample1-hisat2 control SE data/example_data/rnaseq_sample1PE_1.fq.gz +sample2-hisat2 control SE data/example_data/rnaseq_sample2.fq.gz diff --git a/test/test_configs/star_override_1pass.yaml b/test/test_configs/star_override_1pass.yaml deleted file mode 100644 index cba6ff76..00000000 --- a/test/test_configs/star_override_1pass.yaml +++ /dev/null @@ -1,10 +0,0 @@ -aligner: - index: star - tag: test - -merged_bigwigs: - control_pos: - pos: [] - treatment_all: - pos: [] - neg: [] diff --git a/test/test_configs/star_override_2pass.yaml b/test/test_configs/star_override_2pass.yaml deleted file mode 100644 index b091eba3..00000000 --- a/test/test_configs/star_override_2pass.yaml +++ /dev/null @@ -1,10 +0,0 @@ -aligner: - index: 'star-twopass' - tag: test - -merged_bigwigs: - control_pos: - pos: [] - treatment_all: - pos: [] - neg: [] diff --git a/test/test_configs/test_rnaseq_config.yaml b/test/test_configs/test_rnaseq_config.yaml index 6c674345..2cbd3d66 100644 --- a/test/test_configs/test_rnaseq_config.yaml +++ 
b/test/test_configs/test_rnaseq_config.yaml @@ -1,43 +1,27 @@ -sampletable: 'config/sampletable.tsv' - -patterns: 'config/rnaseq_patterns.yaml' - -# Which key in the `references` dict below to use -organism: 'dmel' - -# If not specified here, use the environment variable REFERENCES_DIR. -references_dir: 'references_data' - -aligner: - index: 'hisat2' - tag: 'test' +fasta: + url: "https://raw.githubusercontent.com/lcdb/lcdb-test-data/master/data/seq/dm6.small.fa" + postprocess: 'lib.utils.gzipped' -stranded: 'fr-firststrand' +gtf: + url: "https://raw.githubusercontent.com/lcdb/lcdb-test-data/master/data/annotation/dm6.small.gtf" + postprocess: 'lib.utils.gzipped' rrna: - index: 'bowtie2' - tag: 'rRNA' - -gtf: - tag: "test" + url: + - 'https://www.arb-silva.de/fileadmin/silva_databases/release_128/Exports/SILVA_128_LSURef_tax_silva_trunc.fasta.gz' + - 'https://www.arb-silva.de/fileadmin/silva_databases/release_128/Exports/SILVA_128_SSURef_Nr99_tax_silva_trunc.fasta.gz' + postprocess: + function: 'lib.utils.filter_fastas' + args: 'Drosophila melanogaster' -salmon: - tag: "test" -kallisto: - tag: "test" +sampletable: 'config/sampletable.tsv' -fastq_screen: - - label: rRNA - organism: dmel - tag: test - - label: Fly - organism: dmel - tag: test +patterns: 'config/rnaseq_patterns.yaml' -# See the reference config files in the top level of the repo, -# include/reference_configs, for inspiration for more species. +# See https://rnabio.org/module-09-appendix/0009/12/01/StrandSettings for more info. 
+stranded: 'fr-firststrand' # for dUTP libraries +# 'fr-secondstrand' # for ligation libraries +# 'unstranded' # for libraries without strand specificity -include_references: - - '../../include/reference_configs/test.yaml' - - '../../include/reference_configs/Drosophila_melanogaster.yaml' +aligner: 'star' diff --git a/test/workflow_test_params.yaml b/test/workflow_test_params.yaml index 70e57da6..5d74fac9 100644 --- a/test/workflow_test_params.yaml +++ b/test/workflow_test_params.yaml @@ -45,19 +45,17 @@ rnaseq: desc: Tests running STAR in 2-pass mode. Only runs until the star_pass2 rule. args: | --until star_pass2 - --configfile - __ORIG__/test/test_configs/test_rnaseq_config.yaml - __ORIG__/test/test_configs/star_override_2pass.yaml + --configfile __ORIG__/test/test_configs/test_rnaseq_config.yaml --config sampletable=__ORIG__/test/test_configs/star_2pass.tsv + --config aligner="star-twopass" - star-1pass: - desc: Tests running STAR in 1-pass (default) mode. Only runs until the star rule. 
+ hisat2: + desc: Tests running HISAT2 args: | - --until star - --configfile - __ORIG__/test/test_configs/test_rnaseq_config.yaml - __ORIG__/test/test_configs/star_override_1pass.yaml - --config sampletable=__ORIG__/test/test_configs/star_1pass.tsv + --until hisat2 + --configfile __ORIG__/test/test_configs/test_rnaseq_config.yaml + --config sampletable=__ORIG__/test/test_configs/hisat2.tsv + --config aligner=hisat2 pe: desc: Tests paired-end data From 155307a6d77db198296deaa46e76877b344f5bcd Mon Sep 17 00:00:00 2001 From: Ryan Dale Date: Sun, 12 Jan 2025 11:36:56 -0500 Subject: [PATCH 062/196] sra for chipseq --- workflows/chipseq/Snakefile | 32 +++++++++++++++++++++++++++++--- 1 file changed, 29 insertions(+), 3 deletions(-) diff --git a/workflows/chipseq/Snakefile b/workflows/chipseq/Snakefile index 24b09dec..9c8a2f37 100644 --- a/workflows/chipseq/Snakefile +++ b/workflows/chipseq/Snakefile @@ -19,7 +19,6 @@ REFERENCES = config.get("reference_dir", "../../references") sampletable = pd.read_table(config["sampletable"], sep="\t", comment="#") sampletable = sampletable.set_index(sampletable.columns[0], drop=False) is_paired = utils.detect_layout(sampletable) == "PE" -is_sra = utils.detect_sra(sampletable) n = ["1", "2"] if is_paired else ["1"] SAMPLES = sampletable.iloc[:, 0].values patterns = yaml.safe_load(open("config/chipseq_patterns.yaml"))["patterns_by_sample"] @@ -43,9 +42,36 @@ rule targets: [v["bed"] for k, v in peaks.items()], -if is_sra: +if utils.detect_sra(sampletable): + sampletable['orig_filename'] = expand( + 'original_data/sra_samples/{sample}/{sample}_R{n}.fastq.gz', sample=SAMPLES, n=1) - include: "../../rules/sra.smk" + if is_paired: + sampletable['orig_filename_R2'] = expand( + 'original_data/sra_samples/{sample}/{sample}_R{n}.fastq.gz', sample=SAMPLES, n=2) + + rule fastq_dump: + output: + fastq=expand('original_data/sra_samples/{sample}/{sample}_R{n}.fastq.gz', n=n, allow_missing=True) + log: + 
'original_data/sra_samples/{sample}/{sample}.fastq.gz.log' + params: + is_paired=is_paired, + # extra="-X 100000", # [enable for test] + resources: + mem_mb=gb(1), + disk_mb=autobump(gb=1), + runtime=autobump(hours=2) + run: + srr = sampletable.loc[wildcards.sample, "Run"] + extra = params.get("extra", "") + if is_paired: + shell("fastq-dump {srr} --gzip --split-files {extra} &> {log}") + shell("mv {srr}_1.fastq.gz {output[0]}") + shell("mv {srr}_2.fastq.gz {output[1]}") + else: + shell("fastq-dump {srr} -Z {extra} 2> {log} | gzip -c > {output[0]}.tmp") + shell("mv {output[0]}.tmp {output[0]}") rule symlinks: From fd1c1c3df76284f7080e9e706d68fa61ae54356d Mon Sep 17 00:00:00 2001 From: Ryan Dale Date: Sun, 12 Jan 2025 11:37:13 -0500 Subject: [PATCH 063/196] clean out test suite --- lib/test_suite.py | 89 +---------------------------------------------- 1 file changed, 1 insertion(+), 88 deletions(-) diff --git a/lib/test_suite.py b/lib/test_suite.py index 21b9c052..eb018c3f 100644 --- a/lib/test_suite.py +++ b/lib/test_suite.py @@ -1,88 +1 @@ -import os -import pprint -from textwrap import dedent -from . 
import common - - -def test_config_loading(tmpdir): - f0 = tmpdir.mkdir('subdir').join('file0.yaml') - dir_to_include = tmpdir.join('subdir') - f0.write(dedent(''' - references: - species_to_keep: - tag_from_directory: - fasta: - url: "https://from_directory" - - # Will get overwritten by a specific file - tag_from_file: - fasta: - url: "https://from_directory" - - # Will get overwritten by specific file, and then that will get - # overwritten by the config - tag_from_config: - fasta: - url: "https://from_directory" - ''')) - f1 = tmpdir.join('subdir', 'file1.yaml') - f1.write(dedent(''' - references: - species2: - tag_only_in_directory: - fasta: - url: "" - indexes: - - bowtie2 - ''')) - - f2 = tmpdir.join('file1.yaml') - f2.write(dedent(''' - references: - species_to_keep: - tag_from_file: - fasta: - url: "https://from_file" - tag_from_config: - fasta: - url: "https://from_file" - - ''')) - - f3 = tmpdir.join('file3.yaml') - f3.write(dedent(''' - references_dir: "/data" - references: - species_to_keep: - tag_from_config: - fasta: - url: "https://from_config" - - include_references: - - {dir_to_include} - - {f2} - '''.format(dir_to_include=dir_to_include, f2=f2))) - - config = common.load_config(str(f3)) - - assert config == { - 'references_dir': '/data', - 'include_references': [ - '{0}/subdir'.format(str(tmpdir)), - '{0}/file1.yaml'.format(str(tmpdir)), - ], - 'references': { - 'species_to_keep': { - 'tag_from_config': { - 'fasta': {'url': 'https://from_config'}}, - 'tag_from_directory': { - 'fasta': {'url': 'https://from_directory'}}, - 'tag_from_file': { - 'fasta': {'url': 'https://from_file'}} - }, - 'species2': { - 'tag_only_in_directory': { - 'fasta': {'indexes': ['bowtie2'], 'url': ''}}}, - }, - } - +from . 
import utils From d322e333949a4d25f37f4b83cd1127cdb7ad2ae0 Mon Sep 17 00:00:00 2001 From: Ryan Dale Date: Sun, 12 Jan 2025 11:39:27 -0500 Subject: [PATCH 064/196] add strandcheck back to snakefile --- rules/sra.smk | 34 ------------------- rules/strand_check.smk | 64 ----------------------------------- workflows/rnaseq/Snakefile | 68 ++++++++++++++++++++++++++++++++++++-- 3 files changed, 66 insertions(+), 100 deletions(-) delete mode 100644 rules/sra.smk delete mode 100644 rules/strand_check.smk diff --git a/rules/sra.smk b/rules/sra.smk deleted file mode 100644 index 2992f503..00000000 --- a/rules/sra.smk +++ /dev/null @@ -1,34 +0,0 @@ - -sampletable['orig_filename'] = expand( - 'original_data/sra_samples/{sample}/{sample}_R{n}.fastq.gz', sample=SAMPLES, n=1) - -if is_paired: - sampletable['orig_filename_R2'] = expand( - 'original_data/sra_samples/{sample}/{sample}_R{n}.fastq.gz', sample=SAMPLES, n=2) - -rule fastq_dump: - output: - fastq=expand('original_data/sra_samples/{sample}/{sample}_R{n}.fastq.gz', n=n) - log: - 'original_data/sra_samples/{sample}/{sample}.fastq.gz.log' - params: - is_paired=is_paired, - sampletable=_st, - # extra="-X 100000", # [enable for test] - resources: - mem_mb=gb(1), - disk_mb=autobump(gb=1), - runtime=autobump(hours=2) - run: - _st = sampletable.set_index(sampletable.columns[0]) - srr = _st.loc[wildcards.sample, "Run"] - extra = params.get("extra", "") - if is_paired: - shell("fastq-dump {srr} --gzip --split-files {extra} &> {log}") - shell("mv {srr}_1.fastq.gz {output[0]}") - shell("mv {srr}_2.fastq.gz {output[1]}") - else: - shell("fastq-dump {srr} -Z {extra} 2> {log} | gzip -c > {output[0]}.tmp") - shell("mv {output[0]}.tmp {output[0]}") - -# vim: ft=snakemake diff --git a/rules/strand_check.smk b/rules/strand_check.smk deleted file mode 100644 index 4c936a77..00000000 --- a/rules/strand_check.smk +++ /dev/null @@ -1,64 +0,0 @@ -rule sample_strand_check: - input: - fastq=expand(patterns["fastq"], n=n, allow_missing=True), - 
index=expand(rules.bowtie2_index.output, label="genome"), - bed12=rules.conversion_bed12.output, - output: - strandedness='strand_check/{sample}/{sample}.strandedness', - bam=temporary('strand_check/{sample}/{sample}.strandedness.bam'), - bai=temporary('strand_check/{sample}/{sample}.strandedness.bam.bai'), - fastqs=temporary(expand('strand_check/{sample}/{sample}_R{n}.strandedness.fastq', n=n, allow_missing=True)), - log: - 'strand_check/{sample}/{sample}.strandedness.log' - threads: 6 - resources: - mem_mb=gb(8), - runtime=autobump(hours=2) - run: - prefix = os.path.commonprefix(input.index).rstrip(".") - nreads = int(1e5 * 4) - if is_paired: - shell('set +o pipefail; zcat {input.fastq[0]} | head -n {nreads} > {output.fastqs[0]}') - shell('set +o pipefail; zcat {input.fastq[0]} | head -n {nreads} > {output.fastqs[1]}') - fastqs = f'-1 {output.fastqs[0]} -2 {output.fastqs[1]} ' - else: - shell('set +o pipefail; zcat {input.fastq[0]} | head -n {nreads} > {output.fastqs[0]}') - fastqs = f'-U {output.fastqs[0]} ' - shell( - "bowtie2 " - "-x {prefix} " - "{fastqs} " - '--no-unal ' - "--threads {threads} 2> {log} " - "| samtools view -Sb - " - "| samtools sort - -o {output.bam} " - ) - shell("samtools index {output.bam}") - shell( - 'infer_experiment.py -r {input.bed12} -i {output.bam} > {output} 2> {log}' - ) - -rule strand_check: - input: - expand('strand_check/{sample}/{sample}.strandedness', sample=SAMPLES) - output: - html='strand_check/strandedness.html', - filelist=temporary('strand_check/filelist') - log: - 'strand_check/strandedness.log' - resources: - mem_mb=gb(1), - runtime=autobump(hours=2) - run: - with open(output.filelist, 'w') as fout: - for i in input: - fout.write(i + '\n') - shell( - 'multiqc ' - '--force ' - '--module rseqc ' - '--file-list {output.filelist} ' - '--filename {output.html} &> {log}' - ) - -# vim: ft=snakemake diff --git a/workflows/rnaseq/Snakefile b/workflows/rnaseq/Snakefile index 29732d47..0dff1d54 100644 --- 
a/workflows/rnaseq/Snakefile +++ b/workflows/rnaseq/Snakefile @@ -40,8 +40,6 @@ rule all: expand(patterns["bigwig"]["pos"], sample=SAMPLES), expand(patterns["bigwig"]["neg"], sample=SAMPLES), -# Optionally run ``snakemake strand_check`` to do a preliminary run evaluating strandedness. -include: '../../rules/strand_check.smk' if utils.detect_sra(sampletable): sampletable['orig_filename'] = expand( @@ -102,6 +100,72 @@ rule symlink_targets: ), +# Optionally run ``snakemake strand_check`` to do a preliminary run on +# automatically-subset data to evaluate strandedness. +rule sample_strand_check: + input: + fastq=expand(patterns["fastq"], n=n, allow_missing=True), + index=expand(rules.bowtie2_index.output, label="genome"), + bed12=rules.conversion_bed12.output, + output: + strandedness='strand_check/{sample}/{sample}.strandedness', + bam=temporary('strand_check/{sample}/{sample}.strandedness.bam'), + bai=temporary('strand_check/{sample}/{sample}.strandedness.bam.bai'), + fastqs=temporary(expand('strand_check/{sample}/{sample}_R{n}.strandedness.fastq', n=n, allow_missing=True)), + log: + 'strand_check/{sample}/{sample}.strandedness.log' + threads: 6 + resources: + mem_mb=gb(8), + runtime=autobump(hours=2) + run: + prefix = os.path.commonprefix(input.index).rstrip(".") + nreads = int(1e5 * 4) + if is_paired: + shell('set +o pipefail; zcat {input.fastq[0]} | head -n {nreads} > {output.fastqs[0]}') + shell('set +o pipefail; zcat {input.fastq[0]} | head -n {nreads} > {output.fastqs[1]}') + fastqs = f'-1 {output.fastqs[0]} -2 {output.fastqs[1]} ' + else: + shell('set +o pipefail; zcat {input.fastq[0]} | head -n {nreads} > {output.fastqs[0]}') + fastqs = f'-U {output.fastqs[0]} ' + shell( + "bowtie2 " + "-x {prefix} " + "{fastqs} " + '--no-unal ' + "--threads {threads} 2> {log} " + "| samtools view -Sb - " + "| samtools sort - -o {output.bam} " + ) + shell("samtools index {output.bam}") + shell( + 'infer_experiment.py -r {input.bed12} -i {output.bam} > {output} 2> {log}' + ) 
+ + +rule strand_check: + input: + expand('strand_check/{sample}/{sample}.strandedness', sample=SAMPLES) + output: + html='strand_check/strandedness.html', + filelist=temporary('strand_check/filelist') + log: + 'strand_check/strandedness.log' + resources: + mem_mb=gb(1), + runtime=autobump(hours=2) + run: + with open(output.filelist, 'w') as fout: + for i in input: + fout.write(i + '\n') + shell( + 'multiqc ' + '--force ' + '--module rseqc ' + '--file-list {output.filelist} ' + '--filename {output.html} &> {log}' + ) + rule cutadapt: input: fastq=expand(patterns["fastq"], n=n, allow_missing=True), From 8b6b52a01e52cc44b500beb9b71962503f9618f6 Mon Sep 17 00:00:00 2001 From: Ryan Dale Date: Tue, 14 Jan 2025 11:13:03 -0500 Subject: [PATCH 065/196] don't use patterns any more --- workflows/rnaseq/Snakefile | 338 +++++++++++++++++++------------------ 1 file changed, 172 insertions(+), 166 deletions(-) diff --git a/workflows/rnaseq/Snakefile b/workflows/rnaseq/Snakefile index 0dff1d54..5b9923b8 100644 --- a/workflows/rnaseq/Snakefile +++ b/workflows/rnaseq/Snakefile @@ -11,7 +11,6 @@ from lib.utils import autobump, gb, hours configfile: "config/config.yaml" -include: "../references/Snakefile" REFERENCES = config.get("reference_dir", "../../references") @@ -20,8 +19,6 @@ sampletable = sampletable.set_index(sampletable.columns[0], drop=False) is_paired = utils.detect_layout(sampletable) == "PE" n = ["1", "2"] if is_paired else ["1"] SAMPLES = sampletable.index -patterns = yaml.safe_load(open("config/rnaseq_patterns.yaml")) -config["patterns"] = patterns wildcard_constraints: @@ -36,10 +33,9 @@ localrules: rule all: input: - patterns["multiqc"], - expand(patterns["bigwig"]["pos"], sample=SAMPLES), - expand(patterns["bigwig"]["neg"], sample=SAMPLES), + "data/rnaseq_aggregation/multiqc.html", +include: "../references/Snakefile" if utils.detect_sra(sampletable): sampletable['orig_filename'] = expand( @@ -82,7 +78,7 @@ rule symlinks: else sampletable.loc[wc.sample, 
["orig_filename"]] ), output: - expand(patterns["fastq"], n=n, allow_missing=True), + expand('data/rnaseq_samples/{{sample}}/{{sample}}_R{n}.fastq.gz', n=n) threads: 1 resources: mem_mb=100, @@ -104,7 +100,7 @@ rule symlink_targets: # automatically-subset data to evaluate strandedness. rule sample_strand_check: input: - fastq=expand(patterns["fastq"], n=n, allow_missing=True), + fastq=expand('data/rnaseq_samples/{{sample}}/{{sample}}_R{n}.fastq.gz', n=n), index=expand(rules.bowtie2_index.output, label="genome"), bed12=rules.conversion_bed12.output, output: @@ -168,9 +164,9 @@ rule strand_check: rule cutadapt: input: - fastq=expand(patterns["fastq"], n=n, allow_missing=True), + fastq=expand('data/rnaseq_samples/{{sample}}/{{sample}}_R{n}.fastq.gz', n=n) output: - fastq=expand(patterns["cutadapt"], n=n, allow_missing=True), + fastq=expand('data/rnaseq_samples/{{sample}}/{{sample}}_R{n}.cutadapt.fastq.gz', n=n) log: "data/rnaseq_samples/{sample}/{sample}_cutadapt.fastq.gz.log", threads: 6 @@ -214,17 +210,17 @@ rule cutadapt: rule fastqc: input: - '{sample_dir}/{sample}/{sample}{suffix}' + 'data/rnaseq_samples/{sample}/{sample}{suffix}' threads: 1 output: - html='{sample_dir}/{sample}/fastqc/{sample}{suffix}_fastqc.html', - zip='{sample_dir}/{sample}/fastqc/{sample}{suffix}_fastqc.zip', + html='data/rnaseq_samples/{sample}/fastqc/{sample}{suffix}_fastqc.html', + zip='data/rnaseq_samples/{sample}/fastqc/{sample}{suffix}_fastqc.zip', resources: mem_mb=gb(8), runtime=autobump(hours=2) log: - '{sample_dir}/{sample}/fastqc/{sample}{suffix}_fastqc.log', + 'data/rnaseq_samples/{sample}/fastqc/{sample}{suffix}_fastqc.log', run: outdir = os.path.dirname(output.html) or "." 
shell( @@ -250,12 +246,12 @@ if config["aligner"] == "hisat2": rule hisat2: input: - fastq=expand(patterns["cutadapt"], n=n, allow_missing=True), + fastq=rules.cutadapt.output, index=rules.hisat2_index.output, output: - bam=temporary(patterns["bam"]), + bam=temporary("data/rnaseq_samples/{sample}/{sample}.cutadapt.bam") log: - patterns["bam"] + ".log", + "data/rnaseq_samples/{sample}/{sample}.cutadapt.bam.log" threads: 16 resources: mem_mb=gb(32), @@ -292,7 +288,10 @@ if config["aligner"] == "hisat2": if config["aligner"].startswith("star"): - + if os.getenv("TMPDIR"): + tmpdir_arg = "--outTmpDir $TMPDIR/star " + else: + tmpdir_arg = "" # STAR can be run in 1-pass or 2-pass modes. Since we may be running it # more than once in almost the same way, we pull out the shell command here # and use it below. @@ -303,6 +302,7 @@ if config["aligner"].startswith("star"): "--readFilesIn {input.fastq} " "--readFilesCommand zcat " "--outFileNamePrefix {prefix} " + "{tmpdir_arg} " "{params.extra} " ) STAR_PARAMS = ( @@ -327,18 +327,19 @@ if config["aligner"] == "star": rule star: "Align with STAR (1-pass mode)" input: - fastq=expand(patterns["cutadapt"], n=n, allow_missing=True), + fastq=rules.cutadapt.output, index=rules.star_index.output, annotation=f"{REFERENCES}/annotation.gtf", output: - bam=temporary(patterns["bam"]), - sjout=temporary(patterns["bam"].replace(".bam", ".star.SJ.out.tab")), + bam=temporary("data/rnaseq_samples/{sample}/{sample}.cutadapt.bam"), + sjout=temporary("data/rnaseq_samples/{sample}/{sample}.cutadapt.star.SJ.out.tab") log: - patterns["bam"].replace(".bam", ".star.bam.log"), + "data/rnaseq_samples/{sample}/{sample}.cutadapt.bam.log" threads: 16 resources: mem_mb=gb(64), runtime=autobump(hours=8), + disk_mb=gb(80), params: extra=STAR_PARAMS, run: @@ -366,17 +367,18 @@ if config["aligner"] == "star-twopass": rule star_pass1: "First pass of alignment with STAR to get the junctions" input: - fastq=expand(patterns["cutadapt"], n=n, allow_missing=True), + 
fastq=rules.cutadapt.output, index=rules.star_index.output, annotation=f"{REFERENCES}/annotation.gtf", output: - sjout=temporary(patterns["bam"].replace(".bam", ".star-pass1.SJ.out.tab")), + sjout=temporary("data/rnaseq_samples/{sample}/{sample}.cutadapt.star.SJ.out.tab") log: - patterns["bam"].replace(".bam", ".star-pass1.bam.log"), + "data/rnaseq_samples/{sample}/{sample}.cutadapt.star-pass1.log" threads: 16 resources: mem_mb=gb(64), runtime=autobump(hours=8), + disk_mb=gb(80), params: extra=STAR_PARAMS, run: @@ -407,22 +409,20 @@ if config["aligner"] == "star-twopass": samples to get the final BAM """ input: - fastq=expand(patterns["cutadapt"], n=n, allow_missing=True), + fastq=rules.cutadapt.output, index=rules.star_index.output, annotation=f"{REFERENCES}/annotation.gtf", - sjout=expand( - patterns["bam"].replace(".bam", ".star-pass1.SJ.out.tab"), - sample=SAMPLES, - ), + sjout=expand(rules.star_pass1.output, sample=SAMPLES) output: - bam=temporary(patterns["bam"]), - sjout=temporary(patterns["bam"].replace(".bam", ".star-pass2.SJ.out.tab")), + bam=temporary("data/rnaseq_samples/{sample}/{sample}.cutadapt.bam"), + sjout=temporary("data/rnaseq_samples/{sample}/{sample}.cutadapt.star-pass2.SJ.out.tab") log: - patterns["bam"].replace(".bam", ".star-pass2.bam.log"), + "data/rnaseq_samples/{sample}/{sample}.cutadapt.star-pass2.log" threads: 16 resources: mem_mb=gb(64), runtime=autobump(hours=8), + disk_mb=gb(80), params: extra=STAR_PARAMS, run: @@ -455,7 +455,7 @@ if config["aligner"] == "star-twopass": rule rRNA: input: - fastq=expand(patterns["cutadapt"], n=1, allow_missing=True), + fastq='data/rnaseq_samples/{sample}/{sample}_R1.cutadapt.fastq.gz', index=multiext( f"{REFERENCES}/bowtie2/rrna", ".1.bt2", @@ -467,9 +467,9 @@ rule rRNA: ".fa", ), output: - bam=temporary(patterns["rrna"]["bam"]), + bam='data/rnaseq_samples/{sample}/rRNA/{sample}.cutadapt.rrna.bam', log: - patterns["rrna"]["bam"] + ".log", + 
'data/rnaseq_samples/{sample}/rRNA/{sample}.cutadapt.rrna.bam.log' threads: 6 resources: mem_mb=gb(2), @@ -539,14 +539,40 @@ rule bam_index: "samtools index {input} {output}" +rule markduplicates: + input: + bam='data/rnaseq_samples/{sample}/{sample}.cutadapt.bam' + output: + bam='data/rnaseq_samples/{sample}/{sample}.cutadapt.markdups.bam', + metrics='data/rnaseq_samples/{sample}/{sample}.cutadapt.markdups.bam.metrics' + log: + 'data/rnaseq_samples/{sample}/{sample}.cutadapt.markdups.bam.log' + threads: 1 + resources: + mem_mb=gb(32), + runtime=autobump(hours=2), + disk_mb=autobump(gb=100), + params: + java_args="-Xmx20g", # [disable for test] + # java_args='-Xmx2g' # [enable for test] + shell: + "picard " + "{params.java_args} " + "MarkDuplicates " + "INPUT={input.bam} " + "OUTPUT={output.bam} " + "METRICS_FILE={output.metrics} " + "VALIDATION_STRINGENCY=LENIENT " + "&> {log}" + rule featurecounts: input: annotation=rules.gtf.output, - bam=patterns["markduplicates"]["bam"], + bam=rules.markduplicates.output.bam output: - patterns["featurecounts"]["per_sample"] + 'data/rnaseq_samples/{sample}/{sample}_featurecounts.txt' log: - patterns["featurecounts"]["per_sample"] + ".log" + 'data/rnaseq_samples/{sample}/{sample}_featurecounts.txt.log' threads: 8 resources: mem_mb=gb(16), @@ -575,11 +601,11 @@ rule featurecounts: rule aggregate_featurecounts: input: - expand(patterns["featurecounts"]["per_sample"], sample=SAMPLES) + expand('data/rnaseq_samples/{sample}/{sample}_featurecounts.txt', sample=SAMPLES) output: - patterns["featurecounts"]["aggregated"] + 'data/rnaseq_aggregation/featurecounts.txt' log: - patterns["featurecounts"]["aggregated"] + ".log" + 'data/rnaseq_aggregation/featurecounts.txt.log' threads: 1 resources: @@ -595,13 +621,14 @@ rule aggregate_featurecounts: final[df.columns[-1]] = df[df.columns[-1]] final.to_csv(output[0], sep="\t", index=False) + rule rrna_libsizes_table: input: - rrna=expand(patterns["rrna"]["libsize"], sample=SAMPLES), - 
fastq=expand(patterns["libsizes"]["cutadapt"], sample=SAMPLES), + rrna=expand('data/rnaseq_samples/{sample}/rRNA/{sample}.cutadapt.rrna.bam.libsize', sample=SAMPLES), + fastq=expand('data/rnaseq_samples/{sample}/{sample}_R1.cutadapt.fastq.gz.libsize', sample=SAMPLES), output: - json=patterns["rrna_percentages_yaml"], - tsv=patterns["rrna_percentages_table"], + tsv='data/rnaseq_aggregation/rrna_percentages_table.tsv', + json='data/rnaseq_aggregation/rrna_percentages_table_mqc.yaml', threads: 1 resources: mem_mb=gb(2), @@ -610,87 +637,18 @@ rule rrna_libsizes_table: "../../scripts/rrna_libsizes_table.py" -rule multiqc: - input: - files=( - expand(patterns["fastqc"]["raw"], sample=SAMPLES), - expand(patterns["fastqc"]["cutadapt"], sample=SAMPLES), - expand(patterns["fastqc"]["bam"], sample=SAMPLES), - expand(patterns["markduplicates"]["bam"], sample=SAMPLES), - expand(patterns["salmon"], sample=SAMPLES), - expand(patterns["kallisto"], sample=SAMPLES), - expand(patterns["preseq"], sample=SAMPLES), - expand(patterns["rseqc"]["infer_experiment"], sample=SAMPLES), - expand(patterns["rseqc"]["read_distribution"], sample=SAMPLES), - expand(patterns["collectrnaseqmetrics"]["metrics"], sample=SAMPLES), - expand(patterns["samtools"]["idxstats"], sample=SAMPLES), - expand(patterns["samtools"]["flagstat"], sample=SAMPLES), - expand(patterns["samtools"]["stats"], sample=SAMPLES), - - patterns["rrna_percentages_table"], - patterns["featurecounts"]["aggregated"], - ), - config="config/multiqc_config.yaml", - output: - "data/rnaseq_aggregation/multiqc.html", - log: - "data/rnaseq_aggregation/multiqc.log", - threads: 1 - resources: - mem_mb=gb(2), - runtime=autobump(hours=2), - run: - analysis_directory = set([os.path.dirname(i) for i in input]) - outdir = os.path.dirname(output[0]) - basename = os.path.basename(output[0]) - shell( - "LC_ALL=en_US.utf8 LC_LANG=en_US.utf8 " - "multiqc " - "--quiet " - "--outdir {outdir} " - "--force " - "--filename {basename} " - "--config 
{input.config} " - "{analysis_directory} " - "&> {log} " - ) -rule markduplicates: - input: - bam=patterns["bam"], - output: - bam=patterns["markduplicates"]["bam"], - metrics=patterns["markduplicates"]["metrics"], - log: - patterns["markduplicates"]["bam"] + ".log", - threads: 1 - resources: - mem_mb=gb(32), - runtime=autobump(hours=2), - disk_mb=autobump(gb=100), - params: - java_args="-Xmx20g", # [disable for test] - # java_args='-Xmx2g' # [enable for test] - shell: - "picard " - "{params.java_args} " - "MarkDuplicates " - "INPUT={input.bam} " - "OUTPUT={output.bam} " - "METRICS_FILE={output.metrics} " - "VALIDATION_STRINGENCY=LENIENT " - "&> {log}" rule collectrnaseqmetrics: input: - bam=patterns["markduplicates"]["bam"], + bam=rules.markduplicates.output.bam, refflat=rules.conversion_refflat.output, output: - metrics=patterns["collectrnaseqmetrics"]["metrics"], + metrics='data/rnaseq_samples/{sample}/{sample}.collectrnaseqmetrics.metrics' log: - patterns["collectrnaseqmetrics"]["metrics"] + ".log", + 'data/rnaseq_samples/{sample}/{sample}.collectrnaseqmetrics.metrics.log' threads: 1 resources: mem_mb=gb(32), @@ -719,9 +677,11 @@ rule collectrnaseqmetrics: rule preseq: input: - bam=patterns["bam"], + bam='data/rnaseq_samples/{sample}/{sample}.cutadapt.markdups.bam', output: - patterns["preseq"], + 'data/rnaseq_samples/{sample}/{sample}_preseq_c_curve.txt' + log: + 'data/rnaseq_samples/{sample}/{sample}_preseq_c_curve.txt.log' threads: 1 resources: mem_mb=gb(1), @@ -731,16 +691,17 @@ rule preseq: "c_curve " "-B {input} " "-o {output} " + "&> {log}" rule salmon: input: - fastq=expand(patterns["cutadapt"], n=n, allow_missing=True), + fastq=rules.cutadapt.output, index=REFERENCES + "/salmon/versionInfo.json", output: - patterns["salmon"], + 'data/rnaseq_samples/{sample}/{sample}.salmon/quant.sf' log: - patterns["salmon"] + ".log", + 'data/rnaseq_samples/{sample}/{sample}.salmon/quant.sf.log' threads: 6 resources: mem_mb=gb(32), @@ -772,12 +733,12 @@ rule salmon: 
rule kallisto: input: - fastq=expand(patterns["cutadapt"], n=n, allow_missing=True), + fastq=rules.cutadapt.output, index=REFERENCES + "/kallisto/transcripts.idx", output: - patterns["kallisto"], + 'data/rnaseq_samples/{sample}/{sample}.kallisto/abundance.h5' log: - patterns["kallisto"] + ".log", + 'data/rnaseq_samples/{sample}/{sample}.kallisto/abundance.h5.log' threads: 8 resources: mem_mb=gb(32), @@ -811,12 +772,12 @@ rule kallisto: rule rseqc_infer_experiment: input: - bam=patterns["markduplicates"]["bam"], + bam=rules.markduplicates.output, bed12=rules.conversion_bed12.output, output: - txt=patterns["rseqc"]["infer_experiment"], + 'data/rnaseq_samples/{sample}/rseqc/{sample}_infer_experiment.txt', log: - patterns["rseqc"]["infer_experiment"] + ".log", + 'data/rnaseq_samples/{sample}/rseqc/{sample}_infer_experiment.txt.log' resources: mem_mb=gb(2), runtime=autobump(hours=2), @@ -826,12 +787,12 @@ rule rseqc_infer_experiment: rule rseqc_read_distribution: input: - bam=patterns["markduplicates"]["bam"], + bam=rules.markduplicates.output, bed12=rules.conversion_bed12.output, output: - txt=patterns["rseqc"]["read_distribution"], + 'data/rnaseq_samples/{sample}/rseqc/{sample}_read_distribution.txt', log: - patterns["rseqc"]["read_distribution"] + ".log", + 'data/rnaseq_samples/{sample}/rseqc/{sample}_read_distribution.txt.log' resources: mem_mb=gb(2), runtime=autobump(hours=2), @@ -839,33 +800,57 @@ rule rseqc_read_distribution: "read_distribution.py -i {input.bam} -r {input.bed12} > {output} &> {log}" -rule idxstats: +rule samtools_idxstats: input: - bam=patterns["markduplicates"]["bam"], - bai=patterns["markduplicates"]["bam"] + ".bai", + bam=rules.markduplicates.output.bam, + bai=rules.markduplicates.output.bam + ".bai", output: - txt=patterns["samtools"]["idxstats"], + 'data/rnaseq_samples/{sample}/idxstat_{sample}.txt' log: - patterns["samtools"]["idxstats"] + ".log", + 'data/rnaseq_samples/{sample}/idxstat_{sample}.txt.log' resources: mem_mb=gb(16), 
runtime=autobump(hours=2), - run: - shell("samtools idxstats {input.bam} 2> {log} 1> {output.txt}") + shell: + "samtools idxstats {input.bam} 2> {log} 1> {output}" + + +rule samtools_flagstat: + input: + bam=rules.markduplicates.output.bam, + bai=rules.markduplicates.output.bam + ".bai", + output: + 'data/rnaseq_samples/{sample}/{sample}.cutadapt.markdups.bam.flagstat' + log: + 'data/rnaseq_samples/{sample}/{sample}.cutadapt.markdups.bam.flagstat.log' + shell: + "samtools flagstat {input.bam} > {output}" + + +rule samtools_stats: + input: + bam=rules.markduplicates.output.bam, + bai=rules.markduplicates.output.bam + ".bai", + output: + 'data/rnaseq_samples/{sample}/{sample}.cutadapt.markdups.bam.stats' + log: + 'data/rnaseq_samples/{sample}/{sample}.cutadapt.markdups.bam.stats.log' + shell: + "samtools stats {input.bam} > {output}" rule bigwig_neg: input: - bam=patterns["markduplicates"]["bam"], - bai=patterns["markduplicates"]["bam"] + ".bai", + bam=rules.markduplicates.output.bam, + bai=rules.markduplicates.output.bam + ".bai", output: - patterns["bigwig"]["neg"], + 'data/rnaseq_samples/{sample}/{sample}.cutadapt.bam.neg.bigwig' threads: 8 resources: mem_mb=gb(16), runtime=autobump(hours=2), log: - patterns["bigwig"]["neg"] + ".log", + 'data/rnaseq_samples/{sample}/{sample}.cutadapt.bam.neg.bigwig.log' params: strand_arg={ "unstranded": "", @@ -891,16 +876,16 @@ rule bigwig_neg: rule bigwig_pos: input: - bam=patterns["markduplicates"]["bam"], - bai=patterns["markduplicates"]["bam"] + ".bai", + bam=rules.markduplicates.output.bam, + bai=rules.markduplicates.output.bam + ".bai", output: - patterns["bigwig"]["pos"], + 'data/rnaseq_samples/{sample}/{sample}.cutadapt.bam.pos.bigwig' threads: 8 resources: mem_mb=gb(16), runtime=autobump(hours=2), log: - patterns["bigwig"]["pos"] + ".log", + 'data/rnaseq_samples/{sample}/{sample}.cutadapt.bam.pos.bigwig.log' params: strand_arg={ "unstranded": "", @@ -923,26 +908,47 @@ rule bigwig_pos: "&> {log}" ) - -rule flagstat: 
+rule multiqc: input: - bam=patterns["markduplicates"]["bam"], - bai=patterns["markduplicates"]["bam"] + ".bai", + files=( + expand(rules.fastqc.output.zip, sample=SAMPLES, suffix=["_R1.fastq.gz", "_R1.cutadapt.fastq.gz", ".cutadapt.bam"]), + expand(rules.markduplicates.output, sample=SAMPLES), + expand(rules.salmon.output, sample=SAMPLES), + expand(rules.kallisto.output, sample=SAMPLES), + expand(rules.preseq.output, sample=SAMPLES), + expand(rules.collectrnaseqmetrics.output, sample=SAMPLES), + expand(rules.samtools_stats.output, sample=SAMPLES), + expand(rules.samtools_flagstat.output, sample=SAMPLES), + expand(rules.samtools_idxstats.output, sample=SAMPLES), + expand(rules.rseqc_infer_experiment.output, sample=SAMPLES), + expand(rules.rseqc_read_distribution.output, sample=SAMPLES), + expand(rules.bigwig_pos.output, sample=SAMPLES), + expand(rules.bigwig_neg.output, sample=SAMPLES), + rules.rrna_libsizes_table.output, + ), + config="config/multiqc_config.yaml", output: - patterns["samtools"]["flagstat"], + "data/rnaseq_aggregation/multiqc.html", log: - patterns["samtools"]["flagstat"] + ".log", - shell: - "samtools flagstat {input.bam} > {output}" - + "data/rnaseq_aggregation/multiqc.log", + threads: 1 + resources: + mem_mb=gb(2), + runtime=autobump(hours=2), + disk_mb=gb(10) + run: + analysis_directory = set([os.path.dirname(i) for i in input]) + outdir = os.path.dirname(output[0]) + basename = os.path.basename(output[0]) + shell( + "LC_ALL=en_US.utf8 LC_LANG=en_US.utf8 " + "multiqc " + "--quiet " + "--outdir {outdir} " + "--force " + "--filename {basename} " + "--config {input.config} " + "{analysis_directory} " + "&> {log} " + ) -rule samtools_stats: - input: - bam=patterns["markduplicates"]["bam"], - bai=patterns["markduplicates"]["bam"] + ".bai", - output: - patterns["samtools"]["stats"], - log: - patterns["samtools"]["stats"] + ".log", - shell: - "samtools stats {input.bam} > {output}" From d5799fa619a3acb3961a16f6522ed5d440fba104 Mon Sep 17 00:00:00 2001 
From: Ryan Dale Date: Tue, 14 Jan 2025 18:24:08 -0500 Subject: [PATCH 066/196] snakefmt cleanup --- workflows/rnaseq/Snakefile | 282 +++++++++++++++++++++---------------- 1 file changed, 161 insertions(+), 121 deletions(-) diff --git a/workflows/rnaseq/Snakefile b/workflows/rnaseq/Snakefile index 5b9923b8..bf87a234 100644 --- a/workflows/rnaseq/Snakefile +++ b/workflows/rnaseq/Snakefile @@ -11,8 +11,6 @@ from lib.utils import autobump, gb, hours configfile: "config/config.yaml" - - REFERENCES = config.get("reference_dir", "../../references") sampletable = pd.read_table(config["sampletable"], sep="\t", comment="#") sampletable = sampletable.set_index(sampletable.columns[0], drop=False) @@ -35,28 +33,38 @@ rule all: input: "data/rnaseq_aggregation/multiqc.html", + include: "../references/Snakefile" + if utils.detect_sra(sampletable): - sampletable['orig_filename'] = expand( - 'original_data/sra_samples/{sample}/{sample}_R{n}.fastq.gz', sample=SAMPLES, n=1) + sampletable["orig_filename"] = expand( + "original_data/sra_samples/{sample}/{sample}_R{n}.fastq.gz", sample=SAMPLES, n=1 + ) if is_paired: - sampletable['orig_filename_R2'] = expand( - 'original_data/sra_samples/{sample}/{sample}_R{n}.fastq.gz', sample=SAMPLES, n=2) + sampletable["orig_filename_R2"] = expand( + "original_data/sra_samples/{sample}/{sample}_R{n}.fastq.gz", + sample=SAMPLES, + n=2, + ) rule fastq_dump: output: - fastq=expand('original_data/sra_samples/{sample}/{sample}_R{n}.fastq.gz', n=n, allow_missing=True) + fastq=expand( + "original_data/sra_samples/{sample}/{sample}_R{n}.fastq.gz", + n=n, + allow_missing=True, + ), log: - 'original_data/sra_samples/{sample}/{sample}.fastq.gz.log' + "original_data/sra_samples/{sample}/{sample}.fastq.gz.log", params: is_paired=is_paired, # extra="-X 100000", # [enable for test] resources: mem_mb=gb(1), disk_mb=autobump(gb=1), - runtime=autobump(hours=2) + runtime=autobump(hours=2), run: srr = sampletable.loc[wildcards.sample, "Run"] extra = params.get("extra", 
"") @@ -65,7 +73,9 @@ if utils.detect_sra(sampletable): shell("mv {srr}_1.fastq.gz {output[0]}") shell("mv {srr}_2.fastq.gz {output[1]}") else: - shell("fastq-dump {srr} -Z {extra} 2> {log} | gzip -c > {output[0]}.tmp") + shell( + "fastq-dump {srr} -Z {extra} 2> {log} | gzip -c > {output[0]}.tmp" + ) shell("mv {output[0]}.tmp {output[0]}") @@ -78,7 +88,7 @@ rule symlinks: else sampletable.loc[wc.sample, ["orig_filename"]] ), output: - expand('data/rnaseq_samples/{{sample}}/{{sample}}_R{n}.fastq.gz', n=n) + expand("data/rnaseq_samples/{{sample}}/{{sample}}_R{n}.fastq.gz", n=n), threads: 1 resources: mem_mb=100, @@ -100,73 +110,88 @@ rule symlink_targets: # automatically-subset data to evaluate strandedness. rule sample_strand_check: input: - fastq=expand('data/rnaseq_samples/{{sample}}/{{sample}}_R{n}.fastq.gz', n=n), + fastq=expand("data/rnaseq_samples/{{sample}}/{{sample}}_R{n}.fastq.gz", n=n), index=expand(rules.bowtie2_index.output, label="genome"), bed12=rules.conversion_bed12.output, output: - strandedness='strand_check/{sample}/{sample}.strandedness', - bam=temporary('strand_check/{sample}/{sample}.strandedness.bam'), - bai=temporary('strand_check/{sample}/{sample}.strandedness.bam.bai'), - fastqs=temporary(expand('strand_check/{sample}/{sample}_R{n}.strandedness.fastq', n=n, allow_missing=True)), + strandedness="strand_check/{sample}/{sample}.strandedness", + bam=temporary("strand_check/{sample}/{sample}.strandedness.bam"), + bai=temporary("strand_check/{sample}/{sample}.strandedness.bam.bai"), + fastqs=temporary( + expand( + "strand_check/{sample}/{sample}_R{n}.strandedness.fastq", + n=n, + allow_missing=True, + ) + ), log: - 'strand_check/{sample}/{sample}.strandedness.log' + "strand_check/{sample}/{sample}.strandedness.log", threads: 6 resources: mem_mb=gb(8), - runtime=autobump(hours=2) + runtime=autobump(hours=2), run: prefix = os.path.commonprefix(input.index).rstrip(".") nreads = int(1e5 * 4) if is_paired: - shell('set +o pipefail; zcat 
{input.fastq[0]} | head -n {nreads} > {output.fastqs[0]}') - shell('set +o pipefail; zcat {input.fastq[0]} | head -n {nreads} > {output.fastqs[1]}') - fastqs = f'-1 {output.fastqs[0]} -2 {output.fastqs[1]} ' + shell( + "set +o pipefail; zcat {input.fastq[0]} | head -n {nreads} > {output.fastqs[0]}" + ) + shell( + "set +o pipefail; zcat {input.fastq[0]} | head -n {nreads} > {output.fastqs[1]}" + ) + fastqs = f"-1 {output.fastqs[0]} -2 {output.fastqs[1]} " else: - shell('set +o pipefail; zcat {input.fastq[0]} | head -n {nreads} > {output.fastqs[0]}') - fastqs = f'-U {output.fastqs[0]} ' + shell( + "set +o pipefail; zcat {input.fastq[0]} | head -n {nreads} > {output.fastqs[0]}" + ) + fastqs = f"-U {output.fastqs[0]} " shell( "bowtie2 " "-x {prefix} " "{fastqs} " - '--no-unal ' + "--no-unal " "--threads {threads} 2> {log} " "| samtools view -Sb - " "| samtools sort - -o {output.bam} " ) shell("samtools index {output.bam}") shell( - 'infer_experiment.py -r {input.bed12} -i {output.bam} > {output} 2> {log}' + "infer_experiment.py -r {input.bed12} -i {output.bam} > {output} 2> {log}" ) rule strand_check: input: - expand('strand_check/{sample}/{sample}.strandedness', sample=SAMPLES) + expand("strand_check/{sample}/{sample}.strandedness", sample=SAMPLES), output: - html='strand_check/strandedness.html', - filelist=temporary('strand_check/filelist') + html="strand_check/strandedness.html", + filelist=temporary("strand_check/filelist"), log: - 'strand_check/strandedness.log' + "strand_check/strandedness.log", resources: mem_mb=gb(1), - runtime=autobump(hours=2) + runtime=autobump(hours=2), run: - with open(output.filelist, 'w') as fout: - for i in input: - fout.write(i + '\n') + with open(output.filelist, "w") as fout: + for i in input: + fout.write(i + "\n") shell( - 'multiqc ' - '--force ' - '--module rseqc ' - '--file-list {output.filelist} ' - '--filename {output.html} &> {log}' + "multiqc " + "--force " + "--module rseqc " + "--file-list {output.filelist} " + "--filename 
{output.html} &> {log}" ) + rule cutadapt: input: - fastq=expand('data/rnaseq_samples/{{sample}}/{{sample}}_R{n}.fastq.gz', n=n) + fastq=expand("data/rnaseq_samples/{{sample}}/{{sample}}_R{n}.fastq.gz", n=n), output: - fastq=expand('data/rnaseq_samples/{{sample}}/{{sample}}_R{n}.cutadapt.fastq.gz', n=n) + fastq=expand( + "data/rnaseq_samples/{{sample}}/{{sample}}_R{n}.cutadapt.fastq.gz", n=n + ), log: "data/rnaseq_samples/{sample}/{sample}_cutadapt.fastq.gz.log", threads: 6 @@ -210,36 +235,35 @@ rule cutadapt: rule fastqc: input: - 'data/rnaseq_samples/{sample}/{sample}{suffix}' - threads: - 1 + "data/rnaseq_samples/{sample}/{sample}{suffix}", + threads: 1 output: - html='data/rnaseq_samples/{sample}/fastqc/{sample}{suffix}_fastqc.html', - zip='data/rnaseq_samples/{sample}/fastqc/{sample}{suffix}_fastqc.zip', + html="data/rnaseq_samples/{sample}/fastqc/{sample}{suffix}_fastqc.html", + zip="data/rnaseq_samples/{sample}/fastqc/{sample}{suffix}_fastqc.zip", resources: mem_mb=gb(8), - runtime=autobump(hours=2) + runtime=autobump(hours=2), log: - 'data/rnaseq_samples/{sample}/fastqc/{sample}{suffix}_fastqc.log', + "data/rnaseq_samples/{sample}/fastqc/{sample}{suffix}_fastqc.log", run: outdir = os.path.dirname(output.html) or "." 
shell( - 'fastqc ' - '--noextract ' - '--quiet ' - '--outdir {outdir} ' - '{input} ' - '2> {log} ' + "fastqc " + "--noextract " + "--quiet " + "--outdir {outdir} " + "{input} " + "2> {log} " ) outfile = os.path.basename(input[0]) - for s in ['.fastq', '.fq', '.gz', '.bam']: - outfile = outfile.replace(s, '') - out_zip = os.path.join(outdir, outfile + '_fastqc.zip') + for s in [".fastq", ".fq", ".gz", ".bam"]: + outfile = outfile.replace(s, "") + out_zip = os.path.join(outdir, outfile + "_fastqc.zip") if not os.path.abspath(out_zip) == os.path.abspath(output.zip): - shell('mv {out_zip} {output.zip}') - out_html = os.path.join(outdir, outfile + '_fastqc.html') + shell("mv {out_zip} {output.zip}") + out_html = os.path.join(outdir, outfile + "_fastqc.html") if not os.path.abspath(out_html) == os.path.abspath(output.html): - shell('mv {out_html} {output.html}') + shell("mv {out_html} {output.html}") if config["aligner"] == "hisat2": @@ -249,9 +273,9 @@ if config["aligner"] == "hisat2": fastq=rules.cutadapt.output, index=rules.hisat2_index.output, output: - bam=temporary("data/rnaseq_samples/{sample}/{sample}.cutadapt.bam") + bam=temporary("data/rnaseq_samples/{sample}/{sample}.cutadapt.bam"), log: - "data/rnaseq_samples/{sample}/{sample}.cutadapt.bam.log" + "data/rnaseq_samples/{sample}/{sample}.cutadapt.bam.log", threads: 16 resources: mem_mb=gb(32), @@ -287,6 +311,7 @@ if config["aligner"] == "hisat2": ) + if config["aligner"].startswith("star"): if os.getenv("TMPDIR"): tmpdir_arg = "--outTmpDir $TMPDIR/star " @@ -332,9 +357,11 @@ if config["aligner"] == "star": annotation=f"{REFERENCES}/annotation.gtf", output: bam=temporary("data/rnaseq_samples/{sample}/{sample}.cutadapt.bam"), - sjout=temporary("data/rnaseq_samples/{sample}/{sample}.cutadapt.star.SJ.out.tab") + sjout=temporary( + "data/rnaseq_samples/{sample}/{sample}.cutadapt.star.SJ.out.tab" + ), log: - "data/rnaseq_samples/{sample}/{sample}.cutadapt.bam.log" + 
"data/rnaseq_samples/{sample}/{sample}.cutadapt.bam.log", threads: 16 resources: mem_mb=gb(64), @@ -371,9 +398,11 @@ if config["aligner"] == "star-twopass": index=rules.star_index.output, annotation=f"{REFERENCES}/annotation.gtf", output: - sjout=temporary("data/rnaseq_samples/{sample}/{sample}.cutadapt.star.SJ.out.tab") + sjout=temporary( + "data/rnaseq_samples/{sample}/{sample}.cutadapt.star.SJ.out.tab" + ), log: - "data/rnaseq_samples/{sample}/{sample}.cutadapt.star-pass1.log" + "data/rnaseq_samples/{sample}/{sample}.cutadapt.star-pass1.log", threads: 16 resources: mem_mb=gb(64), @@ -412,12 +441,14 @@ if config["aligner"] == "star-twopass": fastq=rules.cutadapt.output, index=rules.star_index.output, annotation=f"{REFERENCES}/annotation.gtf", - sjout=expand(rules.star_pass1.output, sample=SAMPLES) + sjout=expand(rules.star_pass1.output, sample=SAMPLES), output: bam=temporary("data/rnaseq_samples/{sample}/{sample}.cutadapt.bam"), - sjout=temporary("data/rnaseq_samples/{sample}/{sample}.cutadapt.star-pass2.SJ.out.tab") + sjout=temporary( + "data/rnaseq_samples/{sample}/{sample}.cutadapt.star-pass2.SJ.out.tab" + ), log: - "data/rnaseq_samples/{sample}/{sample}.cutadapt.star-pass2.log" + "data/rnaseq_samples/{sample}/{sample}.cutadapt.star-pass2.log", threads: 16 resources: mem_mb=gb(64), @@ -455,7 +486,7 @@ if config["aligner"] == "star-twopass": rule rRNA: input: - fastq='data/rnaseq_samples/{sample}/{sample}_R1.cutadapt.fastq.gz', + fastq="data/rnaseq_samples/{sample}/{sample}_R1.cutadapt.fastq.gz", index=multiext( f"{REFERENCES}/bowtie2/rrna", ".1.bt2", @@ -467,9 +498,9 @@ rule rRNA: ".fa", ), output: - bam='data/rnaseq_samples/{sample}/rRNA/{sample}.cutadapt.rrna.bam', + bam="data/rnaseq_samples/{sample}/rRNA/{sample}.cutadapt.rrna.bam", log: - 'data/rnaseq_samples/{sample}/rRNA/{sample}.cutadapt.rrna.bam.log' + "data/rnaseq_samples/{sample}/rRNA/{sample}.cutadapt.rrna.bam.log", threads: 6 resources: mem_mb=gb(2), @@ -541,12 +572,12 @@ rule bam_index: rule 
markduplicates: input: - bam='data/rnaseq_samples/{sample}/{sample}.cutadapt.bam' + bam="data/rnaseq_samples/{sample}/{sample}.cutadapt.bam", output: - bam='data/rnaseq_samples/{sample}/{sample}.cutadapt.markdups.bam', - metrics='data/rnaseq_samples/{sample}/{sample}.cutadapt.markdups.bam.metrics' + bam="data/rnaseq_samples/{sample}/{sample}.cutadapt.markdups.bam", + metrics="data/rnaseq_samples/{sample}/{sample}.cutadapt.markdups.bam.metrics", log: - 'data/rnaseq_samples/{sample}/{sample}.cutadapt.markdups.bam.log' + "data/rnaseq_samples/{sample}/{sample}.cutadapt.markdups.bam.log", threads: 1 resources: mem_mb=gb(32), @@ -565,14 +596,15 @@ rule markduplicates: "VALIDATION_STRINGENCY=LENIENT " "&> {log}" + rule featurecounts: input: annotation=rules.gtf.output, - bam=rules.markduplicates.output.bam + bam=rules.markduplicates.output.bam, output: - 'data/rnaseq_samples/{sample}/{sample}_featurecounts.txt' + "data/rnaseq_samples/{sample}/{sample}_featurecounts.txt", log: - 'data/rnaseq_samples/{sample}/{sample}_featurecounts.txt.log' + "data/rnaseq_samples/{sample}/{sample}_featurecounts.txt.log", threads: 8 resources: mem_mb=gb(16), @@ -599,22 +631,24 @@ rule featurecounts: "&> {log}" ) + rule aggregate_featurecounts: input: - expand('data/rnaseq_samples/{sample}/{sample}_featurecounts.txt', sample=SAMPLES) + expand( + "data/rnaseq_samples/{sample}/{sample}_featurecounts.txt", sample=SAMPLES + ), output: - 'data/rnaseq_aggregation/featurecounts.txt' - log: - 'data/rnaseq_aggregation/featurecounts.txt.log' - threads: - 1 + "data/rnaseq_aggregation/featurecounts.txt", + log: + "data/rnaseq_aggregation/featurecounts.txt.log", + threads: 1 resources: mem_mb=gb(8), - runtime=autobump(hours=1) + runtime=autobump(hours=1), run: for i, file in enumerate(input): df = pd.read_csv(file, sep="\t", comment="#") - df = df.set_index('Geneid', drop=False) + df = df.set_index("Geneid", drop=False) if i == 0: final = df continue @@ -624,11 +658,17 @@ rule aggregate_featurecounts: 
rule rrna_libsizes_table: input: - rrna=expand('data/rnaseq_samples/{sample}/rRNA/{sample}.cutadapt.rrna.bam.libsize', sample=SAMPLES), - fastq=expand('data/rnaseq_samples/{sample}/{sample}_R1.cutadapt.fastq.gz.libsize', sample=SAMPLES), + rrna=expand( + "data/rnaseq_samples/{sample}/rRNA/{sample}.cutadapt.rrna.bam.libsize", + sample=SAMPLES, + ), + fastq=expand( + "data/rnaseq_samples/{sample}/{sample}_R1.cutadapt.fastq.gz.libsize", + sample=SAMPLES, + ), output: - tsv='data/rnaseq_aggregation/rrna_percentages_table.tsv', - json='data/rnaseq_aggregation/rrna_percentages_table_mqc.yaml', + tsv="data/rnaseq_aggregation/rrna_percentages_table.tsv", + json="data/rnaseq_aggregation/rrna_percentages_table_mqc.yaml", threads: 1 resources: mem_mb=gb(2), @@ -637,24 +677,20 @@ rule rrna_libsizes_table: "../../scripts/rrna_libsizes_table.py" - - - - rule collectrnaseqmetrics: input: bam=rules.markduplicates.output.bam, refflat=rules.conversion_refflat.output, output: - metrics='data/rnaseq_samples/{sample}/{sample}.collectrnaseqmetrics.metrics' + metrics="data/rnaseq_samples/{sample}/{sample}.collectrnaseqmetrics.metrics", log: - 'data/rnaseq_samples/{sample}/{sample}.collectrnaseqmetrics.metrics.log' + "data/rnaseq_samples/{sample}/{sample}.collectrnaseqmetrics.metrics.log", threads: 1 resources: mem_mb=gb(32), runtime=autobump(hours=2), params: - java_args="-Xmx20g", # [disable for test] + java_args="-Xmx20g", # [disable for test] # java_args='-Xmx2g', # [enable for test] strand_arg={ "unstranded": "STRAND=NONE ", @@ -677,11 +713,11 @@ rule collectrnaseqmetrics: rule preseq: input: - bam='data/rnaseq_samples/{sample}/{sample}.cutadapt.markdups.bam', + bam="data/rnaseq_samples/{sample}/{sample}.cutadapt.markdups.bam", output: - 'data/rnaseq_samples/{sample}/{sample}_preseq_c_curve.txt' + "data/rnaseq_samples/{sample}/{sample}_preseq_c_curve.txt", log: - 'data/rnaseq_samples/{sample}/{sample}_preseq_c_curve.txt.log' + 
"data/rnaseq_samples/{sample}/{sample}_preseq_c_curve.txt.log", threads: 1 resources: mem_mb=gb(1), @@ -699,9 +735,9 @@ rule salmon: fastq=rules.cutadapt.output, index=REFERENCES + "/salmon/versionInfo.json", output: - 'data/rnaseq_samples/{sample}/{sample}.salmon/quant.sf' + "data/rnaseq_samples/{sample}/{sample}.salmon/quant.sf", log: - 'data/rnaseq_samples/{sample}/{sample}.salmon/quant.sf.log' + "data/rnaseq_samples/{sample}/{sample}.salmon/quant.sf.log", threads: 6 resources: mem_mb=gb(32), @@ -736,9 +772,9 @@ rule kallisto: fastq=rules.cutadapt.output, index=REFERENCES + "/kallisto/transcripts.idx", output: - 'data/rnaseq_samples/{sample}/{sample}.kallisto/abundance.h5' + "data/rnaseq_samples/{sample}/{sample}.kallisto/abundance.h5", log: - 'data/rnaseq_samples/{sample}/{sample}.kallisto/abundance.h5.log' + "data/rnaseq_samples/{sample}/{sample}.kallisto/abundance.h5.log", threads: 8 resources: mem_mb=gb(32), @@ -775,9 +811,9 @@ rule rseqc_infer_experiment: bam=rules.markduplicates.output, bed12=rules.conversion_bed12.output, output: - 'data/rnaseq_samples/{sample}/rseqc/{sample}_infer_experiment.txt', + "data/rnaseq_samples/{sample}/rseqc/{sample}_infer_experiment.txt", log: - 'data/rnaseq_samples/{sample}/rseqc/{sample}_infer_experiment.txt.log' + "data/rnaseq_samples/{sample}/rseqc/{sample}_infer_experiment.txt.log", resources: mem_mb=gb(2), runtime=autobump(hours=2), @@ -790,9 +826,9 @@ rule rseqc_read_distribution: bam=rules.markduplicates.output, bed12=rules.conversion_bed12.output, output: - 'data/rnaseq_samples/{sample}/rseqc/{sample}_read_distribution.txt', + "data/rnaseq_samples/{sample}/rseqc/{sample}_read_distribution.txt", log: - 'data/rnaseq_samples/{sample}/rseqc/{sample}_read_distribution.txt.log' + "data/rnaseq_samples/{sample}/rseqc/{sample}_read_distribution.txt.log", resources: mem_mb=gb(2), runtime=autobump(hours=2), @@ -805,9 +841,9 @@ rule samtools_idxstats: bam=rules.markduplicates.output.bam, bai=rules.markduplicates.output.bam + 
".bai", output: - 'data/rnaseq_samples/{sample}/idxstat_{sample}.txt' + "data/rnaseq_samples/{sample}/idxstat_{sample}.txt", log: - 'data/rnaseq_samples/{sample}/idxstat_{sample}.txt.log' + "data/rnaseq_samples/{sample}/idxstat_{sample}.txt.log", resources: mem_mb=gb(16), runtime=autobump(hours=2), @@ -820,9 +856,9 @@ rule samtools_flagstat: bam=rules.markduplicates.output.bam, bai=rules.markduplicates.output.bam + ".bai", output: - 'data/rnaseq_samples/{sample}/{sample}.cutadapt.markdups.bam.flagstat' + "data/rnaseq_samples/{sample}/{sample}.cutadapt.markdups.bam.flagstat", log: - 'data/rnaseq_samples/{sample}/{sample}.cutadapt.markdups.bam.flagstat.log' + "data/rnaseq_samples/{sample}/{sample}.cutadapt.markdups.bam.flagstat.log", shell: "samtools flagstat {input.bam} > {output}" @@ -832,9 +868,9 @@ rule samtools_stats: bam=rules.markduplicates.output.bam, bai=rules.markduplicates.output.bam + ".bai", output: - 'data/rnaseq_samples/{sample}/{sample}.cutadapt.markdups.bam.stats' + "data/rnaseq_samples/{sample}/{sample}.cutadapt.markdups.bam.stats", log: - 'data/rnaseq_samples/{sample}/{sample}.cutadapt.markdups.bam.stats.log' + "data/rnaseq_samples/{sample}/{sample}.cutadapt.markdups.bam.stats.log", shell: "samtools stats {input.bam} > {output}" @@ -844,13 +880,13 @@ rule bigwig_neg: bam=rules.markduplicates.output.bam, bai=rules.markduplicates.output.bam + ".bai", output: - 'data/rnaseq_samples/{sample}/{sample}.cutadapt.bam.neg.bigwig' + "data/rnaseq_samples/{sample}/{sample}.cutadapt.bam.neg.bigwig", threads: 8 resources: mem_mb=gb(16), runtime=autobump(hours=2), log: - 'data/rnaseq_samples/{sample}/{sample}.cutadapt.bam.neg.bigwig.log' + "data/rnaseq_samples/{sample}/{sample}.cutadapt.bam.neg.bigwig.log", params: strand_arg={ "unstranded": "", @@ -879,13 +915,13 @@ rule bigwig_pos: bam=rules.markduplicates.output.bam, bai=rules.markduplicates.output.bam + ".bai", output: - 'data/rnaseq_samples/{sample}/{sample}.cutadapt.bam.pos.bigwig' + 
"data/rnaseq_samples/{sample}/{sample}.cutadapt.bam.pos.bigwig", threads: 8 resources: mem_mb=gb(16), runtime=autobump(hours=2), log: - 'data/rnaseq_samples/{sample}/{sample}.cutadapt.bam.pos.bigwig.log' + "data/rnaseq_samples/{sample}/{sample}.cutadapt.bam.pos.bigwig.log", params: strand_arg={ "unstranded": "", @@ -908,10 +944,15 @@ rule bigwig_pos: "&> {log}" ) + rule multiqc: input: files=( - expand(rules.fastqc.output.zip, sample=SAMPLES, suffix=["_R1.fastq.gz", "_R1.cutadapt.fastq.gz", ".cutadapt.bam"]), + expand( + rules.fastqc.output.zip, + sample=SAMPLES, + suffix=["_R1.fastq.gz", "_R1.cutadapt.fastq.gz", ".cutadapt.bam"], + ), expand(rules.markduplicates.output, sample=SAMPLES), expand(rules.salmon.output, sample=SAMPLES), expand(rules.kallisto.output, sample=SAMPLES), @@ -935,7 +976,7 @@ rule multiqc: resources: mem_mb=gb(2), runtime=autobump(hours=2), - disk_mb=gb(10) + disk_mb=gb(10), run: analysis_directory = set([os.path.dirname(i) for i in input]) outdir = os.path.dirname(output[0]) @@ -951,4 +992,3 @@ rule multiqc: "{analysis_directory} " "&> {log} " ) - From da2fc328078b22877a3c5f24e087c05a6d66b584 Mon Sep 17 00:00:00 2001 From: Ryan Dale Date: Sun, 19 Jan 2025 13:03:50 -0500 Subject: [PATCH 067/196] rrna_libsizes_table script avoids utils --- scripts/rrna_libsizes_table.py | 22 ++++++++++++++++------ workflows/rnaseq/Snakefile | 3 +++ 2 files changed, 19 insertions(+), 6 deletions(-) diff --git a/scripts/rrna_libsizes_table.py b/scripts/rrna_libsizes_table.py index f71d48bc..ea2b6820 100644 --- a/scripts/rrna_libsizes_table.py +++ b/scripts/rrna_libsizes_table.py @@ -2,21 +2,31 @@ Prepares a TSV and JSON file for multiqc to pick up and display as a sortable table """ -import sys import os +import re import pandas as pd import yaml - -sys.path.insert(0, os.path.dirname(__file__) + "/..") -from lib import utils +from snakemake.io import regex_from_filepattern def rrna_sample(f): - return 
utils.extract_wildcards(snakemake.config["patterns"]["rrna"]["libsize"], f)["sample"] + m = re.compile( + regex_from_filepattern( + snakemake.params.rrna_pattern, + ) + ).match(f) + if m: + return m.groupdict()["sample"] def sample(f): - return utils.extract_wildcards(snakemake.config["patterns"]["libsizes"]["cutadapt"], f)["sample"] + m = re.compile( + regex_from_filepattern( + snakemake.params.fastq_pattern, + ) + ).match(f) + if m: + return m.groupdict()["sample"] def million(f): diff --git a/workflows/rnaseq/Snakefile b/workflows/rnaseq/Snakefile index bf87a234..3f2e5b90 100644 --- a/workflows/rnaseq/Snakefile +++ b/workflows/rnaseq/Snakefile @@ -670,6 +670,9 @@ rule rrna_libsizes_table: tsv="data/rnaseq_aggregation/rrna_percentages_table.tsv", json="data/rnaseq_aggregation/rrna_percentages_table_mqc.yaml", threads: 1 + params: + rrna_pattern=lambda wc: "data/rnaseq_samples/{sample}/rRNA/{sample}.cutadapt.rrna.bam.libsize", + fastq_pattern=lambda wc: "data/rnaseq_samples/{sample}/{sample}_R1.cutadapt.fastq.gz.libsize", resources: mem_mb=gb(2), runtime=autobump(hours=2), From b049ef6e8371cf1961f5ad73ecccf5b01d97c074 Mon Sep 17 00:00:00 2001 From: Ryan Dale Date: Sun, 19 Jan 2025 13:08:20 -0500 Subject: [PATCH 068/196] use mem and disk rather than mem_mb and disk_mb --- workflows/rnaseq/Snakefile | 131 +++++++++++++++++++------------------ 1 file changed, 68 insertions(+), 63 deletions(-) diff --git a/workflows/rnaseq/Snakefile b/workflows/rnaseq/Snakefile index 3f2e5b90..7247bbc2 100644 --- a/workflows/rnaseq/Snakefile +++ b/workflows/rnaseq/Snakefile @@ -5,7 +5,6 @@ import pandas as pd sys.path.insert(0, os.path.dirname(workflow.snakefile) + "/../..") from lib import utils -from lib.utils import autobump, gb, hours configfile: "config/config.yaml" @@ -62,9 +61,9 @@ if utils.detect_sra(sampletable): is_paired=is_paired, # extra="-X 100000", # [enable for test] resources: - mem_mb=gb(1), - disk_mb=autobump(gb=1), - runtime=autobump(hours=2), + mem="1g", + 
disk="1g", + runtime="2h", run: srr = sampletable.loc[wildcards.sample, "Run"] extra = params.get("extra", "") @@ -91,8 +90,8 @@ rule symlinks: expand("data/rnaseq_samples/{{sample}}/{{sample}}_R{n}.fastq.gz", n=n), threads: 1 resources: - mem_mb=100, - runtime=10, + mem="1g", + runtime="10m", run: assert len(output) == len(input), (input, output) for src, linkname in zip(input, output): @@ -128,8 +127,8 @@ rule sample_strand_check: "strand_check/{sample}/{sample}.strandedness.log", threads: 6 resources: - mem_mb=gb(8), - runtime=autobump(hours=2), + mem="8g", + runtime="2h", run: prefix = os.path.commonprefix(input.index).rstrip(".") nreads = int(1e5 * 4) @@ -170,8 +169,8 @@ rule strand_check: log: "strand_check/strandedness.log", resources: - mem_mb=gb(1), - runtime=autobump(hours=2), + mem="1g", + runtime="2h", run: with open(output.filelist, "w") as fout: for i in input: @@ -196,8 +195,8 @@ rule cutadapt: "data/rnaseq_samples/{sample}/{sample}_cutadapt.fastq.gz.log", threads: 6 resources: - mem_mb=gb(2), - runtime=autobump(hours=2), + mem="2g", + runtime="2h", params: extra=( ( @@ -241,8 +240,8 @@ rule fastqc: html="data/rnaseq_samples/{sample}/fastqc/{sample}{suffix}_fastqc.html", zip="data/rnaseq_samples/{sample}/fastqc/{sample}{suffix}_fastqc.zip", resources: - mem_mb=gb(8), - runtime=autobump(hours=2), + mem="8g", + runtime="2h", log: "data/rnaseq_samples/{sample}/fastqc/{sample}{suffix}_fastqc.log", run: @@ -278,8 +277,8 @@ if config["aligner"] == "hisat2": "data/rnaseq_samples/{sample}/{sample}.cutadapt.bam.log", threads: 16 resources: - mem_mb=gb(32), - runtime=autobump(hours=8), + mem="32g", + runtime="8h", params: extra="", run: @@ -364,9 +363,9 @@ if config["aligner"] == "star": "data/rnaseq_samples/{sample}/{sample}.cutadapt.bam.log", threads: 16 resources: - mem_mb=gb(64), - runtime=autobump(hours=8), - disk_mb=gb(80), + mem="64g", + runtime="8h", + disk="80g", params: extra=STAR_PARAMS, run: @@ -405,9 +404,9 @@ if config["aligner"] == 
"star-twopass": "data/rnaseq_samples/{sample}/{sample}.cutadapt.star-pass1.log", threads: 16 resources: - mem_mb=gb(64), - runtime=autobump(hours=8), - disk_mb=gb(80), + mem="64g", + runtime="8h", + disk="80g", params: extra=STAR_PARAMS, run: @@ -451,9 +450,9 @@ if config["aligner"] == "star-twopass": "data/rnaseq_samples/{sample}/{sample}.cutadapt.star-pass2.log", threads: 16 resources: - mem_mb=gb(64), - runtime=autobump(hours=8), - disk_mb=gb(80), + mem="64g", + runtime="8h", + disk="80g", params: extra=STAR_PARAMS, run: @@ -503,8 +502,8 @@ rule rRNA: "data/rnaseq_samples/{sample}/rRNA/{sample}.cutadapt.rrna.bam.log", threads: 6 resources: - mem_mb=gb(2), - runtime=autobump(hours=2), + mem="2g", + runtime="2h", params: extra=( "-k 1 " @@ -538,8 +537,8 @@ rule fastq_count: "{sample_dir}/{sample}/{sample}{suffix}.fastq.gz.libsize", threads: 1 resources: - mem_mb=gb(1), - runtime=autobump(hours=2), + mem="1g", + runtime="2h", shell: "zcat {input} | echo $((`wc -l`/4)) > {output}" @@ -551,8 +550,8 @@ rule bam_count: "{sample_dir}/{sample}/{suffix}.bam.libsize", threads: 1 resources: - mem_mb=gb(2), - runtime=autobump(hours=2), + mem="2g", + runtime="2h", shell: "samtools view -c {input} > {output}" @@ -564,8 +563,8 @@ rule bam_index: bai="{prefix}.bam.bai", threads: 1 resources: - mem_mb=gb(2), - runtime=autobump(hours=2), + mem="2g", + runtime="2h", shell: "samtools index {input} {output}" @@ -580,9 +579,9 @@ rule markduplicates: "data/rnaseq_samples/{sample}/{sample}.cutadapt.markdups.bam.log", threads: 1 resources: - mem_mb=gb(32), - runtime=autobump(hours=2), - disk_mb=autobump(gb=100), + mem="32g", + runtime="2h", + disk="100g", params: java_args="-Xmx20g", # [disable for test] # java_args='-Xmx2g' # [enable for test] @@ -607,8 +606,8 @@ rule featurecounts: "data/rnaseq_samples/{sample}/{sample}_featurecounts.txt.log", threads: 8 resources: - mem_mb=gb(16), - runtime=autobump(hours=2), + mem="16g", + runtime="2h", params: strand_arg={ "unstranded": "-s0 ", @@ 
-643,8 +642,8 @@ rule aggregate_featurecounts: "data/rnaseq_aggregation/featurecounts.txt.log", threads: 1 resources: - mem_mb=gb(8), - runtime=autobump(hours=1), + mem="8g", + runtime="1h" run: for i, file in enumerate(input): df = pd.read_csv(file, sep="\t", comment="#") @@ -674,8 +673,8 @@ rule rrna_libsizes_table: rrna_pattern=lambda wc: "data/rnaseq_samples/{sample}/rRNA/{sample}.cutadapt.rrna.bam.libsize", fastq_pattern=lambda wc: "data/rnaseq_samples/{sample}/{sample}_R1.cutadapt.fastq.gz.libsize", resources: - mem_mb=gb(2), - runtime=autobump(hours=2), + mem="2g", + runtime="2h", script: "../../scripts/rrna_libsizes_table.py" @@ -690,8 +689,8 @@ rule collectrnaseqmetrics: "data/rnaseq_samples/{sample}/{sample}.collectrnaseqmetrics.metrics.log", threads: 1 resources: - mem_mb=gb(32), - runtime=autobump(hours=2), + mem="32g", + runtime="2h", params: java_args="-Xmx20g", # [disable for test] # java_args='-Xmx2g', # [enable for test] @@ -723,8 +722,8 @@ rule preseq: "data/rnaseq_samples/{sample}/{sample}_preseq_c_curve.txt.log", threads: 1 resources: - mem_mb=gb(1), - runtime=autobump(hours=2), + mem="1g", + runtime="2h", shell: "preseq " "c_curve " @@ -743,8 +742,8 @@ rule salmon: "data/rnaseq_samples/{sample}/{sample}.salmon/quant.sf.log", threads: 6 resources: - mem_mb=gb(32), - runtime=autobump(hours=2), + mem="32g", + runtime="2h", params: extra=( "--libType=A " @@ -780,8 +779,8 @@ rule kallisto: "data/rnaseq_samples/{sample}/{sample}.kallisto/abundance.h5.log", threads: 8 resources: - mem_mb=gb(32), - runtime=autobump(hours=2), + mem="32g", + runtime="2h", params: strand_arg={ "unstranded": "", @@ -818,8 +817,8 @@ rule rseqc_infer_experiment: log: "data/rnaseq_samples/{sample}/rseqc/{sample}_infer_experiment.txt.log", resources: - mem_mb=gb(2), - runtime=autobump(hours=2), + mem="2g", + runtime="2h", shell: "infer_experiment.py -r {input.bed12} -i {input.bam} > {output} &> {log}" @@ -833,8 +832,8 @@ rule rseqc_read_distribution: log: 
"data/rnaseq_samples/{sample}/rseqc/{sample}_read_distribution.txt.log", resources: - mem_mb=gb(2), - runtime=autobump(hours=2), + mem="2g", + runtime="2h", shell: "read_distribution.py -i {input.bam} -r {input.bed12} > {output} &> {log}" @@ -848,8 +847,8 @@ rule samtools_idxstats: log: "data/rnaseq_samples/{sample}/idxstat_{sample}.txt.log", resources: - mem_mb=gb(16), - runtime=autobump(hours=2), + mem="16g", + runtime="2h", shell: "samtools idxstats {input.bam} 2> {log} 1> {output}" @@ -862,6 +861,9 @@ rule samtools_flagstat: "data/rnaseq_samples/{sample}/{sample}.cutadapt.markdups.bam.flagstat", log: "data/rnaseq_samples/{sample}/{sample}.cutadapt.markdups.bam.flagstat.log", + resources: + mem="8g", + runtime="2h", shell: "samtools flagstat {input.bam} > {output}" @@ -874,6 +876,9 @@ rule samtools_stats: "data/rnaseq_samples/{sample}/{sample}.cutadapt.markdups.bam.stats", log: "data/rnaseq_samples/{sample}/{sample}.cutadapt.markdups.bam.stats.log", + resources: + mem="8g", + runtime="2h", shell: "samtools stats {input.bam} > {output}" @@ -886,8 +891,8 @@ rule bigwig_neg: "data/rnaseq_samples/{sample}/{sample}.cutadapt.bam.neg.bigwig", threads: 8 resources: - mem_mb=gb(16), - runtime=autobump(hours=2), + mem="16g", + runtime="2h", log: "data/rnaseq_samples/{sample}/{sample}.cutadapt.bam.neg.bigwig.log", params: @@ -921,8 +926,8 @@ rule bigwig_pos: "data/rnaseq_samples/{sample}/{sample}.cutadapt.bam.pos.bigwig", threads: 8 resources: - mem_mb=gb(16), - runtime=autobump(hours=2), + mem="16g", + runtime="2h", log: "data/rnaseq_samples/{sample}/{sample}.cutadapt.bam.pos.bigwig.log", params: @@ -977,9 +982,9 @@ rule multiqc: "data/rnaseq_aggregation/multiqc.log", threads: 1 resources: - mem_mb=gb(2), - runtime=autobump(hours=2), - disk_mb=gb(10), + mem="2g", + runtime="2h", + disk="10g", run: analysis_directory = set([os.path.dirname(i) for i in input]) outdir = os.path.dirname(output[0]) From 650e60ff39eb94042aee8a3779954c260f67a292 Mon Sep 17 00:00:00 2001 From: 
Ryan Dale Date: Sun, 19 Jan 2025 13:12:59 -0500 Subject: [PATCH 069/196] convert to mem and disk in references --- workflows/references/Snakefile | 56 +++++++++++++++++++++------------- 1 file changed, 34 insertions(+), 22 deletions(-) diff --git a/workflows/references/Snakefile b/workflows/references/Snakefile index d6dcf759..682f1bfe 100644 --- a/workflows/references/Snakefile +++ b/workflows/references/Snakefile @@ -3,7 +3,6 @@ import sys import pandas sys.path.insert(0, os.path.dirname(workflow.snakefile) + "/../..") -from lib.utils import autobump, gb, hours from lib import utils REFERENCES = config.get("reference_dir", "../../references") @@ -18,6 +17,9 @@ rule fasta: temporary(f"{REFERENCES}/genome.fa.gz"), log: f"{REFERENCES}/logs/genome.fa.gz.log", + resources: + mem_mb="4g", + runtime="2h", run: utils.download_and_postprocess( urls=config["fasta"]["url"], @@ -32,6 +34,9 @@ rule gtf: temporary(f"{REFERENCES}/annotation.gtf.gz"), log: f"{REFERENCES}/logs/annotation.gtf.gz.log", + resources: + mem="4g", + runtime="2h", run: utils.download_and_postprocess( urls=config["gtf"]["url"], @@ -46,6 +51,9 @@ rule rrna: temporary(f"{REFERENCES}/rrna.fa.gz"), log: f"{REFERENCES}/logs/rrna.fa.gz.log", + resources: + mem="4g", + runtime="2h", run: utils.download_and_postprocess( urls=config["rrna"]["url"], @@ -60,6 +68,9 @@ rule unzip: f"{REFERENCES}/{{prefix}}.gz", output: f"{REFERENCES}/{{prefix}}", + resources: + mem="4g", + runtime="2h", shell: "gunzip -c {input} > {output}" @@ -81,9 +92,9 @@ rule bowtie2_index: log: f"{REFERENCES}/logs/bowtie2_{{label}}.log", resources: - runtime=autobump(hours=8), - mem_mb=autobump(gb=32), - disk_mb=autobump(gb=50), + mem="32g", + disk="50g", + runtime="8h", threads: 8 run: index = os.path.commonprefix(output).rstrip(".") @@ -101,8 +112,8 @@ rule star_index: f"{REFERENCES}/logs/star.log", threads: 8 resources: - runtime=autobump(hours=8), - mem_mb=gb(64), + mem="64g", + runtime="8h", run: genomedir = os.path.dirname(output[0]) 
shell("rm -r {genomedir}") @@ -148,9 +159,9 @@ rule hisat2_index: log: f"{REFERENCES}/logs/hisat2.log", resources: - runtime=autobump(hours=8), - mem_mb=autobump(gb=32), - disk_mb=autobump(gb=50), + mem="32g", + disk="50g", + runtime="8h", threads: 8 run: index = os.path.commonprefix(output).rstrip(".") @@ -165,7 +176,8 @@ rule transcriptome_fasta: output: f"{REFERENCES}/transcriptome.fa", resources: - runtime=hours(1), + mem="4g", + runtime="2h", shell: "gffread {input.gtf} -w {output} -g {input.fasta}" @@ -180,8 +192,8 @@ rule salmon_index: params: outdir=f"{REFERENCES}/salmon", resources: - mem_mb=gb(32), - runtime=hours(2), + mem="32g", + runtime="2h", run: outdir = os.path.dirname(output[0]) shell("salmon index " "--transcripts {input} " "--index {outdir} " "&> {log}") @@ -195,8 +207,8 @@ rule kallisto_index: log: f"{REFERENCES}/logs/kallisto.log", resources: - runtime=hours(2), - mem_mb=gb(32), + mem="32g", + runtime="2h", shell: "kallisto index " "--index {output} " @@ -212,8 +224,8 @@ rule conversion_refflat: log: f"{REFERENCES}/logs/annotation.refflat.log", resources: - runtime=hours(2), - mem_mb=gb(2), + mem="2g", + runtime="2h", shell: "gtfToGenePred -ignoreGroupsWithoutExons {input} {output}.tmp " """&& awk '{{print $1"\t"$0}}' {output}.tmp > {output} """ @@ -226,8 +238,8 @@ rule conversion_bed12: output: f"{REFERENCES}/annotation.bed12", resources: - runtime=hours(2), - mem_mb=gb(2), + mem="2g", + runtime="2h", shell: "gtfToGenePred -ignoreGroupsWithoutExons {input} {output}.tmp " "&& genePredToBed {output}.tmp {output} " @@ -247,8 +259,8 @@ rule chromsizes: java_args="-Xmx20g", # java_args='-Xmx2g' # [TEST SETTINGS -1] resources: - mem_mb=gb(24), - runtime=hours(2), + mem="24g", + runtime="2h", shell: "export LC_COLLATE=C; " "rm -f {output}.tmp " @@ -275,8 +287,8 @@ rule mappings: output[0] ].get("include_featuretypes", []), resources: - runtime=hours(2), - mem_mb=gb(2), + mem="2g", + runtime="2h", run: import gffutils From 
d5db4a56c864242d7d2cb698a479e0edea4becf7 Mon Sep 17 00:00:00 2001 From: Ryan Dale Date: Sun, 19 Jan 2025 13:35:52 -0500 Subject: [PATCH 070/196] spell out params fully in wrapper --- include/WRAPPER_SLURM | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/include/WRAPPER_SLURM b/include/WRAPPER_SLURM index b2a2ffd4..2c92f7ae 100755 --- a/include/WRAPPER_SLURM +++ b/include/WRAPPER_SLURM @@ -26,13 +26,11 @@ fi # Run snakemake ( time snakemake \ - -p \ + --printshellcmds \ --directory $PWD \ - -k \ - --restart-times 3 \ + --keep-going \ --rerun-incomplete \ --jobname "s.{rulename}.{jobid}.sh" \ - -j 999 \ --use-conda \ --configfile config/config.yaml \ $PROFILE_CMD \ From b3a7d94f76b38beca89f6ee4f724d434b5ed1937 Mon Sep 17 00:00:00 2001 From: Ryan Dale Date: Sun, 19 Jan 2025 13:36:05 -0500 Subject: [PATCH 071/196] timestamped log file for slurm wrapper --- include/WRAPPER_SLURM | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/include/WRAPPER_SLURM b/include/WRAPPER_SLURM index 2c92f7ae..9f7f1344 100755 --- a/include/WRAPPER_SLURM +++ b/include/WRAPPER_SLURM @@ -19,11 +19,13 @@ if [ -z "$LCDBWF_SNAKEMAKE_PROFILE" ]; then PROFILE_CMD="--profile $SNAKEMAKE_PROFILE" fi else -# LCDBWF_SNAKEMAKE_PROFILE found, this takes priority if both profile variables are set +# LCDBWF_SNAKEMAKE_PROFILE takes priority if both profile variables are set PROFILE_CMD="--profile $LCDBWF_SNAKEMAKE_PROFILE" fi -# Run snakemake +# Timestamped log file +LOGFILE="Snakefile_$(date +"%Y-%m-%d_%H%M").log" + ( time snakemake \ --printshellcmds \ @@ -35,7 +37,7 @@ fi --configfile config/config.yaml \ $PROFILE_CMD \ "$@" - ) > "Snakefile.log" 2>&1 + ) > "$LOGFILE" 2>&1 SNAKE_PID=$! 
From aa437be9adcd8344a2e5911ea1cbcf274f98be96 Mon Sep 17 00:00:00 2001 From: Ryan Dale Date: Mon, 20 Jan 2025 10:15:29 -0500 Subject: [PATCH 072/196] rm wrappers --- wrappers/.gitignore | 5 - wrappers/LICENSE | 21 -- wrappers/README.md | 1 - wrappers/test/conftest.py | 10 - wrappers/test/raw_data_fixtures.py | 180 ------------ wrappers/test/test_atropos.py | 156 ----------- wrappers/test/test_bowtie2.py | 95 ------- wrappers/test/test_cutadapt.py | 151 ----------- wrappers/test/test_deeptools.py | 37 --- wrappers/test/test_demo.py | 159 ----------- wrappers/test/test_dupradar.py | 49 ---- wrappers/test/test_fastq_screen.py | 36 --- wrappers/test/test_fastqc.py | 70 ----- wrappers/test/test_featurecounts.py | 59 ---- wrappers/test/test_hisat2.py | 120 -------- wrappers/test/test_kallisto.py | 69 ----- wrappers/test/test_multiqc.py | 48 ---- wrappers/test/test_picard.py | 116 -------- wrappers/test/test_rseqc.py | 151 ----------- wrappers/test/test_salmon.py | 83 ------ wrappers/test/test_samtools.py | 12 - wrappers/test/utils.py | 152 ----------- wrappers/test_toy.py | 100 ------- wrappers/wrappers/atropos/README.md | 167 ------------ wrappers/wrappers/atropos/environment.yaml | 4 - wrappers/wrappers/atropos/wrapper.py | 80 ------ wrappers/wrappers/average-bigwigs/README.md | 75 ----- .../wrappers/average-bigwigs/environment.yaml | 5 - wrappers/wrappers/average-bigwigs/wrapper.py | 32 --- .../wrappers/combos/merge_and_dedup/README.md | 66 ----- .../combos/merge_and_dedup/environment.yaml | 7 - wrappers/wrappers/demo/README.md | 69 ----- wrappers/wrappers/demo/environment.yaml | 4 - wrappers/wrappers/demo/wrapper.py | 27 -- wrappers/wrappers/dupradar/README.md | 83 ------ wrappers/wrappers/dupradar/environment.yaml | 10 - wrappers/wrappers/dupradar/wrapper.py | 94 ------- wrappers/wrappers/epic2/environment.yaml | 8 - wrappers/wrappers/fastq-dump/environment.yaml | 5 - wrappers/wrappers/fastq-dump/wrapper.py | 41 --- wrappers/wrappers/fastq_screen/README.md | 61 
----- .../wrappers/fastq_screen/environment.yaml | 7 - wrappers/wrappers/fastq_screen/wrapper.py | 72 ----- wrappers/wrappers/fastqc/README.md | 32 --- wrappers/wrappers/fastqc/environment.yaml | 9 - wrappers/wrappers/fastqc/wrapper.py | 48 ---- wrappers/wrappers/macs2/callpeak/README.md | 61 ----- .../wrappers/macs2/callpeak/environment.yaml | 8 - wrappers/wrappers/sicer/README.md | 59 ---- wrappers/wrappers/sicer/environment.yaml | 10 - wrappers/wrappers/sicer/wrapper.py | 147 ---------- wrappers/wrappers/spp/README.md | 175 ------------ wrappers/wrappers/spp/environment.yaml | 11 - wrappers/wrappers/spp/wrapper.py | 256 ------------------ 54 files changed, 3613 deletions(-) delete mode 100644 wrappers/.gitignore delete mode 100644 wrappers/LICENSE delete mode 100644 wrappers/README.md delete mode 100644 wrappers/test/conftest.py delete mode 100644 wrappers/test/raw_data_fixtures.py delete mode 100644 wrappers/test/test_atropos.py delete mode 100644 wrappers/test/test_bowtie2.py delete mode 100644 wrappers/test/test_cutadapt.py delete mode 100644 wrappers/test/test_deeptools.py delete mode 100644 wrappers/test/test_demo.py delete mode 100644 wrappers/test/test_dupradar.py delete mode 100644 wrappers/test/test_fastq_screen.py delete mode 100644 wrappers/test/test_fastqc.py delete mode 100644 wrappers/test/test_featurecounts.py delete mode 100644 wrappers/test/test_hisat2.py delete mode 100644 wrappers/test/test_kallisto.py delete mode 100644 wrappers/test/test_multiqc.py delete mode 100644 wrappers/test/test_picard.py delete mode 100644 wrappers/test/test_rseqc.py delete mode 100644 wrappers/test/test_salmon.py delete mode 100644 wrappers/test/test_samtools.py delete mode 100644 wrappers/test/utils.py delete mode 100644 wrappers/test_toy.py delete mode 100644 wrappers/wrappers/atropos/README.md delete mode 100644 wrappers/wrappers/atropos/environment.yaml delete mode 100644 wrappers/wrappers/atropos/wrapper.py delete mode 100644 
wrappers/wrappers/average-bigwigs/README.md delete mode 100644 wrappers/wrappers/average-bigwigs/environment.yaml delete mode 100644 wrappers/wrappers/average-bigwigs/wrapper.py delete mode 100644 wrappers/wrappers/combos/merge_and_dedup/README.md delete mode 100644 wrappers/wrappers/combos/merge_and_dedup/environment.yaml delete mode 100644 wrappers/wrappers/demo/README.md delete mode 100644 wrappers/wrappers/demo/environment.yaml delete mode 100644 wrappers/wrappers/demo/wrapper.py delete mode 100644 wrappers/wrappers/dupradar/README.md delete mode 100644 wrappers/wrappers/dupradar/environment.yaml delete mode 100644 wrappers/wrappers/dupradar/wrapper.py delete mode 100644 wrappers/wrappers/epic2/environment.yaml delete mode 100644 wrappers/wrappers/fastq-dump/environment.yaml delete mode 100644 wrappers/wrappers/fastq-dump/wrapper.py delete mode 100644 wrappers/wrappers/fastq_screen/README.md delete mode 100644 wrappers/wrappers/fastq_screen/environment.yaml delete mode 100644 wrappers/wrappers/fastq_screen/wrapper.py delete mode 100644 wrappers/wrappers/fastqc/README.md delete mode 100644 wrappers/wrappers/fastqc/environment.yaml delete mode 100644 wrappers/wrappers/fastqc/wrapper.py delete mode 100644 wrappers/wrappers/macs2/callpeak/README.md delete mode 100644 wrappers/wrappers/macs2/callpeak/environment.yaml delete mode 100644 wrappers/wrappers/sicer/README.md delete mode 100644 wrappers/wrappers/sicer/environment.yaml delete mode 100644 wrappers/wrappers/sicer/wrapper.py delete mode 100644 wrappers/wrappers/spp/README.md delete mode 100644 wrappers/wrappers/spp/environment.yaml delete mode 100644 wrappers/wrappers/spp/wrapper.py diff --git a/wrappers/.gitignore b/wrappers/.gitignore deleted file mode 100644 index ede3cdda..00000000 --- a/wrappers/.gitignore +++ /dev/null @@ -1,5 +0,0 @@ -.test* -__pycache__ -.snakemake -.cache -**.snakemake* diff --git a/wrappers/LICENSE b/wrappers/LICENSE deleted file mode 100644 index 17b3ab77..00000000 --- 
a/wrappers/LICENSE +++ /dev/null @@ -1,21 +0,0 @@ -MIT License - -Copyright (c) 2016 lcdb - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. 
diff --git a/wrappers/README.md b/wrappers/README.md deleted file mode 100644 index 79d134e9..00000000 --- a/wrappers/README.md +++ /dev/null @@ -1 +0,0 @@ -See documentation at http://lcdb-wf.readthedocs.io/en/latest/wrappers.html diff --git a/wrappers/test/conftest.py b/wrappers/test/conftest.py deleted file mode 100644 index d346905e..00000000 --- a/wrappers/test/conftest.py +++ /dev/null @@ -1,10 +0,0 @@ -import os -import pytest -import tempfile -import shutil -import inspect -from snakemake.shell import shell -from snakemake.utils import makedirs -from lcdblib.snakemake import aligners - -from raw_data_fixtures import * diff --git a/wrappers/test/raw_data_fixtures.py b/wrappers/test/raw_data_fixtures.py deleted file mode 100644 index c19f8601..00000000 --- a/wrappers/test/raw_data_fixtures.py +++ /dev/null @@ -1,180 +0,0 @@ -""" -Fixtures used for downloading data from the test data repo -""" - -import os -import pytest -from utils import tmpdir_for_func, _download_file, symlink_in_tempdir, run, dpath - -# ---------------------------------------------------------------------------- -# FASTQ files -@pytest.fixture(scope='session') -def sample1_se_fq(tmpdir_factory): - d = tmpdir_for_func(tmpdir_factory) - fn = 'rnaseq_samples/sample1/sample1.small_R1.fastq.gz' - return _download_file(fn, d) - -@pytest.fixture(scope='session') -def sample1_se_tiny_fq(tmpdir_factory): - """ - Single-end FASTQ file with 1010 reads - """ - d = tmpdir_for_func(tmpdir_factory) - fn = 'rnaseq_samples/sample1/sample1.tiny_R1.fastq.gz' - return _download_file(fn, d) - -@pytest.fixture(scope='session') -def sample1_pe_fq(tmpdir_factory): - pair = [] - d = tmpdir_for_func(tmpdir_factory) - for fn in [ - 'rnaseq_samples/sample1/sample1.small_R1.fastq.gz', - 'rnaseq_samples/sample1/sample1.small_R2.fastq.gz' - ]: - pair.append(_download_file(fn, d)) - return pair - -@pytest.fixture(scope='session') -def sample1_pe_tiny_fq(tmpdir_factory): - pair = [] - d = tmpdir_for_func(tmpdir_factory) - 
for fn in [ - 'rnaseq_samples/sample1/sample1.tiny_R1.fastq.gz', - 'rnaseq_samples/sample1/sample1.tiny_R2.fastq.gz' - ]: - pair.append(_download_file(fn, d)) - return pair - -# ---------------------------------------------------------------------------- -# BAM files - -@pytest.fixture(scope='session') -def sample1_se_bam(tmpdir_factory): - d = tmpdir_for_func(tmpdir_factory) - fn = 'rnaseq_samples/sample1/sample1.small.single.sorted.bam' - return _download_file(fn, d) - - -@pytest.fixture(scope='session') -def sample1_pe_bam(tmpdir_factory): - d = tmpdir_for_func(tmpdir_factory) - fn = 'rnaseq_samples/sample1/sample1.small.paired.sorted.bam' - return _download_file(fn, d) - - -@pytest.fixture(scope='session') -def sample1_se_tiny_bam(tmpdir_factory): - d = tmpdir_for_func(tmpdir_factory) - fn = 'rnaseq_samples/sample1/sample1.tiny.single.sorted.bam' - return _download_file(fn, d) - - -@pytest.fixture(scope='session') -def sample1_pe_tiny_bam(tmpdir_factory): - d = tmpdir_for_func(tmpdir_factory) - fn = 'rnaseq_samples/sample1/sample1.tiny.paired.sorted.bam' - return _download_file(fn, d) - - -@pytest.fixture(scope='session') -def sample1_se_bam_bai(sample1_se_bam, tmpdir_factory): - """ - Returns both the bam and the bam.bai - """ - snakefile = ''' - rule index: - input: bam='sample1.sorted.bam' - output: bai='sample1.sorted.bam.bai' - wrapper: 'file:wrapper' - ''' - input_data_func = symlink_in_tempdir( - { - sample1_se_bam: 'sample1.sorted.bam' - - } - ) - tmpdir = str(tmpdir_factory.mktemp('sample1_se_bam_bai')) - run(dpath('../wrappers/samtools/index'), snakefile, None, input_data_func, tmpdir) - return { - 'bam': os.path.join(tmpdir, 'sample1.sorted.bam'), - 'bai': os.path.join(tmpdir, 'sample1.sorted.bam.bai'), - } - - -@pytest.fixture(scope='session') -def sample1_se_tiny_bam_bai(sample1_se_tiny_bam, tmpdir_factory): - """ - Returns both the bam and the bam.bai - """ - snakefile = ''' - rule index: - input: bam='sample1.sorted.bam' - output: 
bai='sample1.sorted.bam.bai' - wrapper: 'file:wrapper' - ''' - input_data_func = symlink_in_tempdir( - { - sample1_se_tiny_bam: 'sample1.sorted.bam' - - } - ) - tmpdir = str(tmpdir_factory.mktemp('sample1_se_tiny_bam_bai')) - run(dpath('../wrappers/samtools/index'), snakefile, None, input_data_func, tmpdir) - return { - 'bam': os.path.join(tmpdir, 'sample1.sorted.bam'), - 'bai': os.path.join(tmpdir, 'sample1.sorted.bam.bai'), - } - -# ---------------------------------------------------------------------------- -# Annotations - -@pytest.fixture(scope='session') -def transcriptome(tmpdir_factory): - d = tmpdir_for_func(tmpdir_factory) - fn = 'seq/dm6.small.transcriptome.fa' - return _download_file(fn, d) - - -@pytest.fixture(scope='session') -def dm6_fa(tmpdir_factory): - fn = 'seq/dm6.small.fa' - d = tmpdir_for_func(tmpdir_factory) - return _download_file(fn, d) - - -@pytest.fixture(scope='session') -def annotation(tmpdir_factory): - fn = 'annotation/dm6.small.gtf' - d = tmpdir_for_func(tmpdir_factory) - return _download_file(fn, d) - - -@pytest.fixture(scope='session') -def annotation_refflat(tmpdir_factory): - fn = 'annotation/dm6.small.refflat' - d = tmpdir_for_func(tmpdir_factory) - return _download_file(fn, d) - - -@pytest.fixture(scope='session') -def annotation_db(annotation): - import gffutils - gffutils.create_db( - data=annotation, dbfn=annotation + '.db', - merge_strategy='merge', - id_spec={'transcript': ['transcript_id', 'transcript_symbol'], - 'gene': ['gene_id', 'gene_symbol']}, - gtf_transcript_key='transcript_id', - gtf_gene_key='gene_id') - return annotation + '.db' - - -@pytest.fixture(scope='session') -def annotation_bed12(annotation_db): - import gffutils - db = gffutils.FeatureDB(annotation_db) - bed12 = '.'.join(annotation_db.strip().split('.')[:-2]) + '.bed12' - with open(bed12, 'w') as handle: - for t in db.features_of_type('transcript'): - handle.write(db.bed12(t, name_field='transcript_id') + '\n') - return bed12 diff --git 
a/wrappers/test/test_atropos.py b/wrappers/test/test_atropos.py deleted file mode 100644 index f695202e..00000000 --- a/wrappers/test/test_atropos.py +++ /dev/null @@ -1,156 +0,0 @@ -import pytest -import os -import gzip -from utils import run, dpath, symlink_in_tempdir - - -def test_atropos_simple(sample1_se_tiny_fq, tmpdir): - snakefile = ''' - rule atropos: - input: - fastq='sample1_R1.fastq.gz' - output: - fastq='sample1_R1.trim.fastq.gz' - params: extra='-a AAA' - threads: 2 - wrapper: "file:wrapper" - ''' - input_data_func = symlink_in_tempdir( - { - sample1_se_tiny_fq: 'sample1_R1.fastq.gz' - } - ) - - def check(): - """ - check for line lengths and that they are at least different sized - """ - a = sum(1 for _ in gzip.open('sample1_R1.fastq.gz')) - b = sum(1 for _ in gzip.open('sample1_R1.trim.fastq.gz')) - assert a == b == 4040 - assert os.path.getsize('sample1_R1.fastq.gz') != os.path.getsize('sample1_R1.trim.fastq.gz') - - run(dpath('../wrappers/atropos'), snakefile, check, input_data_func, tmpdir, cores=2) - - -def test_atropos_simple_with_log(sample1_se_tiny_fq, tmpdir): - snakefile = ''' - rule atropos: - input: - fastq='sample1_R1.fastq.gz' - output: - fastq='sample1_R1.trim.fastq.gz' - params: extra='-a AAA' - threads: 2 - log: 'sample1.atropos.log' - wrapper: "file:wrapper" - ''' - input_data_func = symlink_in_tempdir( - { - sample1_se_tiny_fq: 'sample1_R1.fastq.gz' - } - ) - - def check(): - """ - check for line lengths and that they are at least different sized - """ - a = sum(1 for _ in gzip.open('sample1_R1.fastq.gz')) - b = sum(1 for _ in gzip.open('sample1_R1.trim.fastq.gz')) - assert a == b == 4040 - assert 'This is Atropos' in open('sample1.atropos.log').readline() - assert os.path.getsize('sample1_R1.fastq.gz') != os.path.getsize('sample1_R1.trim.fastq.gz') - - run(dpath('../wrappers/atropos'), snakefile, check, input_data_func, tmpdir, cores=2) - - -def test_atropos_se_with_list(sample1_se_tiny_fq, tmpdir): - snakefile = ''' - rule 
atropos: - input: 'sample1_R1.fastq.gz' - output: 'sample1_R1.trim.fastq.gz' - params: extra='-a AAA' - threads: 2 - wrapper: "file:wrapper" - ''' - input_data_func = symlink_in_tempdir( - { - sample1_se_tiny_fq: 'sample1_R1.fastq.gz' - } - ) - - def check(): - """ - check for line lengths and that they are at least different sized - """ - a = sum(1 for _ in gzip.open('sample1_R1.fastq.gz')) - b = sum(1 for _ in gzip.open('sample1_R1.trim.fastq.gz')) - assert a == b == 4040 - assert os.path.getsize('sample1_R1.fastq.gz') != os.path.getsize('sample1_R1.trim.fastq.gz') - - run(dpath('../wrappers/atropos'), snakefile, check, input_data_func, tmpdir, cores=2) - - -def test_atropos_pe(sample1_pe_tiny_fq, tmpdir): - snakefile = ''' - rule atropos: - input: - R1='sample1_R1.fastq.gz', - R2='sample1_R2.fastq.gz', - output: - R1='sample1_R1.trim.fastq.gz', - R2='sample2_R1.trim.fastq.gz', - params: extra='-a AAA' - threads: 2 - log: 'sample1.atropos.log' - wrapper: "file:wrapper" - ''' - input_data_func = symlink_in_tempdir( - { - sample1_pe_tiny_fq[0]: 'sample1_R1.fastq.gz', - sample1_pe_tiny_fq[1]: 'sample1_R2.fastq.gz', - } - ) - - def check(): - """ - check for line lengths and that they are at least different sized - """ - a = sum(1 for _ in gzip.open('sample1_R1.fastq.gz')) - b = sum(1 for _ in gzip.open('sample1_R1.trim.fastq.gz')) - assert a == b == 4040 - assert 'This is Atropos' in open('sample1.atropos.log').readline() - assert os.path.getsize('sample1_R1.fastq.gz') != os.path.getsize('sample1_R1.trim.fastq.gz') - - run(dpath('../wrappers/atropos'), snakefile, check, input_data_func, tmpdir, cores=2) - - -def test_atropos_pe_with_list(sample1_pe_tiny_fq, tmpdir): - - snakefile = ''' - rule atropos: - input: 'sample1_R1.fastq.gz', 'sample1_R2.fastq.gz', - output: 'sample1_R1.trim.fastq.gz', 'sample2_R1.trim.fastq.gz', - params: extra='-a AAA' - threads: 2 - log: 'sample1.atropos.log' - wrapper: "file:wrapper" - ''' - input_data_func = symlink_in_tempdir( - { - 
sample1_pe_tiny_fq[0]: 'sample1_R1.fastq.gz', - sample1_pe_tiny_fq[1]: 'sample1_R2.fastq.gz', - } - ) - - def check(): - """ - check for line lengths and that they are at least different sized - """ - a = sum(1 for _ in gzip.open('sample1_R1.fastq.gz')) - b = sum(1 for _ in gzip.open('sample1_R1.trim.fastq.gz')) - assert a == b == 4040 - assert 'This is Atropos' in open('sample1.atropos.log').readline() - assert os.path.getsize('sample1_R1.fastq.gz') != os.path.getsize('sample1_R1.trim.fastq.gz') - - run(dpath('../wrappers/atropos'), snakefile, check, input_data_func, tmpdir, cores=2) diff --git a/wrappers/test/test_bowtie2.py b/wrappers/test/test_bowtie2.py deleted file mode 100644 index 6ee9b76f..00000000 --- a/wrappers/test/test_bowtie2.py +++ /dev/null @@ -1,95 +0,0 @@ -import os -import pytest -from snakemake.shell import shell -from lcdblib.snakemake import aligners -from utils import run, dpath, symlink_in_tempdir, tmpdir_for_func - - -@pytest.fixture(scope='session') -def bowtie2_indexes(dm6_fa, tmpdir_factory): - d = tmpdir_for_func(tmpdir_factory) - snakefile = ''' - rule bowtie2: - input: fasta='dm6.fa' - output: index=['dm6.1.bt2', 'dm6.2.bt2'] - log: 'bowtie2.log' - wrapper: 'file:wrapper' - ''' - input_data_func = symlink_in_tempdir( - { - dm6_fa: 'dm6.fa' - } - ) - - def check(): - assert 'Total time for backward call to driver' in open('bowtie2.log').readlines()[-1] - assert list(shell('bowtie2-inspect dm6 -n', iterable=True)) == ['2L', '2R'] - - run( - dpath('../wrappers/bowtie2/build'), - snakefile, check, input_data_func, d) - return aligners.bowtie2_index_from_prefix(os.path.join(d, 'dm6')) - - -def _dict_of_bowtie2_indexes(bowtie2_indexes, prefix): - d = {} - indexes = aligners.bowtie2_index_from_prefix(prefix) - bowtie2_indexes = sorted(bowtie2_indexes) - indexes = sorted(indexes) - for k, v in zip(bowtie2_indexes, indexes): - d[k] = v - return d - - -def test_bowtie2_align_se(bowtie2_indexes, sample1_se_tiny_fq, tmpdir): - d = 
_dict_of_bowtie2_indexes(bowtie2_indexes, 'dm6') - indexes = list(d.values()) - snakefile = ''' - rule bowtie2_align: - input: - fastq='sample1_R1.fastq.gz', - index={indexes} - output: - bam='sample1.bam' - log: "bowtie2.log" - wrapper: "file:wrapper" - '''.format(indexes=indexes) - d[sample1_se_tiny_fq] = 'sample1_R1.fastq.gz' - input_data_func = symlink_in_tempdir(d) - - def check(): - assert "overall alignment rate" in open('bowtie2.log').read() - - # should have at least some mapped and unmapped - assert int(list(shell('samtools view -c -f 0x04 sample1.bam', iterable=True))[0]) > 0 - assert int(list(shell('samtools view -c -F 0x04 sample1.bam', iterable=True))[0]) > 0 - - run(dpath('../wrappers/bowtie2/align'), snakefile, check, input_data_func, tmpdir) - - -def test_bowtie2_align_se_rm_unmapped(bowtie2_indexes, sample1_se_tiny_fq, tmpdir): - d = _dict_of_bowtie2_indexes(bowtie2_indexes, 'dm6') - indexes = list(d.values()) - snakefile = ''' - rule bowtie2_align: - input: - fastq='sample1_R1.fastq.gz', - index={indexes} - output: - bam='sample1.bam' - params: - samtools_view_extra='-F 0x04' - log: "bowtie2.log" - wrapper: "file:wrapper" - '''.format(indexes=indexes) - d[sample1_se_tiny_fq] = 'sample1_R1.fastq.gz' - input_data_func = symlink_in_tempdir(d) - - def check(): - assert "overall alignment rate" in open('bowtie2.log').read() - - # should have at least some mapped and unmapped - assert int(list(shell('samtools view -c -f 0x04 sample1.bam', iterable=True))[0]) == 0 - assert int(list(shell('samtools view -c -F 0x04 sample1.bam', iterable=True))[0]) > 0 - - run(dpath('../wrappers/bowtie2/align'), snakefile, check, input_data_func, tmpdir) diff --git a/wrappers/test/test_cutadapt.py b/wrappers/test/test_cutadapt.py deleted file mode 100644 index 97f5c7f3..00000000 --- a/wrappers/test/test_cutadapt.py +++ /dev/null @@ -1,151 +0,0 @@ -import os -import gzip -from utils import run, dpath, rm, symlink_in_tempdir - -def test_cutadapt_simple(sample1_se_tiny_fq, 
tmpdir): - snakefile = ''' - rule cutadapt: - input: - fastq='sample1_R1.fastq.gz' - output: - fastq='sample1_R1.trim.fastq.gz' - params: extra='-a AAA' - wrapper: "file:wrapper" - ''' - input_data_func=symlink_in_tempdir( - { - sample1_se_tiny_fq: 'sample1_R1.fastq.gz' - } - ) - - def check(): - """ - check for line lengths and that they are at least different sized - """ - a = sum(1 for _ in gzip.open('sample1_R1.fastq.gz')) - b = sum(1 for _ in gzip.open('sample1_R1.trim.fastq.gz')) - assert a == b == 4040 - - assert os.path.getsize('sample1_R1.fastq.gz') != os.path.getsize('sample1_R1.trim.fastq.gz') - - run(dpath('../wrappers/cutadapt'), snakefile, check, input_data_func, tmpdir) - - -def test_cutadapt_simple_with_log(sample1_se_tiny_fq, tmpdir): - snakefile = ''' - rule cutadapt: - input: - fastq='sample1_R1.fastq.gz' - output: - fastq='sample1_R1.trim.fastq.gz' - params: extra='-a AAA' - log: 'sample1.cutadapt.log' - wrapper: "file:wrapper" - ''' - input_data_func=symlink_in_tempdir( - { - sample1_se_tiny_fq: 'sample1_R1.fastq.gz' - } - ) - - def check(): - """ - check for line lengths and that they are at least different sized - """ - a = sum(1 for _ in gzip.open('sample1_R1.fastq.gz')) - b = sum(1 for _ in gzip.open('sample1_R1.trim.fastq.gz')) - assert a == b == 4040 - assert 'This is cutadapt' in open('sample1.cutadapt.log').readline() - - assert os.path.getsize('sample1_R1.fastq.gz') != os.path.getsize('sample1_R1.trim.fastq.gz') - - run(dpath('../wrappers/cutadapt'), snakefile, check, input_data_func, tmpdir) - - -def test_cutadapt_se_with_list(sample1_se_tiny_fq, tmpdir): - snakefile = ''' - rule cutadapt: - input: 'sample1_R1.fastq.gz' - output: 'sample1_R1.trim.fastq.gz' - params: extra='-a AAA' - wrapper: "file:wrapper" - ''' - input_data_func=symlink_in_tempdir( - { - sample1_se_tiny_fq: 'sample1_R1.fastq.gz' - } - ) - - def check(): - """ - check for line lengths and that they are at least different sized - """ - a = sum(1 for _ in 
gzip.open('sample1_R1.fastq.gz')) - b = sum(1 for _ in gzip.open('sample1_R1.trim.fastq.gz')) - assert a == b == 4040 - - assert os.path.getsize('sample1_R1.fastq.gz') != os.path.getsize('sample1_R1.trim.fastq.gz') - - run(dpath('../wrappers/cutadapt'), snakefile, check, input_data_func, tmpdir) - -def test_cutadapt_pe(sample1_pe_tiny_fq, tmpdir): - snakefile = ''' - rule cutadapt: - input: - R1='sample1_R1.fastq.gz', - R2='sample1_R2.fastq.gz', - output: - R1='sample1_R1.trim.fastq.gz', - R2='sample2_R1.trim.fastq.gz', - params: extra='-a AAA' - log: 'sample1.cutadapt.log' - wrapper: "file:wrapper" - ''' - input_data_func=symlink_in_tempdir( - { - sample1_pe_tiny_fq[0]: 'sample1_R1.fastq.gz', - sample1_pe_tiny_fq[1]: 'sample1_R2.fastq.gz', - } - ) - - def check(): - """ - check for line lengths and that they are at least different sized - """ - a = sum(1 for _ in gzip.open('sample1_R1.fastq.gz')) - b = sum(1 for _ in gzip.open('sample1_R1.trim.fastq.gz')) - assert a == b == 4040 - assert 'This is cutadapt' in open('sample1.cutadapt.log').readline() - - assert os.path.getsize('sample1_R1.fastq.gz') != os.path.getsize('sample1_R1.trim.fastq.gz') - - run(dpath('../wrappers/cutadapt'), snakefile, check, input_data_func, tmpdir) - -def test_cutadapt_pe_with_list(sample1_pe_tiny_fq, tmpdir): - snakefile = ''' - rule cutadapt: - input: 'sample1_R1.fastq.gz', 'sample1_R2.fastq.gz', - output: 'sample1_R1.trim.fastq.gz', 'sample2_R1.trim.fastq.gz', - params: extra='-a AAA' - log: 'sample1.cutadapt.log' - wrapper: "file:wrapper" - ''' - input_data_func=symlink_in_tempdir( - { - sample1_pe_tiny_fq[0]: 'sample1_R1.fastq.gz', - sample1_pe_tiny_fq[1]: 'sample1_R2.fastq.gz', - } - ) - - def check(): - """ - check for line lengths and that they are at least different sized - """ - a = sum(1 for _ in gzip.open('sample1_R1.fastq.gz')) - b = sum(1 for _ in gzip.open('sample1_R1.trim.fastq.gz')) - assert a == b == 4040 - assert 'This is cutadapt' in 
open('sample1.cutadapt.log').readline() - - assert os.path.getsize('sample1_R1.fastq.gz') != os.path.getsize('sample1_R1.trim.fastq.gz') - - run(dpath('../wrappers/cutadapt'), snakefile, check, input_data_func, tmpdir) diff --git a/wrappers/test/test_deeptools.py b/wrappers/test/test_deeptools.py deleted file mode 100644 index cbf87690..00000000 --- a/wrappers/test/test_deeptools.py +++ /dev/null @@ -1,37 +0,0 @@ -import os -import gzip -from utils import run, dpath, rm, symlink_in_tempdir -import pyBigWig - -def test_deeptools_bamCoverage(sample1_se_tiny_bam, sample1_se_tiny_bam_bai, tmpdir): - snakefile = ''' - rule deeptools: - input: - bam='sample1.bam', - bai='sample1.bam.bai' - output: 'sample1.bw', - log: 'deeptools.log' - wrapper: "file:wrapper" - ''' - input_data_func=symlink_in_tempdir( - { - sample1_se_tiny_bam: 'sample1.bam', - sample1_se_tiny_bam_bai['bai']: 'sample1.bam.bai', - } - ) - - def check(): - bw = pyBigWig.open('sample1.bw') - header_keys = list(bw.header().keys()) - for k in ['maxVal', 'minVal', 'nBasesCovered', 'nLevels', 'sumData', - 'sumSquared', 'version']: - assert k in header_keys - - # bigWig version should be independent of BAM input, so we can check - # the value - assert bw.header()['version'] == 4 - - first_chrom = list(bw.chroms().keys())[0] - assert isinstance(bw.stats(first_chrom)[0], float) - - run(dpath('../wrappers/deeptools/bamCoverage'), snakefile, check, input_data_func, tmpdir) diff --git a/wrappers/test/test_demo.py b/wrappers/test/test_demo.py deleted file mode 100644 index dd7be5ee..00000000 --- a/wrappers/test/test_demo.py +++ /dev/null @@ -1,159 +0,0 @@ -# This file demonstrates tests for the `demo` wrapper. It is heavily commented, -# and is included as part of the test suite to ensure that it's correct. - -# The `run` function does most of the work. It creates a tempdir, copies over -# input data, Snakefile, and wrapper, runs the Snakefile, and runs -# a user-provided test function against the output. 
-from utils import run - - -# The `dpath` function figures out the path the wrapper even when in a tempdir -from utils import dpath - -# `symlink_in_tempdir` is a decorator function that lets us easily map fixtures -# to input files expected by our Snakefile. The examples below will demonstrate -# how it works. -from utils import symlink_in_tempdir - - -# A note on fixtures -# ------------------ -# -# py.test implicitly does a `from conftest import *`, so we will have the -# fixtures from that package available here. -# -# Currently we have the fixtures from raw_data_fixtures.py imported into -# conftest.py, which in turn makes them available in this file. -# -# py.test also includes a built-in `tmpdir` fixture which we use here to have -# a nicely-named tmpdir for running the test. -# -# See http://doc.pytest.org/en/latest/fixture.html for more info. - - -# Our first test. The test function names must start with `test_` in order for -# py.test to find them. -def test_demo(sample1_se_tiny_fq, tmpdir): - - # A note on these arguments - # ------------------------- - # - # Test function arguments are expected to be fixtures. The fixture - # `sample1_se_tiny_fq` will be the path to the downloaded example data. See - # conftest.sample1_se_tiny_fq(). - # - # The fixture `tmpdir` (which comes built-in with py.test) will be - # a py.path.local object pointing to a tempdir created just for this test. - # It will match the glob /tmp/pytest-*, and only the last 3 tempdirs are - # retained. - - # Write the snakefile - # ------------------- - # First we write the Snakefile to use in testing. Inputs need to come from - # fixutres. Write whatever filename you'd like; we'll connect the fixture - # to the written filename below. - # - # `snakefile` is typically a triple-quoted string; it will be automatically - # run through textwrap.dedent later so you don't have to worry about - # indentation. 
- # - # The wrapper will be copied to a subdirectory of the temp dir called, - # appropriately enough, "wrapper". So your snakefile will generally end - # with the line `wrapper: "file:wrapper"`. - snakefile = ''' - rule demo: - input: 'a.fastq.gz' - output: 'b.fastq.gz' - wrapper: "file:wrapper" - ''' - - # Map fixtures to input files - # --------------------------- - # Next we map the fixture sample1_se_tiny_fq (a temp file which has downloaded - # data from the test data repo into a temp dir) to the input file that our - # Snakefile expects. - # - # Keys are paths to downloaded example data (typically downloaded just once - # per py.test session), which is provided by the fixture. The values of the - # dict are paths relative to the Snakefile and must match what is expected - # by the snakefile. - # - # Technically, `symlink_in_tempdir` returns a function that takes a path as - # its argument and symlinks keys over to values within that path. While - # this seems a little convoluted, doing it this way means that we don't - # have to keep track -- or even care -- what the fixture's provided - # filename is, avoiding the need to keep looking back at the fixtures - # module to remember what the filenames are. It keeps the input file setup - # logic tightly coupled to the Snakefile, since they're both defined in the - # same function. - # - # So: since the above snakefile expects a.fastq.gz as input, we need to - # make that happen, like this: - input_data_func=symlink_in_tempdir( - { - sample1_se_tiny_fq: 'a.fastq.gz' - } - ) - - # Write a test function - # --------------------- - # This is our test function. It will be called after the Snakefile has been - # run and it will be called in the same temp directory in which the - # Snakefile is run, so paths should be relative to the Snakefile. - # - # This function should not accept any arguments. - # - # In this case, the demo wrapper simply copies input to output, so here we - # assert the files are identical. 
- def check(): - assert open('a.fastq.gz', 'rb').read() == open('b.fastq.gz', 'rb').read() - - # Call `run()` - # ------------ - # Now that we have defined everything, the `run` function does all of the - # work. Note we pass the `tmpdir` fixture here. - # - # (that's because py.test manages tmpdirs for tests, which are in this - # current module, but run() lives in the utils module which won't get - # nicely managed. But run() needs to know where to build the test case, - # hence the need to pass it here) - run(dpath('../wrappers/demo'), snakefile, check, input_data_func, tmpdir) - - - -# This test function shows how to use downloaded paired-end data from -# a different fixture. -def test_demo_pe(sample1_pe_fq, tmpdir): - - # In contrast to the sample1_se_tiny_fq fixture used in the previous function, - # here the paired-end fixture `sample1_pe_fq` is a tuple of path names (see - # conftest.sample1_pe_fq()) - - - # The snakefile reflects what the wrapper expects for PE (see - # wrappers/demo/README.md). - snakefile = ''' - rule demo: - input: - R1='a1.fastq.gz', - R2='a2.fastq.gz' - output: - R1='b1.fastq.gz', - R2='b2.fastq.gz' - wrapper: "file:wrapper" - ''' - - # Map fixture to input files. Again, since this is paired-end we need to - # make sure both files are provided the right filename for testing. 
- input_data_func=symlink_in_tempdir( - { - sample1_pe_fq[0]: 'a1.fastq.gz', - sample1_pe_fq[1]: 'a2.fastq.gz', - } - ) - - def check(): - assert open('a1.fastq.gz', 'rb').read() == open('b1.fastq.gz', 'rb').read() - assert open('a2.fastq.gz', 'rb').read() == open('b2.fastq.gz', 'rb').read() - - run(dpath('../wrappers/demo'), snakefile, check, input_data_func, tmpdir) diff --git a/wrappers/test/test_dupradar.py b/wrappers/test/test_dupradar.py deleted file mode 100644 index 6122bd5c..00000000 --- a/wrappers/test/test_dupradar.py +++ /dev/null @@ -1,49 +0,0 @@ -import os -import pytest -from test_picard import sample1_se_bam_markdups -from utils import symlink_in_tempdir, run, dpath - - -@pytest.fixture(scope='session') -def sample1_se_dupradar(sample1_se_bam_markdups, annotation, tmpdir_factory): - snakefile = ''' - rule dupradar: - input: - bam='sample1.bam', - annotation='dm6.gtf' - output: - density_scatter='sample1.density_scatter.png', - expression_histogram='sample1.expression_histogram.png', - expression_barplot='sample1.expression_barplot.png', - expression_boxplot='sample1.expression_boxplot.png', - multimapping_histogram='sample1.multimapping_histogram.png', - dataframe='sample1.dupradar.tsv', - model='sample1.model.txt', - curve='sample1.curve.txt' - wrapper: - 'file:wrapper' - ''' - input_data_func = symlink_in_tempdir( - { - sample1_se_bam_markdups['bam']: 'sample1.bam', - annotation: 'dm6.gtf', - } - ) - tmpdir = str(tmpdir_factory.mktemp('dupradar_fixture')) - run(dpath('../wrappers/dupradar'), snakefile, None, input_data_func, tmpdir, use_conda=False) - mapping = dict( - density_scatter='sample1.density_scatter.png', - expression_histogram='sample1.expression_histogram.png', - expression_barplot='sample1.expression_barplot.png', - expression_boxplot='sample1.expression_boxplot.png', - multimapping_histogram='sample1.multimapping_histogram.png', - dataframe='sample1.dupradar.tsv', - ) - for k, v in mapping.items(): - mapping[k] = os.path.join(tmpdir, 
v) - return mapping - - -#@pytest.mark.xfail -def test_dupradar(sample1_se_dupradar): - assert open(sample1_se_dupradar['dataframe']).readline().startswith('"ID"\t"geneLength"') diff --git a/wrappers/test/test_fastq_screen.py b/wrappers/test/test_fastq_screen.py deleted file mode 100644 index 5cae9832..00000000 --- a/wrappers/test/test_fastq_screen.py +++ /dev/null @@ -1,36 +0,0 @@ -import os -import zipfile -from utils import run, dpath, rm, symlink_in_tempdir -from test_bowtie2 import bowtie2_indexes - -def test_fastq_screen(sample1_se_tiny_fq, bowtie2_indexes, tmpdir): - snakefile = ''' - rule fastq_screen: - input: - fastq='sample1_R1.fastq.gz', - dm6={indexes} - output: - txt='sample1_R1_screen.txt' - params: - subset=100000, - aligner='bowtie2' - wrapper: - "file:wrapper" - '''.format(indexes=bowtie2_indexes) - - input_data_func=symlink_in_tempdir( - { - sample1_se_tiny_fq: 'sample1_R1.fastq.gz' - } - ) - - def check(): - with open('sample1_R1_screen.txt') as fh: - res = fh.readlines() - r1 = res[0].strip().split() - r3 = res[2].strip().split() - assert r1[-1] == '100000' - assert r3[0] == 'dm6' - - - run(dpath('../wrappers/fastq_screen'), snakefile, check, input_data_func, tmpdir) diff --git a/wrappers/test/test_fastqc.py b/wrappers/test/test_fastqc.py deleted file mode 100644 index 5df5eda9..00000000 --- a/wrappers/test/test_fastqc.py +++ /dev/null @@ -1,70 +0,0 @@ -import os -import zipfile -from utils import run, dpath, rm, symlink_in_tempdir - -import pytest -from utils import tmpdir_for_func, _download_file - -@pytest.fixture(scope='session') -def fastqc(sample1_se_tiny_fq, tmpdir_factory): - snakefile = ''' - rule fastqc: - input: - fastq='sample1_R1.fastq.gz' - output: - html='sample1_R1_fastqc.html', - zip='sample1_R1_fastqc.zip' - wrapper: "file:wrapper"''' - input_data_func = symlink_in_tempdir( - { - sample1_se_tiny_fq: 'sample1_R1.fastq.gz' - } - ) - tmpdir = str(tmpdir_factory.mktemp('fastqc_fixture')) - run(dpath('../wrappers/fastqc'), 
snakefile, None, input_data_func, tmpdir) - return os.path.join(tmpdir, 'sample1_R1_fastqc.zip') - - -def test_fastqc(sample1_se_tiny_fq, tmpdir): - snakefile = ''' - rule fastqc: - input: - fastq='sample1_R1.fastq.gz' - output: - html='results/sample1_R1.html', - zip='sample1_R1.zip' - wrapper: "file:wrapper"''' - input_data_func=symlink_in_tempdir( - { - sample1_se_tiny_fq: 'sample1_R1.fastq.gz' - } - ) - - def check(): - assert '' in open('results/sample1_R1.html').readline() - contents = [ - 'sample1_R1_fastqc/', - 'sample1_R1_fastqc/Icons/', - 'sample1_R1_fastqc/Images/', - 'sample1_R1_fastqc/Icons/fastqc_icon.png', - 'sample1_R1_fastqc/Icons/warning.png', - 'sample1_R1_fastqc/Icons/error.png', - 'sample1_R1_fastqc/Icons/tick.png', - 'sample1_R1_fastqc/summary.txt', - 'sample1_R1_fastqc/Images/per_base_quality.png', - 'sample1_R1_fastqc/Images/per_tile_quality.png', - 'sample1_R1_fastqc/Images/per_sequence_quality.png', - 'sample1_R1_fastqc/Images/per_base_sequence_content.png', - 'sample1_R1_fastqc/Images/per_sequence_gc_content.png', - 'sample1_R1_fastqc/Images/per_base_n_content.png', - 'sample1_R1_fastqc/Images/sequence_length_distribution.png', - 'sample1_R1_fastqc/Images/duplication_levels.png', - 'sample1_R1_fastqc/Images/adapter_content.png', - 'sample1_R1_fastqc/fastqc_report.html', - 'sample1_R1_fastqc/fastqc_data.txt', - 'sample1_R1_fastqc/fastqc.fo' - ] - for i in zipfile.ZipFile('sample1_R1.zip').namelist(): - assert i in contents - - run(dpath('../wrappers/fastqc'), snakefile, check, input_data_func, tmpdir) diff --git a/wrappers/test/test_featurecounts.py b/wrappers/test/test_featurecounts.py deleted file mode 100644 index cb3760f3..00000000 --- a/wrappers/test/test_featurecounts.py +++ /dev/null @@ -1,59 +0,0 @@ -import os -import gzip -from utils import run, dpath, rm, symlink_in_tempdir - -def test_featurecounts_se(sample1_se_tiny_bam, annotation, tmpdir): - snakefile = ''' - rule featurecounts: - input: - annotation='dm6.gtf', - 
bam='sample1.bam' - output: - counts='sample1.counts', - log: 'featurecounts.log' - wrapper: "file:wrapper" - ''' - input_data_func=symlink_in_tempdir( - { - sample1_se_tiny_bam: 'sample1.bam', - annotation: 'dm6.gtf', - } - ) - - def check(): - assert '//===================' in open('featurecounts.log').read() - assert '# Program:featureCounts' in open('sample1.counts').readline() - assert open('sample1.counts.summary').readline().startswith('Status') - assert sum(1 for _ in open('sample1.counts')) == 169 - - run(dpath('../wrappers/featurecounts'), snakefile, check, input_data_func, tmpdir) - -def test_featurecounts_pe(sample1_pe_tiny_bam, annotation, tmpdir): - snakefile = ''' - rule featurecounts: - input: - annotation='dm6.gtf', - bam='sample1.bam' - output: - counts='sample1.counts', - log: 'featurecounts.log' - params: extra='-p -P -s 1 -B --splitOnly' - wrapper: "file:wrapper" - ''' - input_data_func=symlink_in_tempdir( - { - sample1_pe_tiny_bam: 'sample1.bam', - annotation: 'dm6.gtf', - } - ) - - def check(): - assert '//===================' in open('featurecounts.log').read() - assert '# Program:featureCounts' in open('sample1.counts').readline() - assert open('sample1.counts.summary').readline().startswith('Status') - assert sum(1 for _ in open('sample1.counts')) == 169 - - # TODO: maybe assert that below a certain level are counted when all - # those extra arguments are used? 
- - run(dpath('../wrappers/featurecounts'), snakefile, check, input_data_func, tmpdir) diff --git a/wrappers/test/test_hisat2.py b/wrappers/test/test_hisat2.py deleted file mode 100644 index add7abb0..00000000 --- a/wrappers/test/test_hisat2.py +++ /dev/null @@ -1,120 +0,0 @@ -import os -import pytest -from snakemake.shell import shell -from lcdblib.snakemake import aligners -from utils import run, dpath, symlink_in_tempdir, tmpdir_for_func - - -@pytest.fixture(scope='session') -def hisat2_indexes(dm6_fa, tmpdir_factory): - d = tmpdir_for_func(tmpdir_factory) - snakefile = ''' - rule hisat2: - input: fasta='2L.fa' - output: index=['2L.1.ht2', '2L.2.ht2'] - log: 'hisat.log' - wrapper: 'file:wrapper' - ''' - input_data_func = symlink_in_tempdir( - { - dm6_fa: '2L.fa' - } - ) - - def check(): - assert 'Total time for call to driver' in open('hisat.log').readlines()[-1] - assert list(shell('hisat2-inspect 2L -n', iterable=True)) == ['2L', '2R'] - - run( - dpath('../wrappers/hisat2/build'), - snakefile, check, input_data_func, d) - return aligners.hisat2_index_from_prefix(os.path.join(d, '2L')) - - -def _dict_of_hisat2_indexes(hisat2_indexes, prefix): - d = {} - indexes = aligners.hisat2_index_from_prefix(prefix) - hisat2_indexes = sorted(hisat2_indexes) - indexes = sorted(indexes) - for k, v in zip(hisat2_indexes, indexes): - d[k] = v - return d - - -def test_hisat2_align_se(hisat2_indexes, sample1_se_tiny_fq, tmpdir): - d = _dict_of_hisat2_indexes(hisat2_indexes, '2L') - indexes = list(d.values()) - snakefile = ''' - rule hisat2_align: - input: - fastq='sample1_R1.fastq.gz', - index={indexes} - output: - bam='sample1.bam' - log: "hisat2.log" - wrapper: "file:wrapper" - '''.format(indexes=indexes) - d[sample1_se_tiny_fq] = 'sample1_R1.fastq.gz' - input_data_func = symlink_in_tempdir(d) - - def check(): - assert "overall alignment rate" in open('hisat2.log').read() - - # should have at least some mapped and unmapped - assert int(list(shell('samtools view -c -f 0x04 
sample1.bam', iterable=True))[0]) > 0 - assert int(list(shell('samtools view -c -F 0x04 sample1.bam', iterable=True))[0]) > 0 - - run(dpath('../wrappers/hisat2/align'), snakefile, check, input_data_func, tmpdir) - - -def test_hisat2_align_se_SRA(hisat2_indexes, tmpdir): - d = _dict_of_hisat2_indexes(hisat2_indexes, '2L') - indexes = list(d.values()) - snakefile = ''' - rule hisat2_align: - input: - index={indexes} - output: - bam='sample1.bam' - params: hisat2_extra='--sra-acc SRR1990338' - log: "hisat2.log" - wrapper: "file:wrapper" - '''.format(indexes=indexes) - input_data_func = symlink_in_tempdir(d) - - def check(): - assert "overall alignment rate" in open('hisat2.log').read() - - # should have at least some mapped and unmapped - assert int(list(shell('samtools view -c -f 0x04 sample1.bam', iterable=True))[0]) > 0 - assert int(list(shell('samtools view -c -F 0x04 sample1.bam', iterable=True))[0]) > 0 - - run(dpath('../wrappers/hisat2/align'), snakefile, check, input_data_func, tmpdir) - - -def test_hisat2_align_se_rm_unmapped(hisat2_indexes, sample1_se_tiny_fq, tmpdir): - d = _dict_of_hisat2_indexes(hisat2_indexes, '2L') - indexes = list(d.values()) - snakefile = ''' - rule hisat2_align: - input: - fastq='sample1_R1.fastq.gz', - index={indexes} - output: - bam='sample1.bam' - params: - samtools_view_extra='-F 0x04' - log: "hisat2.log" - wrapper: "file:wrapper" - '''.format(indexes=indexes) - d[sample1_se_tiny_fq] = 'sample1_R1.fastq.gz' - input_data_func = symlink_in_tempdir(d) - - def check(): - assert "overall alignment rate" in open('hisat2.log').read() - - # should have at least some mapped and unmapped - assert int(list(shell('samtools view -c -f 0x04 sample1.bam', iterable=True))[0]) == 0 - assert int(list(shell('samtools view -c -F 0x04 sample1.bam', iterable=True))[0]) > 0 - - run(dpath('../wrappers/hisat2/align'), snakefile, check, input_data_func, tmpdir) diff --git a/wrappers/test/test_kallisto.py b/wrappers/test/test_kallisto.py deleted file mode 
100644 index 32e32e1b..00000000 --- a/wrappers/test/test_kallisto.py +++ /dev/null @@ -1,69 +0,0 @@ -import os -import json -import pytest -import pysam -from snakemake.shell import shell -from lcdblib.snakemake import aligners -from utils import run, dpath, rm, symlink_in_tempdir, tmpdir_for_func - - -@pytest.fixture(scope='session') -def kallisto_index(tmpdir_factory, transcriptome): - d = tmpdir_for_func(tmpdir_factory) - snakefile = ''' - rule kallisto: - input: fasta='transcriptome.fa' - output: index='transcriptome.idx' - log: 'log' - wrapper: 'file:wrapper' - ''' - input_data_func = symlink_in_tempdir( - { - transcriptome: 'transcriptome.fa', - } - ) - - def check(): - log = open('log').read() - assert '[build] target deBruijn graph' - - run( - dpath('../wrappers/kallisto/index'), - snakefile, check, input_data_func, d) - return os.path.join(d, 'transcriptome.idx') - - -def test_kallisto_quant(tmpdir, sample1_se_tiny_fq, kallisto_index): - snakefile = ''' - rule kallisto_quant: - input: - fastq='sample1.fq.gz', - index='out/transcriptome.idx' - - params: extra='--single --fragment-length=200 --sd=20' - output: - h5='quant/abundance.h5', - tsv='quant/abundance.tsv', - json='quant/run_info.json', - wrapper: 'file:wrapper' - ''' - input_data_func = symlink_in_tempdir( - { - sample1_se_tiny_fq: 'sample1.fq.gz', - kallisto_index: 'out/transcriptome.idx', - } - ) - - def check(): - assert sum(1 for _ in open('quant/abundance.tsv')) == 310 - assert open('quant/abundance.tsv').readline() == ( - 'target_id\tlength\teff_length\test_counts\ttpm\n') - keys = ['call', 'index_version', 'n_bootstraps', 'n_processed', 'n_targets', 'start_time'] - d = json.load(open('quant/run_info.json')) - for k in keys: - assert k in d - - - run( - dpath('../wrappers/kallisto/quant'), - snakefile, check, input_data_func, tmpdir) diff --git a/wrappers/test/test_multiqc.py b/wrappers/test/test_multiqc.py deleted file mode 100644 index 8f361807..00000000 --- a/wrappers/test/test_multiqc.py 
+++ /dev/null @@ -1,48 +0,0 @@ -import pytest -import os -import gzip -from utils import run, dpath, rm, symlink_in_tempdir -from test_fastqc import fastqc - - -def test_multiqc(fastqc, tmpdir): - snakefile = ''' - rule multiqc: - input: 'results/sample1_R1_fastqc.zip' - output: 'multiqc.html' - log: 'log' - params: - analysis_directory='results' - wrapper: 'file:wrapper' - ''' - input_data_func=symlink_in_tempdir( - { - fastqc: 'results/sample1_R1_fastqc.zip', - } - ) - - def check(): - assert '' in open('multiqc.html').readline() - - run(dpath('../wrappers/multiqc'), snakefile, check, input_data_func, tmpdir) - -def test_multiqc_other_dir(fastqc, tmpdir): - snakefile = ''' - rule multiqc: - input: 'results/sample1_R1_fastqc.zip' - output: 'reports/multiqc.html' - log: 'log' - params: - analysis_directory='results' - wrapper: 'file:wrapper' - ''' - input_data_func=symlink_in_tempdir( - { - fastqc: 'results/sample1_R1_fastqc.zip', - } - ) - - def check(): - assert '' in open('reports/multiqc.html').readline() - - run(dpath('../wrappers/multiqc'), snakefile, check, input_data_func, tmpdir) diff --git a/wrappers/test/test_picard.py b/wrappers/test/test_picard.py deleted file mode 100644 index 659d116b..00000000 --- a/wrappers/test/test_picard.py +++ /dev/null @@ -1,116 +0,0 @@ -import pytest -import os -import gzip -from utils import run, dpath, rm, symlink_in_tempdir - - -@pytest.fixture(scope='session') -def sample1_se_bam_markdups(sample1_se_bam, tmpdir_factory): - snakefile = ''' - rule markduplicates: - input: - bam='sample1.bam' - output: - bam='sample1.dupsmarked.bam', - metrics='sample1.dupmetrics.txt' - log: 'log' - wrapper: 'file:wrapper' - ''' - input_data_func = symlink_in_tempdir( - { - sample1_se_bam: 'sample1.bam', - } - ) - tmpdir = str(tmpdir_factory.mktemp('markduplicates_fixture')) - run(dpath('../wrappers/picard/markduplicates'), snakefile, None, input_data_func, tmpdir, use_conda=True) - return { - 'bam': os.path.join(tmpdir, 
'sample1.dupsmarked.bam'), - 'metrics': os.path.join(tmpdir, 'sample1.dupmetrics.txt') - } - - -def test_markduplicates_se(sample1_se_bam_markdups, tmpdir): - assert open(sample1_se_bam_markdups['metrics']).readline().startswith('##') - - -def test_picard_collectrnaseqmetrics_se(sample1_se_tiny_bam, annotation_refflat, tmpdir): - snakefile = ''' - rule collectrnaseqmetrics: - input: - bam='sample1.bam', - refflat='dm6.refflat', - output: - metrics='sample1.metrics' - log: 'log' - params: - extra="STRAND=NONE", - java_args='-Xmx512m' - wrapper: 'file:wrapper' - ''' - input_data_func=symlink_in_tempdir( - { - sample1_se_tiny_bam: 'sample1.bam', - annotation_refflat: 'dm6.refflat', - } - ) - - def check(): - assert '## METRICS CLASS' in open('sample1.metrics').read() - - run(dpath('../wrappers/picard/collectrnaseqmetrics'), snakefile, check, input_data_func, tmpdir, use_conda=True) - - -def test_picard_collectrnaseqmetrics_se_plot(sample1_se_tiny_bam, annotation_refflat, tmpdir): - snakefile = ''' - rule collectrnaseqmetrics: - input: - bam='sample1.bam', - refflat='dm6.refflat', - output: - metrics='sample1.metrics', - plot='sample1.pdf' - log: 'log' - params: extra="STRAND=NONE CHART=sample1.pdf" - wrapper: 'file:wrapper' - ''' - input_data_func=symlink_in_tempdir( - { - sample1_se_tiny_bam: 'sample1.bam', - annotation_refflat: 'dm6.refflat', - } - ) - - def check(): - assert '## METRICS CLASS' in open('sample1.metrics').read() - - run(dpath('../wrappers/picard/collectrnaseqmetrics'), snakefile, check, input_data_func, tmpdir, use_conda=True) - - -@pytest.mark.xfail -def test_picard_collectrnaseqmetrics_too_small_heap(sample1_se_tiny_bam, annotation_refflat, tmpdir): - # set the java vm heap size to 128 bytes which should fail. This tests to - # make sure the java args are making it through to the wrapper. 
- snakefile = ''' - rule collectrnaseqmetrics: - input: - bam='sample1.bam', - refflat='dm6.refflat', - output: - metrics='sample1.metrics' - log: 'log' - params: - extra="STRAND=NONE", - java_args='-Xmx128' - wrapper: 'file:wrapper' - ''' - input_data_func=symlink_in_tempdir( - { - sample1_se_tiny_bam: 'sample1.bam', - annotation_refflat: 'dm6.refflat', - } - ) - - def check(): - assert '## METRICS CLASS' in open('sample1.metrics').read() - - run(dpath('../wrappers/picard/collectrnaseqmetrics'), snakefile, check, input_data_func, tmpdir, use_conda=True) diff --git a/wrappers/test/test_rseqc.py b/wrappers/test/test_rseqc.py deleted file mode 100644 index d97ae919..00000000 --- a/wrappers/test/test_rseqc.py +++ /dev/null @@ -1,151 +0,0 @@ -import pytest -import os -import gzip -from utils import run, dpath, rm, symlink_in_tempdir -from textwrap import dedent - -def test_infer_experiment(sample1_se_tiny_bam, annotation_bed12, tmpdir): - snakefile = ''' - rule infer_experiment: - input: - bam='sample1_R1.bam', - bed='dm6.bed12' - output: - txt = 'sample1_R1.infer_experiment.txt' - wrapper: "file:wrapper" - ''' - input_data_func=symlink_in_tempdir( - { - sample1_se_tiny_bam: 'sample1_R1.bam', - annotation_bed12: 'dm6.bed12' - } - ) - - def check(): - """ - check for line lengths and that they are at least different sized - """ - expected = dedent("""\ - This is SingleEnd Data - Fraction of reads failed to determine: - Fraction of reads explained by "++,--": - Fraction of reads explained by "+-,-+":""").splitlines(False) - - with open('sample1_R1.infer_experiment.txt', 'r') as handle: - results = handle.read().strip() - for ex in expected: - assert ex in results - - run(dpath('../wrappers/rseqc/infer_experiment'), snakefile, check, input_data_func, tmpdir, use_conda=True) - - -def test_gB_cov(sample1_se_tiny_bam, sample1_se_tiny_bam_bai, annotation_bed12, tmpdir): - snakefile = ''' - rule geneBody_coverage: - input: - bam='sample1_R1.sort.bam', - 
bai='sample1_R1.sort.bam.bai', - bed='dm6.bed12' - output: txt='sample1_R1.geneBodyCoverage.txt', - r='sample1_R1.geneBodyCoverage.r', - img='sample1_R1.geneBodyCoverage.pdf', - wrapper: "file:wrapper" - ''' - input_data_func=symlink_in_tempdir( - { - sample1_se_tiny_bam: 'sample1_R1.sort.bam', - sample1_se_tiny_bam_bai['bai']: 'sample1_R1.sort.bam.bai', - annotation_bed12: 'dm6.bed12' - } - ) - - def check(): - """ - check for line lengths and that they are at least different sized - """ - - # R code - with open('sample1_R1.geneBodyCoverage.r', 'r') as handle: - result = handle.readline().split(' ')[0] - - assert result == 'sample1_R1.sort' - - # text - with open('sample1_R1.geneBodyCoverage.txt', 'r') as handle: - result = handle.readlines()[1].split('\t')[0] - - assert result == 'sample1_R1.sort' - - # PDF - assert os.path.exists('sample1_R1.geneBodyCoverage.pdf') - - run(dpath('../wrappers/rseqc/geneBody_coverage'), snakefile, check, input_data_func, tmpdir, use_conda=True) - - -def test_gB_cov_png(sample1_se_tiny_bam, sample1_se_tiny_bam_bai, annotation_bed12, tmpdir): - snakefile = ''' - rule geneBody_coverage: - input: - bam='sample1_R1.sort.bam', - bai='sample1_R1.sort.bam.bai', - bed='dm6.bed12' - output: - txt='sample1_R1.geneBodyCoverage.txt', - r='sample1_R1.geneBodyCoverage.r', - img='sample1_R1.geneBodyCoverage.png', - params: - extra: = '-f png' - wrapper: "file:wrapper" - ''' - input_data_func=symlink_in_tempdir( - { - sample1_se_tiny_bam: 'sample1_R1.sort.bam', - sample1_se_tiny_bam_bai['bai']: 'sample1_R1.sort.bam.bai', - annotation_bed12: 'dm6.bed12' - } - ) - - def check(): - """ Check that the PNG is created """ - assert os.path.exists('sample1_R1.geneBodyCoverage.png') - - -@pytest.mark.skip -def test_tin(sample1_se_tiny_bam, sample1_se_tiny_bam_bai, annotation_bed12, tmpdir): - snakefile = ''' - rule tin: - input: - bam='sample1_R1.sort.bam', - bai='sample1_R1.sort.bam.bai', - bed='dm6.bed12' - output: table='sample1_R1.tin.tsv', - 
summary='sample1_R1.tin.summary.txt' - wrapper: "file:wrapper" - ''' - input_data_func=symlink_in_tempdir( - { - sample1_se_tiny_bam: 'sample1_R1.sort.bam', - sample1_se_tiny_bam_bai['bai']: 'sample1_R1.sort.bam.bai', - annotation_bed12: 'dm6.bed12' - } - ) - - def check(): - """ - check for line lengths and that they are at least different sized - """ - - # R code - with open('sample1_R1.tin.tsv', 'r') as handle: - result = handle.readline().strip().split('\t') - - assert result == ['geneID', 'chrom', 'tx_start', 'tx_end', 'TIN'] - - # text - with open('sample1_R1.tin.summary.txt', 'r') as handle: - result = handle.readline().strip().split('\t') - - assert result == ['Bam_file', 'TIN(mean)', 'TIN(median)', 'TIN(stdev)'] - - run(dpath('../wrappers/rseqc/tin'), snakefile, check, input_data_func, tmpdir, use_conda=True) - diff --git a/wrappers/test/test_salmon.py b/wrappers/test/test_salmon.py deleted file mode 100644 index 2e3796fa..00000000 --- a/wrappers/test/test_salmon.py +++ /dev/null @@ -1,83 +0,0 @@ -import os -import pytest -from snakemake.shell import shell -from utils import run, dpath, rm, symlink_in_tempdir, tmpdir_for_func - - -@pytest.fixture(scope='session') -def salmon_index(tmpdir_factory, transcriptome): - d = tmpdir_for_func(tmpdir_factory) - snakefile = ''' - rule salmon: - input: fasta='transcriptome.fa' - output: hash='salmon_index/hash.bin' - log: 'log' - wrapper: 'file:wrapper' - ''' - input_data_func = symlink_in_tempdir( - { - transcriptome: 'transcriptome.fa', - } - ) - - def check(): - log = open('log').read() - assert '[info] done building index' in log - - run( - dpath('../wrappers/salmon/index'), - snakefile, check, input_data_func, d) - return os.path.join(d, 'salmon_index') - - -def test_salmon_quant(tmpdir, sample1_se_tiny_fq, salmon_index): - snakefile = ''' - rule salmon_quant: - input: - unmatedReads='sample1.fq.gz', - index=['idx/hash.bin', 'idx/sa.bin'] - output: 'sample1/salmon/quant.sf' - params: extra='--libType A' - log: 
'salmon.quant.log' - wrapper: 'file:wrapper' - ''' - input_data_func = symlink_in_tempdir( - { - sample1_se_tiny_fq: 'sample1.fq.gz', - salmon_index: 'idx', - } - ) - - def check(): - assert open('sample1/salmon/quant.sf').readline() == ( - 'Name\tLength\tEffectiveLength\tTPM\tNumReads\n') - - run( - dpath('../wrappers/salmon/quant'), - snakefile, check, input_data_func, tmpdir) - -def test_salmon_quant_single_index(tmpdir, sample1_se_tiny_fq, salmon_index): - snakefile = ''' - rule salmon_quant: - input: - unmatedReads='sample1.fq.gz', - index='idx/hash.bin' - output: 'sample1/salmon/quant.sf' - params: extra='--libType A' - log: 'salmon.quant.log' - wrapper: 'file:wrapper' - ''' - input_data_func = symlink_in_tempdir( - { - sample1_se_tiny_fq: 'sample1.fq.gz', - salmon_index: 'idx', - } - ) - - def check(): - assert open('sample1/salmon/quant.sf').readline() == ( - 'Name\tLength\tEffectiveLength\tTPM\tNumReads\n') - - run( - dpath('../wrappers/salmon/quant'), - snakefile, check, input_data_func, tmpdir) diff --git a/wrappers/test/test_samtools.py b/wrappers/test/test_samtools.py deleted file mode 100644 index 51ff105a..00000000 --- a/wrappers/test/test_samtools.py +++ /dev/null @@ -1,12 +0,0 @@ -import subprocess as sp -import pytest -from snakemake import shell - - -def test_samtools_sort_and_index(sample1_se_tiny_bam, sample1_se_tiny_bam_bai): - """ - This test is primarily a trigger for the fixtures. - """ - with pytest.raises(sp.CalledProcessError): - shell('samtools view {sample1_se_tiny_bam} 2L:1-100') - shell('samtools view {sample1_se_tiny_bam_bai[bam]} 2L:1-100') diff --git a/wrappers/test/utils.py b/wrappers/test/utils.py deleted file mode 100644 index 74dd396b..00000000 --- a/wrappers/test/utils.py +++ /dev/null @@ -1,152 +0,0 @@ -""" -Stripped-down version of Snakemake's test framework. 
-""" - -import sys -import os -from textwrap import dedent -import subprocess as sp -import tempfile -import hashlib -import urllib -import shutil -import shlex -import inspect - -import pytest -from snakemake import snakemake -from snakemake.shell import shell -from snakemake.utils import makedirs - - -SCRIPTPATH = shutil.which('snakemake') - -# test data url -URL = 'https://github.com/lcdb/lcdb-test-data/blob/add-chipseq/data/{}?raw=true' - - -def tmpdir_for_func(factory): - caller = inspect.stack()[1][3] - return str(factory.mktemp(caller)) - - -def _download_file(fn, d): - """ - Intended to be called from a pytest.fixture function. - - `fn` is a path to a file that is used to fill in `URL`. `d` is a tempdir - likely created by the calling function to which the file will be - downloaded. - - The path to the downloaded file is returned. - """ - url = URL.format(fn) - dest = os.path.join(d, fn) - makedirs(os.path.dirname(dest)) - basename = os.path.basename(fn) - shell('wget -q -O- {url} > {dest}') - return dest - - -def dpath(path): - "path relative to this file" - return os.path.realpath(os.path.join(os.path.dirname(__file__), path)) - - -def md5sum(filename): - data = open(filename, 'rb').read() - return hashlib.md5(data).hexdigest() - - -def run(path, snakefile, check=None, input_data_func=None, tmpdir=None, use_conda=False, **params): - """ - Parameters - ---------- - - path : str - Path to a wrapper directory. - - snakefile : str - Contents of a snakefile. `dedent()` will be run on it. - - check : callable or None - After running the snakefile on the input data, this function will be - called while inside the directory. This function is where the actual - tests (assertions etc) should be performed. - - If None, the snakefile will be run but no tests will be performed on - the output. - - input_data_func : None | callable - If not None, then this callable object will be called with - a single argument corresponding to the temp directory. 
It will be - called after the wrapper and test-case contents have been copied to the - temp dir, but before the test is run. It is expected to create any data - required in whatever directory structure is required. - - tmpdir : None or path - - """ - # store any tempdirs here for later deletion - to_clean_up = [] - - - if tmpdir is None: - tmpdir = tempfile.mkdtemp(prefix='.test', dir=os.path.abspath('.')) - else: - tmpdir = str(tmpdir) - try: - # copy over the wrapper - wrapper_dir = os.path.join(tmpdir, 'wrapper') - os.makedirs(wrapper_dir) - cmds = ( - 'find {} -maxdepth 1 -type f -print0 | xargs -0 cp -t {}' - .format(shlex.quote(path), shlex.quote(wrapper_dir)) - ) - sp.call(cmds, shell=True) - - # write the snakefile, filling in the "wrapper" placeholder - with open(os.path.join(tmpdir, 'Snakefile'), 'w') as fout: - fout.write('shell.executable("/bin/bash")\n') - fout.write(dedent(snakefile)) - - # Create the input data - input_data_func(tmpdir) - - success = snakemake(os.path.join(tmpdir, 'Snakefile'), workdir=tmpdir, stats='stats.txt', - snakemakepath=SCRIPTPATH, config={}, use_conda=use_conda, **params) - assert success, 'expected successful execution' - - # Change to the tmpdir and run the test function - if check is not None: - cwd = os.getcwd() - os.chdir(tmpdir) - check() - os.chdir(cwd) - - finally: - for t in to_clean_up: - shutil.rmtree(t) - #shutil.rmtree(tmpdir) - - -def symlink_in_tempdir(mapping): - """ - Returns a function that can be used for the `input_data_func` to utils.run. - - `mapping` is a dict where keys are 'target' and values are 'linkname'. - - It will symlink the data downloaded by the fixture into the temp dir - created for the test case. 
- """ - def _wrapped(tmpdir): - for k, v in mapping.items(): - _linkname = os.path.join(tmpdir, v) - _target = k - _linkdir = os.path.dirname(_linkname) - shell('mkdir -p {_linkdir} && ln -s {_target} {_linkname}') - return _wrapped - - -def rm(path): - shutil.rmtree(path) diff --git a/wrappers/test_toy.py b/wrappers/test_toy.py deleted file mode 100644 index a8e63a12..00000000 --- a/wrappers/test_toy.py +++ /dev/null @@ -1,100 +0,0 @@ -import os -from textwrap import dedent -import pytest -import utils - -# Each module has a config dict -config = dict() - - -def generic_fixture(key, mapping, factory): - """ - Tries to handle as much of the magic as possible. - - Parameters - ---------- - key : str - Key into the module-level config dict - - mapping : dict - Maps paths from fixtures to input files expected by the snakefile - - tmpdir : str - Path to temporary dir, usually created by utils.tmpdir_for_func - - Returns - ------- - After a successful Snakemake run, returns the dictionary of the config's - `output` key but with paths fixed to be relative to tmpdir. This returned - dict is ready to be used as a fixture by test functions. - """ - conf = config[key] - tmpdir = utils.tmpdir_for_func(factory) - input_data_func = utils.symlink_in_tempdir(mapping) - utils.run(utils.dpath(conf['wrapper']), conf['snakefile'], None, input_data_func, tmpdir) - output = conf['output'].copy() - for k, v in output.items(): - output[k] = os.path.join(tmpdir, v) - return output - - -# In order for the doc generation to find this config info without re-running -# all tests, it needs to be in the module-level dict. It similarly can't be -# added during the fixture function's runtime. -# -# However, the mapping and tmpdir must be provided by the function, so the -# config and the function are tightly coupled. -# -# So we add the item to the dictionary here, right above the function that will -# be using it to keep them tightly coupled in the file. 
-config['hisat2_index'] = dict( - description="Basic example of generating a hisat2 index", - wrapper="../wrappers/hisat2/build", - snakefile=""" - rule hisat2_build: - input: - fasta="2L.fa" - output: - index=expand("hisat2_index/assembly.{n}.ht2", n=range(1,9)) - log: "hisat.log" - wrapper: "file://wrapper" - """, - output={'prefix': 'hisat2_index/assembly'} -) - - -# All the hard work is done in the config and in generic_fixture(). Now we just -# need to set up the correct mapping of fixtures to input files. -@pytest.fixture(scope='module') -def hisat2_index(tmpdir_factory, dm6_fa): - mapping = {dm6_fa: '2L.fa'} - return generic_fixture('hisat2_index', mapping, tmpdir_factory) - -# The actual test. -def test_index(hisat2_index): - assert os.path.exists(hisat2_index['prefix'] + '.1.ht2') - - -def extract_examples_for_wrapper(wrapper): - """ - Returns the examples for the wrapper in markdown format. - - Parameters - ---------- - wrapper : str - Expected to be the value of one of the config dict's `wrapper` keys. - """ - markdown = [] - for k, v in config.items(): - if v['wrapper'] != wrapper: - continue - snakefile = dedent(v['snakefile']) - markdown.append( - dedent( - """ - {} - - ```python""".format(v['description']))) - markdown.append(snakefile) - markdown.append("```") - return "\n".join(markdown) diff --git a/wrappers/wrappers/atropos/README.md b/wrappers/wrappers/atropos/README.md deleted file mode 100644 index 56b28b18..00000000 --- a/wrappers/wrappers/atropos/README.md +++ /dev/null @@ -1,167 +0,0 @@ -# Wrapper for atropos -[Atropos](https://atropos.readthedocs.io/en/latest/index.html) is a fork of -[Cutadapt](http://cutadapt.readthedocs.io/en/stable/index.html) which finds and -removes adapter sequences, primers, poly-A tails and other types of unwanted -sequence from your high-throughput sequencing reads. 
- -# Examples - -Minimal usage: - -``` -rule atropos: - input: fastq='{sample}.fastq' - output: fastq='{sample}.trim.fastq' - threads: 4 - wrapper: - "file://path/to/atropos" -``` - -Use an adapters file and quality-trim reads to Q20: - -``` -rule atropos: - input: fastq='{sample}.fastq' - output: fastq='{sample}.trim.fastq' - params: extra="-a file:adapters.fa -q 20" - threads: 4 - wrapper: - "file://path/to/atropos" -``` - -Optionally provide the adapters file as input in order to trigger a re-run if -it has changed. The wrapper only pays attention to `input.fastq`, so adding -another key doesn't affect the wrapper: - -``` -rule atropos: - input: - fastq='{sample}.fastq', - adapters='adapters.fa' - output: fastq='{sample}.trim.fastq' - params: extra="-a file:adapters.fa -q 20" - threads: 4 - wrapper: - "file://path/to/atropos" -``` - -Example of how to use with other output files. Since the wrapper only pays -attention to `output.fastq`, so other output files can be indicated but their -filenames have to be indicated in `params.`: - -``` -rule atropos: - input: - fastq='{sample}.fastq', - adapters='adapters.fa' - output: - fastq='{sample}.trim.fastq', - short='{sample}.trim.too-short.fastq', - untrimmed='{sample}.untrimmed.fastq', - params: - extra=( - "-a file:adapters.fa " - "-q 20 " - "--too-short-output={sample}.trim.too-short.fastq " - "--untrimmed-output={sample}.untrimmed.fastq" - ) - threads: 4 - wrapper: - "file://path/to/atropos" -``` - -You can also run in pair-end mode. - -``` -rule atropos: - input: - R1='{sample}_r1.fastq', - R2='{sample}_r2.fastq', - adapters='adapters.fa' - output: - R1='{sample}_r1.trim.fastq', - R1='{sample}_r2.trim.fastq' - params: extra="-a file:adapters.fa -A file:adapters.fa -q 20" - threads: 4 - wrapper: - "file://path/to/atropos" -``` - - -## Input - -All inputs are FASTQ files, and they can be optionally gzipped. 
- -### Single-end mode: - -fastq : single-end FASTQ file - -### Paired-end mode: - -R1 : Read 1 FASTQ -R2 : Read 2 FASTQ - -See examples below for other input options including adapters. - -## Output -q -### Single-end mode: - -fastq : Trimmed FASTQ file. - -### Paired-end mode: - -R1 : trimmed R1 FASTQ file -R2 : trimmed R2 FASTQ file - -See examples below for other output options. - -## Log -If a log file is specified, stdout and stderr will be captured there. - -## Threads -One improvement of atropos over cutadapt is the ability to use threads which -are passed to the `-T` option. - -## Params -Additional parameters can be passed to atropos verbatim by supplying a string -in `params.extra`. - - -## Notes - -To dynamically select PE or SE without using `dynamic` support in snakemake, -you can use a PHONY rule and use a function for `params.R2`, like in this -example: - -```python -def _input_func_atropos(wildcards): - """Determine if the sample is PE or SE""" - flags = some function to pull in se or pe info - if 'PE' in flags: - return {'R1': expand(fastqs['r1'], **wildcards)[0], 'R2': expand(fastqs['r2'], **wildcards)[0]} - else: - return {'R1': expand(fastqs['r1'], **wildcards)[0]} - -def _params_r2_atropos(wildcards): - """function to make temp R2 if pe.""" - flags = some function to pull in se or pe info - if 'PE' in flags: - return expand(patterns['atropos']['r2'], **wildcards)[0] + '.tmp.gz' - else: - return None - -rule atropos: - input: unpack(_input_func_atropos) - output: R1=temp(patterns['atropos']['r1']) - params: R2=_params_r2_atropos - threads: 8 - wrapper: wrapper_for('atropos') - -rule atropos_phony: - input: rules.atropos.output - output: temp(patterns['atropos']['r2']) - shell: """ - mv {output[0]}.tmp.gz {output[0]} - """ -``` diff --git a/wrappers/wrappers/atropos/environment.yaml b/wrappers/wrappers/atropos/environment.yaml deleted file mode 100644 index 314bcf2c..00000000 --- a/wrappers/wrappers/atropos/environment.yaml +++ /dev/null @@ -1,4 
+0,0 @@ -channels: - - bioconda -dependencies: - - atropos ==1.1.5 diff --git a/wrappers/wrappers/atropos/wrapper.py b/wrappers/wrappers/atropos/wrapper.py deleted file mode 100644 index b6af4311..00000000 --- a/wrappers/wrappers/atropos/wrapper.py +++ /dev/null @@ -1,80 +0,0 @@ -__author__ = "Ryan Dale" -__copyright__ = "Copyright 2016, Ryan Dale" -__email__ = "dalerr@niddk.nih.gov" -__license__ = "MIT" - -from snakemake.shell import shell - -extra = snakemake.params.get('extra', '') -log = snakemake.log_fmt_shell() -inputs = snakemake.input -outputs = snakemake.output - -if isinstance(inputs, dict) and isinstance(outputs, dict): - # Get inputs - in_R1 = inputs.get('R1', None) - in_R2 = inputs.get('R2', None) - in_FASTQ = inputs.get('fastq', None) - - if (in_R1 is None) and (in_FASTQ is not None): - in_R1 = in_FASTQ - elif (in_R1 is None) and (in_FASTQ is None): - raise KeyError('If providing a dictionary for input/output, you must uese either ' - '`R1` or `fastq` for the first read. If providing a second read you must use `R2`.') - - # Get outputs - out_R1 = outputs.get('R1', None) - out_R2 = outputs.get('R2', snakemake.params.get('R2', None)) - out_FASTQ = outputs.get('fastq', None) - - if (out_R1 is None) and (out_FASTQ is not None): - out_R1 = out_FASTQ - elif (out_R1 is None) and (out_FASTQ is None): - raise KeyError('If providing a dictionary for input/output, you must uese either ' - '`R1` or `fastq` for the first read. 
If providing a second read you must use `R2`.') - -elif isinstance(inputs, list) and isinstance(outputs, list): - # Get inputs - if len(inputs) == 1: - in_R1 = inputs[0] - in_R2 = None - elif len(inputs) == 2: - in_R1 = sorted(inputs)[0] - in_R2 = sorted(inputs)[1] - else: - raise IndexError("If providing a list for input/output, they must have either 1 or 2 values.") - - # Get outputs - if len(outputs) == 1: - out_R1 = outputs[0] - out_R2 = snakemake.params.get('R2', None) - elif len(outputs) == 2: - out_R1 = sorted(outputs)[0] - out_R2 = sorted(outputs)[1] - else: - raise IndexError("If providing a list for input/output, they must have either 1 or 2 values.") - -# Run paired end if both in_R2 and out_R2 are provided -if (in_R2 is not None) and (out_R2 is not None): - shell( - "atropos trim " - "--threads {snakemake.threads} " - "{extra} " - "-pe1 {in_R1} " - "-pe2 {in_R2} " - "-o {out_R1} " - "-p {out_R2} " - "{log}" - ) -elif (in_R1 is not None) and (out_R1 is not None) and (in_R2 is None) and (out_R2 is None): - shell( - "atropos trim " - "{extra} " - "--threads {snakemake.threads} " - "-se {in_R1} " - "-o {out_R1} " - "{log}" - ) -else: - raise ValueError("Input and Output must match. If you give two value for " - "input you must give two values for output.") diff --git a/wrappers/wrappers/average-bigwigs/README.md b/wrappers/wrappers/average-bigwigs/README.md deleted file mode 100644 index af837c1f..00000000 --- a/wrappers/wrappers/average-bigwigs/README.md +++ /dev/null @@ -1,75 +0,0 @@ -# Average bigWigs - -Often we'd like to merge multiple bigWigs together for downstream work -(heatmaps, etc) but there's no single tool to do this. This wrapper runs -`bigWigMerge` on the inputs to sum their values, then uses `awk` to divide by -their values and sort the way bedGraphToBigWig wants them. - -The intermediate bedGraph file will be created in ``$TMPDIR``. 
- -## Examples - -Minimal usage: - -```python -rule average_bigwigs: - input: - bigwigs=[ - 'a.bw', - 'b.bw', - 'c.bw'], - chromsizes='genome.chromsizes' - output: - 'out.bw' - wrapper: - 'file://path/to/wrapper' -``` - -Increase memory used for sorting: - -```python -rule average_bigwigs: - input: - bigwigs=[ - 'a.bw', - 'b.bw', - 'c.bw'], - chromsizes='genome.chromsizes' - output: - 'out.bw' - params: - memory='32G' - wrapper: - 'file://path/to/wrapper' -``` - -Single bigwig just gets symlinked over. - -```python -rule average_bigwigs: - input: - bigwigs='a.bw', - chromsizes='genome.chromsizes' - output: - 'out.bw' - params: - memory='32G' - wrapper: - 'file://path/to/wrapper' -``` - -## Input - -List of bigWig files. - - -## Output - -Single bigWig file created by averaging the inputs - -## Threads -Does not use threads - -## Params - -memory: Passed to `sort` as the `-S` argument. diff --git a/wrappers/wrappers/average-bigwigs/environment.yaml b/wrappers/wrappers/average-bigwigs/environment.yaml deleted file mode 100644 index 64dcd155..00000000 --- a/wrappers/wrappers/average-bigwigs/environment.yaml +++ /dev/null @@ -1,5 +0,0 @@ -channels: - - bioconda -dependencies: - - ucsc-bigwigmerge - - ucsc-bedgraphtobigwig diff --git a/wrappers/wrappers/average-bigwigs/wrapper.py b/wrappers/wrappers/average-bigwigs/wrapper.py deleted file mode 100644 index 94be840a..00000000 --- a/wrappers/wrappers/average-bigwigs/wrapper.py +++ /dev/null @@ -1,32 +0,0 @@ -import os, sys -sys.path.append(os.path.abspath('../../')) -from lib import utils -import tempfile -from snakemake.shell import shell -# Inspired by http://wresch.github.io/2014/01/31/merge-bigwig-files.html - -# If memory was supplied, we'll use that for sorting. 
-if 'memory' in snakemake.params: - mem_arg = '-S {snakemake.params.memory}' -else: - mem_arg = '' - -if len(snakemake.input.bigwigs) == 1: - utils.make_relative_symlink(snakemake.input.bigwigs[0], snakemake.output[0]) - -else: - - # bigWigMerge outputs sum; we need to divide each by n. - f = 1.0 / len(snakemake.input.bigwigs) - - tmp = tempfile.NamedTemporaryFile(delete=False).name - tmpdir = tempfile.gettempdir() - - shell( - 'export LC_ALL=C; ' - 'bigWigMerge {snakemake.input.bigwigs} stdout 2> {snakemake.log} ' - """| awk 'BEGIN{{OFS="\t"}}{{$4={f}*$4; print}}' """ - '| sort {mem_arg} -T {tmpdir} -k1,1 -k2,2n > {tmp} ' - '&& bedGraphToBigWig {tmp} {snakemake.input.chromsizes} ' - '{snakemake.output} &>> {snakemake.log}' - ) diff --git a/wrappers/wrappers/combos/merge_and_dedup/README.md b/wrappers/wrappers/combos/merge_and_dedup/README.md deleted file mode 100644 index b768e7d2..00000000 --- a/wrappers/wrappers/combos/merge_and_dedup/README.md +++ /dev/null @@ -1,66 +0,0 @@ -# Merge and deduplicate - -Merges BAM files and then deduplicates the output. However if only one BAM file -is created, the file is simply symlinked. - -This wrapper is often needed in ChIP-seq to merge technical replicates. The -same fragment could have been sequenced in multiple tech reps, resulting in -duplicate reads in the merged output even though each individual BAM already -had duplicates removed. - -This method has an advantage over merging first and then deduping in separate -rules when we want to retain both individual (per tech rep) deduped BAMs as -well as merged deduped BAMs. Since the deduping has already happened once for -each tech rep, we want to avoid doing so again if no merging happens. 
- -## Examples - -Minimal usage: - -```python -rule merge_and_dedup: - input: 'a1.bam', 'a2.bam' - output: - bam='a-merged.bam', - metrics='a-merged.bam.metrics' - wrapper: - 'file://path/to/wrapper' -``` - -In the following case, a symlink will be created since no merging needs to be -performed on a single file: - -```python -rule merge_and_dedup: - input: 'a1.bam' - output: - bam='a-merged.bam', - metrics='a-merged.bam.metrics' - wrapper: - 'file://path/to/wrapper' -``` - - -## Input - -Single BAM or list of BAMs. - -## Output - -- `bam`: output bam file -- `metrics`: optional output metrics file. Default is to use - `{snakemake.output.bam}.metrics`. - -## Threads - -Threads are passed to `samtools merge`. - -## Params - -- `samtools_merge_extra`: addtional args passed verbatim to `samtools merge` - -- `markduplicates_extra`: addtional args passed verbatim to `markduplicates_extra` - -- `java_args`: passed to MarkDuplicates, often used to provide more memory - (e.g., `-Xmx32g`). Be sure to increase the corresponding rule's memory - resource to account for the additional allocation diff --git a/wrappers/wrappers/combos/merge_and_dedup/environment.yaml b/wrappers/wrappers/combos/merge_and_dedup/environment.yaml deleted file mode 100644 index b3e77ddb..00000000 --- a/wrappers/wrappers/combos/merge_and_dedup/environment.yaml +++ /dev/null @@ -1,7 +0,0 @@ -channels: - - bioconda - - conda-forge - -dependencies: - - picard - - samtools diff --git a/wrappers/wrappers/demo/README.md b/wrappers/wrappers/demo/README.md deleted file mode 100644 index a87fb3aa..00000000 --- a/wrappers/wrappers/demo/README.md +++ /dev/null @@ -1,69 +0,0 @@ -# Demo wrapper - -This wrapper demonstrates current best-practices. - -The target audience of the wrapper's README should be yourself six months from -now, under a tight deadline, frantically looking for that rule you wrote so you -can copy/paste into a custom Snakefile. - -Examples should come first. 
There should be at least a minimal example and -a reasonably complicated example. To be complete you can add links to docs, -a brief description of the tool, and example output. - -This demo wrapper simply copies input files to output files. - -## Examples - -Minimal usage: - -```python -rule demo: - input: 'a.txt' - output: 'b.txt' - wrapper: - 'file://path/to/wrapper' -``` - -"paired-end" usage: - -```python -rule demo: - input: - R1='a1.txt', - R2='a2.txt' - output: - R1='b1.txt', - R2='b2.txt' - wrapper: - 'file://path/to/wrapper' -``` - -## Input - -Input file formats for this wrapper can be anything. - -### Single-end mode: - -Expects a single unnamed input file. - -### Paired-end mode: - -Expects two input files with keys `R1` and `R2`. - -## Output - -Output files are simply copies of input. - -### Single-end mode: - -Expects a single unnamed output file - -### Paired-end mode: - -Expects two output files with keys `R1` and `R2`. - -## Threads -Does not use threads - -## Params -Does not use params diff --git a/wrappers/wrappers/demo/environment.yaml b/wrappers/wrappers/demo/environment.yaml deleted file mode 100644 index f56993b2..00000000 --- a/wrappers/wrappers/demo/environment.yaml +++ /dev/null @@ -1,4 +0,0 @@ -channels: - - defaults -dependencies: - - python=3 diff --git a/wrappers/wrappers/demo/wrapper.py b/wrappers/wrappers/demo/wrapper.py deleted file mode 100644 index 158ce409..00000000 --- a/wrappers/wrappers/demo/wrapper.py +++ /dev/null @@ -1,27 +0,0 @@ -#!/usr/bin/env python - -from snakemake.shell import shell - -# All wrappers must be able to handle an optional params.extra. -extra = snakemake.params.get('extra', '') - - -# This lets us handle whether to write to a log file or to write to stdout. -# See snakemake.script.log_fmt_shell for details. 
-log = snakemake.log_fmt_shell() - - -# This demo shows how to handle paired-end and single-end input data as two -# different cases, depending on whether the rule's input included an "R2" key -# or not. -paired_end = ( - 'R1' in snakemake.input.keys() and - 'R2' in snakemake.input.keys() -) - -if paired_end: - shell('cp {snakemake.input.R1} {snakemake.output.R1}') - shell('cp {snakemake.input.R2} {snakemake.output.R2}') - -else: - shell("cp {snakemake.input} {snakemake.output} {log}") diff --git a/wrappers/wrappers/dupradar/README.md b/wrappers/wrappers/dupradar/README.md deleted file mode 100644 index 0667bd9c..00000000 --- a/wrappers/wrappers/dupradar/README.md +++ /dev/null @@ -1,83 +0,0 @@ -# Wrapper for dupRadar - -dupRadar provides an easy way to distinguish between artifactual vs natural -duplicate reads in RNA-Seq data. Prior to dupRadar only global duplication rates -were used and they don't take into account the effect of gene expression levels. -dupRadar relates *duplication rates* and *length normalized read counts* of every -gene to model the dependency of both variables. 
- -[Link to homepage](https://www.bioconductor.org/packages/release/bioc/html/dupRadar.html) - -[Link to manual](https://www.bioconductor.org/packages/devel/bioc/vignettes/dupRadar/inst/doc/dupRadar.html) - -## Example - -Single-end, not stranded: - -```python -rule dupRadar: - input: - bam='sample1.bam', - annotation='dm6.gtf', - output: - density_scatter='sample1.density_scatter.png', - expression_histogram='sample1.expression_histogram.png', - expression_boxplot='sample1.expression_boxplot.png', - expression_barplot='sample1.expression_barplot.png', - multimapping_histogram='sample1.multimapping_histogram.png', - dataframe='sample1.dupradar.tsv' - wrapper: - wrapper_for('dupRadar') -``` - -Paired-end, stranded: - -```python -rule dupRadar: - input: - bam='{sample_dir}/{sample}/{sample}.cutadapt.hisat2.unique.sort.dedup.bam', - annotation='annotations/dm6.gtf', - output: - density_scatter='sample1.density_scatter.png', - expression_histogram='sample1.expression_histogram.png', - expression_boxplot='sample1.expression_boxplot.png', - expression_barplot='sample1.expression_barplot.png', - dataframe='sample1.dupradar.tsv' - params: - paired=True, - stranded=True - wrapper: - wrapper_for('dupRadar') -``` - -## Input -* `bam`: BAM file with mapped reads has to be duplicate marked using either - Picard or BamUtil - -* `annotation`: GTF file contaning features to count the reads falling on the - features. - -## Output -Output plots are described in the [dupRadar -vignette)[http://bioconductor.org/packages/release/bioc/vignettes/dupRadar/inst/doc/dupRadar.html]. -See that page for descriptions of outputs and how to interpret them. - -* `density_scatter`: expression vs percent duplication -* `expression_boxplot`: expression vs percent duplication, binned into boxes -* `expression_histogram`: standard histogram of expression (RPKM) -* `expression_barplot`: percentage duplication in 5% expression bins. 
-* `multimapping_histogram`: histogram showing fraction of reads coming from - multimapping reads -* `dataframe`: results from `analyzeDuprates` saved as a TSV for downstream - analysis. Following the vignette, we also add the fraction of multimappers in - each gene as the column `mhRate`. -* `model`: Slope and intercept of the dupsExpFit -* `curve`: Simplified curve of the GLM for downstream plotting - -## Threads -Threads are passed to dupRadar and are in turn passed to featureCounts, which -it calls automatically. - -## Params -* `paired`: True | False. Default False. -* `stranded`: True | False | "reverse". Default False. diff --git a/wrappers/wrappers/dupradar/environment.yaml b/wrappers/wrappers/dupradar/environment.yaml deleted file mode 100644 index d59b35e1..00000000 --- a/wrappers/wrappers/dupradar/environment.yaml +++ /dev/null @@ -1,10 +0,0 @@ -channels: - - conda-forge - - bioconda - - lcdb -dependencies: - - python=3 - - bioconductor-dupradar - - r-kernsmooth - - r-base >=3.5.1 - - ghostscript diff --git a/wrappers/wrappers/dupradar/wrapper.py b/wrappers/wrappers/dupradar/wrapper.py deleted file mode 100644 index e9ef30d6..00000000 --- a/wrappers/wrappers/dupradar/wrapper.py +++ /dev/null @@ -1,94 +0,0 @@ -import tempfile -from snakemake.shell import shell -import os, sys -sys.path.append(os.path.abspath('../..')) -from lib import helpers - -extra = snakemake.params.get('extra', '') -try: - log = snakemake.log -except AttributeError: - log = None - -stranded = snakemake.params.get('stranded', False) -try: - stranded_int = {False: 0, True: 1, 'reverse': 2}[stranded] -except KeyError: - raise ValueError('"stranded" must be True|False|"reverse"') - -paired = snakemake.params.get('paired', False) -try: - paired_bool= {True: 'TRUE', False: 'FALSE'}[paired] -except KeyError: - raise ValueError('"paired" must be True or False') - -tempdir = tempfile.mkdtemp() - -# To avoid issues with png() related to X11 and cairo, we can use bitmap() instead. 
-# (thanks -# http://stackoverflow.com/questions/24999983/ -# r-unable-to-start-device-png-capabilities-has-true-for-png -# #comment52353278_25064603 ) - -script = """ -library(dupRadar) -bam <- "{snakemake.input.bam}" -gtf <- "{snakemake.input.annotation}" -dm <- analyzeDuprates(bam, gtf, {stranded_int}, {paired_bool}, {snakemake.threads}, tmpDir = "{tempdir}") - -dm$mhRate <- (dm$allCountsMulti - dm$allCounts) / dm$allCountsMulti -bitmap(file="{snakemake.output.multimapping_histogram}") -hist(dm$mhRate, breaks=50, main=basename(bam), - xlab="Multimapping rate per gene", ylab="Frequency") -dev.off() - -bitmap(file="{snakemake.output.density_scatter}") -duprateExpDensPlot(dm, main=basename(bam)) -dev.off() - -bitmap(file="{snakemake.output.expression_histogram}") -expressionHist(dm) -dev.off() - -bitmap(file="{snakemake.output.expression_boxplot}") -par(mar=c(10,4,4,2)+.1) -duprateExpBoxplot(dm, main=basename(bam)) -dev.off() - -bitmap(file="{snakemake.output.expression_barplot}") -readcountExpBoxplot(dm) -dev.off() - -write.table(dm, file="{snakemake.output.dataframe}", sep="\\t") - -# The following is from -# https://github.com/ewels/NGI-RNAseq/blob/master/bin/dupRadar.r - -fit <- duprateExpFit(DupMat=dm) -df <- data.frame(intercept=as.numeric(fit$intercept), slope=c(fit$slope)) -cat("# dupRadar model params\\n", file="{snakemake.output.model}") -write.table(df, file="{snakemake.output.model}", sep="\\t", append=TRUE, row.names=FALSE) - -# Get numbers from dupRadar GLM -curve_x <- sort(log10(dm$RPK)) -curve_y = 100*predict(fit$glm, data.frame(x=curve_x), type="response") -# Remove all of the infinite values -infs = which(curve_x %in% c(-Inf,Inf)) -curve_x = curve_x[-infs] -curve_y = curve_y[-infs] -# Reduce number of data points -curve_x <- curve_x[seq(1, length(curve_x), 10)] -curve_y <- curve_y[seq(1, length(curve_y), 10)] -# Convert x values back to real counts -curve_x = 10^curve_x -# Write to file -write.table( - cbind(curve_x, curve_y), - 
file="{snakemake.output.curve}", - quote=FALSE, row.names=FALSE -) -""".format(**locals()) - -tmp = tempfile.NamedTemporaryFile(delete=False).name -helpers.rscript(script, tmp, log=log) -shell("rm -r {tempdir}") diff --git a/wrappers/wrappers/epic2/environment.yaml b/wrappers/wrappers/epic2/environment.yaml deleted file mode 100644 index cacda5da..00000000 --- a/wrappers/wrappers/epic2/environment.yaml +++ /dev/null @@ -1,8 +0,0 @@ -channels: - - bioconda - - conda-forge -dependencies: - - epic2 - - numpy - - bedtools - - ucsc-bedsort=377 diff --git a/wrappers/wrappers/fastq-dump/environment.yaml b/wrappers/wrappers/fastq-dump/environment.yaml deleted file mode 100644 index 6653b6cc..00000000 --- a/wrappers/wrappers/fastq-dump/environment.yaml +++ /dev/null @@ -1,5 +0,0 @@ -channels: - - conda-forge - - bioconda -dependencies: - - sra-tools>=3 diff --git a/wrappers/wrappers/fastq-dump/wrapper.py b/wrappers/wrappers/fastq-dump/wrapper.py deleted file mode 100644 index 507efe43..00000000 --- a/wrappers/wrappers/fastq-dump/wrapper.py +++ /dev/null @@ -1,41 +0,0 @@ -from snakemake import shell -output = snakemake.output -log = snakemake.log - -srr = snakemake.params.sampletable.loc[snakemake.wildcards.sample, 'Run'] - -if hasattr(snakemake.params, "limit"): - limit = f'-X {snakemake.params.limit}' -else: - limit = "" - -# Two different paths depending on the layout. In both cases, we -# want to avoid creating the final output until the very end, to -# avoid incomplete downloads. -if snakemake.params.is_paired: - # For PE we need to use --split-files, which also means using - # the slower --gzip - shell( - 'fastq-dump ' - '{srr} ' - '--gzip ' - '--split-files ' - '{limit} ' - '&> {log}' - ) - - # The filenames are predictable, so we can move them as needed. - shell('mv {srr}_1.fastq.gz {output[0]}') - shell('mv {srr}_2.fastq.gz {output[1]}') - -else: - # For SE, we can use the faster stdout | gzip, and move it - # directly when done. 
- shell( - 'fastq-dump ' - '{srr} ' - '-Z ' - '{limit} ' - '2> {log} | gzip -c > {output[0]}.tmp ' - '&& mv {output[0]}.tmp {output[0]} ' - ) diff --git a/wrappers/wrappers/fastq_screen/README.md b/wrappers/wrappers/fastq_screen/README.md deleted file mode 100644 index efd36a32..00000000 --- a/wrappers/wrappers/fastq_screen/README.md +++ /dev/null @@ -1,61 +0,0 @@ -# Wrapper for fastq_screen - -[`fastq_screen`](http://www.bioinformatics.babraham.ac.uk/projects/fastq_screen) -screens a library of sequences in FASTQ format against a set of sequence -databases identifying the composition of the library and possible contaminants. - -Fastq screen uses a configuration file pointing to different database. For example: - -``` -DATABASE ecoli /data/Escherichia_coli/Bowtie2Index/genome BOWTIE2 -DATABASE hg19 /data/hg19/Bowtie2Index/genome BOWTIE2 -DATABASE mm10 /data/mm10/Bowtie2Index/genome BOWTIE2 -``` - -This configuration file is automatically generated by the wrapper based on -which indexes are given as inputs (see **Example**). Currently the wrapper only -supports bowtie2 and defaults to using a subset of 100000 reads. Which can be -overridden using `params.subset` setting. Furthermore, `params.extra` is -passed arguments verbatim to `fastq_screen`, for example -`extra="--illumina1_3"` or `extra="--bowtie2 '--trim5=8'"`. - -Note that `fastq_screen` hard-codes the output filenames. This wrapper moves -the hard-coded output files to those specified by the rule. Currently the -wrapper does not save png's generated by fastq screen. It does, however, support -the contextual saving of tagged and/or filtered output fastqs from fastq_screen. -If desired, combinations of "--tag" and/or "--filter [filter_codes]" should be -provided to the run via the "extra" parameter in the Snakemake rule. The output -fastqs will *not* be tracked by Snakemake. They will be named as -"{snakemake.output.txt}.tagged.fastq.gz" or "{snakemake.output.txt}.tagged_filter.fastq.gz" -respectively. 
- -## Example: - -``` -rule fastq_screen: - input: - fastq="samples/{sample}.fastq.gz", - ecoli=["/data/Escherichia_coli/Bowtie2Index/genome.1.bt2", "/data/Escherichia_coli/Bowtie2Index/genome.2.bt2"], - hg19=["/data/hg19/Bowtie2Index/genome.1.bt2", "/data/hg19/Bowtie2Index/genome.2.bt2"], - mm10=["/data/mm10/Bowtie2Index/genome.1.bt2", "/data/mm10/Bowtie2Index/genome.2.bt2"] - output: - txt="qc/{sample}.fastq_screen.txt" - params: - subset=100000, - aligner='bowtie2' - threads: 8 - wrapper: - "file:wrapper" -``` - -## Input - -* `fastq` is a FASTQ file, gzipped or not. - -* Additional arguments are used as labels and their values will be used to - generate database location. - -## Output - -`txt`: a text file containing the fraction of reads mapping to each provided -index diff --git a/wrappers/wrappers/fastq_screen/environment.yaml b/wrappers/wrappers/fastq_screen/environment.yaml deleted file mode 100644 index 360a727c..00000000 --- a/wrappers/wrappers/fastq_screen/environment.yaml +++ /dev/null @@ -1,7 +0,0 @@ -channels: - - conda-forge - - bioconda -dependencies: - - python=3 - - fastq-screen - - bowtie2 diff --git a/wrappers/wrappers/fastq_screen/wrapper.py b/wrappers/wrappers/fastq_screen/wrapper.py deleted file mode 100644 index 9b262cc1..00000000 --- a/wrappers/wrappers/fastq_screen/wrapper.py +++ /dev/null @@ -1,72 +0,0 @@ -import os -from snakemake.shell import shell -import sys -sys.path.append(os.path.abspath('../..')) -from lib import aligners -import tempfile - -__author__ = "Ryan Dale" -__copyright__ = "Copyright 2016, Ryan Dale" -__email__ = "dalerr@niddk.nih.gov" -__license__ = "MIT" - -# Pull in parameters -extra = snakemake.params.get('extra', '') -aligner = snakemake.params.get('aligner', 'bowtie2') -subset = snakemake.params.get('subset', 100000) - -if aligner == 'bowtie2': - parse_index = aligners.prefix_from_bowtie2_index - -# Make log -log = snakemake.log_fmt_shell() - -# snakemake.params.fastq_screen_config can be either a dict or a 
string. If -# string, interpret as a filename pointing to the fastq_screen config file. -# Otherwise, create a new tempfile out of the contents of the dict: - -tmp = tempfile.NamedTemporaryFile(delete=False).name -with open(tmp, 'w') as fout: - for k, v in snakemake.input.items(): - if k != 'fastq': - label = k - if isinstance(v, str): - v = [v] - index = parse_index(v) - fout.write( - '\t'.join(['DATABASE', label, index, aligner.upper()]) + '\n') - config_file = tmp - -# fastq_screen hard-codes filenames according to this prefix. We will send -# hard-coded output to a temp dir, and then move them later. -tempdir = tempfile.mkdtemp() - -# Note that we assume only R1 is coming in. -prefix = os.path.basename(snakemake.input.fastq[0].split('.fastq')[0]) - -shell( - "fastq_screen --outdir {tempdir} " - "--force " - "--aligner {aligner} " - "--conf {config_file} " - "--subset {subset} " - "--threads {snakemake.threads} " - "{extra} " - "{snakemake.input.fastq} " - "{log}" -) - -# Move output to the filenames specified by the rule -shell("cp {tempdir}/{prefix}_screen.txt {snakemake.output.txt}") - -# Check for the output of the --tag option to fastq_screen -if os.path.isfile("{tempdir}/{prefix}.tagged.fastq.gz"): - shell("cp {tempdir}/{prefix}.tagged.fastq.gz {snakemake.output.txt}.tagged.fastq.gz") - -# Check for the output of the --filter XXXXXX option to fastq_screen -if os.path.isfile("{tempdir}/{prefix}.tagged_filter.fastq.gz"): - shell("cp {tempdir}/{prefix}.tagged_filter.fastq.gz {snakemake.output.txt}.tagged_filter.fastq.gz") - -# Clean up temp -shell("rm -r {tempdir}") -shell("rm {tmp}") diff --git a/wrappers/wrappers/fastqc/README.md b/wrappers/wrappers/fastqc/README.md deleted file mode 100644 index 678bf9be..00000000 --- a/wrappers/wrappers/fastqc/README.md +++ /dev/null @@ -1,32 +0,0 @@ -# Wrapper for FastQC - -[FastQC](http://www.bioinformatics.babraham.ac.uk/projects/fastqc/) performs -quality control for high-throughput sequencing data. 
- -## Input -FASTQ, SAM, or BAM file. FastQC will auto-detect, but you can also use -`--format` and one of bam, sam, bam_mapped, sam_mapped or fastq in the -params.extra field (see example). - -## Output -- html: an html file containing the report for the sample -- zip: a zip file containing the images and text file of results - -## Threads -Supports threads, passed in as the `--threads` arg - -## Params -Additional parameters can be passed to FastQC verbatim by supplying a string in params.extra. - -# Example - -``` -rule fastqc: - input: 'samples/{sample}.fastq' - output: - html='samples/{sample}.fastqc.html', - zip='samples/{sample}.fastqc.zip' - params: extra="--contaminants adapters.tsv --format fastq" - wrapper: - "file://path/to/fastqc" -``` diff --git a/wrappers/wrappers/fastqc/environment.yaml b/wrappers/wrappers/fastqc/environment.yaml deleted file mode 100644 index 3d0dee62..00000000 --- a/wrappers/wrappers/fastqc/environment.yaml +++ /dev/null @@ -1,9 +0,0 @@ -channels: - - bioconda - - conda-forge -dependencies: - # for fastqc running in minimal containers, which complain about missing - # fonts - - openjdk >=8.0.144 - - font-ttf-dejavu-sans-mono - - fastqc diff --git a/wrappers/wrappers/fastqc/wrapper.py b/wrappers/wrappers/fastqc/wrapper.py deleted file mode 100644 index 32032bbd..00000000 --- a/wrappers/wrappers/fastqc/wrapper.py +++ /dev/null @@ -1,48 +0,0 @@ -__author__ = "Ryan Dale" -__copyright__ = "Copyright 2016, Ryan Dale" -__email__ = "dalerr@niddk.nih.gov" -__license__ = "MIT" - -import os -from snakemake.shell import shell -from snakemake.utils import makedirs - -# fastqc creates a zip file and an html file but the filename is hard-coded by -# replacing fastq|fastq.gz|fq|fq.gz|bam with _fastqc.zip|_fastqc.html in the -# input file's basename. -# -# So we identify that file and move it to the expected output after fastqc is -# done. 
- -outfile = os.path.basename(snakemake.input[0]) -outdir = os.path.dirname(snakemake.output.html) -if outdir == '': - outdir = '.' - -strip = ['.fastq', '.fq', '.gz', '.bam'] -for s in strip: - outfile = outfile.replace(s, '') -out_zip = os.path.join(outdir, outfile + '_fastqc.zip') -out_html = os.path.join(outdir, outfile + '_fastqc.html') - -extra = snakemake.params.get('extra', '') -log = snakemake.log_fmt_shell() - -shell( - 'fastqc ' - '--threads {snakemake.threads} ' - '--noextract ' - '--quiet ' - '--outdir {outdir} ' - '{extra} ' - '{snakemake.input} ' - '{log} ' -) - -def same_file(x, y): - return os.path.abspath(x) == os.path.abspath(y) - -if not same_file(out_zip,snakemake.output.zip): - shell('mv {out_zip} {snakemake.output.zip}') -if not same_file(out_html, snakemake.output.html): - shell('mv {out_html} {snakemake.output.html}') diff --git a/wrappers/wrappers/macs2/callpeak/README.md b/wrappers/wrappers/macs2/callpeak/README.md deleted file mode 100644 index bafad838..00000000 --- a/wrappers/wrappers/macs2/callpeak/README.md +++ /dev/null @@ -1,61 +0,0 @@ -# MACS2 - -Wraps the `macs2 callpeak` subprogram to call ChIP-seq peaks on input BAM -files. - -## Examples - -Minimal usage. MACS2 outputs a whole directory; this directory is the dirname -of `output.bed`. Note the specification of the genome size in `params.extra`. - -```python -rule macs2: - input: - treatment='ip.bam', - control='input.bam', - chromsizes='dm6.chromsizes' - output: - bed='out/peaks.bed' - extra: '-g dm' - wrapper: - 'file://path/to/wrapper' -``` - -MACS2 supports multiple ip and input samples (they are concatenated). 
This also -shows broad peak-calling, asks MACS2 to create scaled bedgraphs, and adds them as -output files so downstream rules can use them: - -```python -rule macs2: - input: - treatment=['ip1.bam', 'ip2.bam'], - control=['input1.bam', 'input2.bam'], - chromsizes='dm6.chromsizes' - output: - bed='out/peaks.bed' - params: extra='-g dm --bdg --SPMR --broad' - wrapper: - 'file://path/to/wrapper' -``` - -## Input - -`treatment`: single BAM or list of BAMs for IP - -`control`: single BAM or list of BAMs for input - -`chromsizes`: Chromsizes table, used to ensure peak boundaries do not extend -outside of chromosome limits. - -## Output - -`bed`: BED file of called peaks. This is symlinked from the -`*_peaks.narrowPeak` or `*_peaks.broadPeak` file created by MACS2. - -Other files are created, these can be added as additional named outputs for use -by downstream rules, however the wrapper only pays attention to -`snakemake.output.bed`. - - -## Params -Additional params in `extra` will be passed verbatim to `macs2 callpeak`. diff --git a/wrappers/wrappers/macs2/callpeak/environment.yaml b/wrappers/wrappers/macs2/callpeak/environment.yaml deleted file mode 100644 index 51d04270..00000000 --- a/wrappers/wrappers/macs2/callpeak/environment.yaml +++ /dev/null @@ -1,8 +0,0 @@ -channels: - - bioconda - - conda-forge -dependencies: - - macs2 - - numpy - - bedtools - - ucsc-bedsort=377 diff --git a/wrappers/wrappers/sicer/README.md b/wrappers/wrappers/sicer/README.md deleted file mode 100644 index 9be29101..00000000 --- a/wrappers/wrappers/sicer/README.md +++ /dev/null @@ -1,59 +0,0 @@ -# SICER - -Wraps the `sicer` program to call ChIP-seq peaks on input BED files. - -## Examples - -Minimal usage. SICER is the best operating piece of hot garbage you'll ever find. 
-It has a completely fixed set of input parameters it requires, hard-coded genome -data in SICER/lib/GenomeData.py (submit bug report in bioconda if you need -additions), and it can't be run from the same directory at the same time due to -hard coded output filenames. It's a proper mess boss. - -```python -rule sicer: - input: - ip='ip.bed', - control='input.bed', - redundancy_threshold=1, - window_size=200, - fragment_size=150, - effective_genome_fraction=0.75, - gap_size=600, - fdr=0.01 - output: - bed='out/peaks.bed' - wrapper: - 'file://path/to/wrapper' -``` - - -## Input - -`ip`: single BED for IP - -`control`: single BED for input - -`redundancy_threshold`: cutoff count above which duplicates are removed - -`window_size`: SICER resolution; 200 recommended for histones - -`fragment_size`: twice the shift from the beginning to the center of a read - -`effective_genome_fraction`: percentage of mappable genome; only set it here if you want to override the genome build in config.yaml - -`gap_size`: nonnegative integer multiple of window size. used to merge contiguous regions (higher means more liberal merging). - -`fdr`: FDR cutoff for calling significant regions. - -## Output - -`bed`: BED file of called peaks. This is a delicately processed version of `*island.bed` from SICER. - -Other files are created, these can be added as additional named outputs for use -by downstream rules, however the wrapper only pays attention to -`snakemake.output.bed`. - - -## Params -Do not use `extra` for this rule. 
diff --git a/wrappers/wrappers/sicer/environment.yaml b/wrappers/wrappers/sicer/environment.yaml deleted file mode 100644 index 44cd4d76..00000000 --- a/wrappers/wrappers/sicer/environment.yaml +++ /dev/null @@ -1,10 +0,0 @@ -channels: - - bioconda - - conda-forge -dependencies: - - python=2 - - numpy - - sicer - - bedtools - - ucsc-bedsort=377 - - ucsc-wigtobigwig=377 diff --git a/wrappers/wrappers/sicer/wrapper.py b/wrappers/wrappers/sicer/wrapper.py deleted file mode 100644 index 7fd29a9e..00000000 --- a/wrappers/wrappers/sicer/wrapper.py +++ /dev/null @@ -1,147 +0,0 @@ -import tempfile -import os -import glob -from snakemake import shell - -logfile = None - -# as SICER's interface is rather strict, this wrapper enforces named variables -# instead of 'extra' arbitrary string - -def get_value(key, key2=None): - """ - Get the value from params.block if it exists, otherwise from params. - - If key2 is not None, it's a different key to extract from the same params.block. - - Raises ValueError if nothing is configured. 
- """ - if key2 is None: - key2 = key - val = snakemake.params.block.get(key, snakemake.params.get(key)) - else: - val = snakemake.params.block.get(key, snakemake.params.block.get(key2)) - - if val is None: - raise ValueError( - "SICER requires the specification of '{0}'".format(key)) - return val - -redundancy_threshold = get_value('redundancy_threshold') -window_size = get_value('window_size') -fragment_size = get_value('fragment_size') -effective_genome_fraction = get_value('effective_genome_fraction', 'reference_effective_genome_fraction') -gap_size = get_value('gap_size') -fdr = get_value('fdr') -genome_build = get_value('genome_build', 'reference_genome_build') - -outdir, basebed = os.path.split(snakemake.output.bed) -label = snakemake.params.block['label'] - -tmpdir = tempfile.mkdtemp() -cwd = os.getcwd() - -# SICER expects bed input format, not bam as in other peak callers -shell( - 'bamToBed -i {snakemake.input.ip} > {tmpdir}/ip.bed ; ' - 'bamToBed -i {snakemake.input.control} > {tmpdir}/in.bed ' -) - -# SICER emits a single hard-coded file that does not respect output directory. -# So move each run into its own temp directory to avoid collisions with -# other processes. -os.chdir(tmpdir) - -shell( - # there is a CI-specific bug, in which the python symlink is not correctly resolved to python2.7; - # so as a really desperate hack, modify SICER's python calls to directly touch 2.7 - """sed 's/^python/$CONDA_PREFIX\/bin\/python2.7/' """ - """$CONDA_PREFIX/share/sicer*/SICER.sh > {tmpdir}/SICER.sh && chmod u+x {tmpdir}/SICER.sh """ -) -shell( - # run SICER - """{tmpdir}/SICER.sh {tmpdir} ip.bed in.bed {tmpdir} """ - """{genome_build} {redundancy_threshold} {window_size} """ - """{fragment_size} {effective_genome_fraction} {gap_size} {fdr} > tmp.output 2>&1 """ -) - -# Move back once the run is complete. 
-os.chdir(cwd) - -# one of the results files gets converted to the broadPeak format ala macs -resultsfile = glob.glob(os.path.join(tmpdir, '*-islands-summary-FDR*')) -if len(resultsfile) == 1: - hit = resultsfile[0] - basehit = os.path.basename(resultsfile[0]) -elif len(resultsfile) > 1: - raise ValueError( - "Multiple islands-summary-FDR files found in {1}: {0}" - .format(os.listdir(tmpdir), tmpdir) - ) -else: - raise ValueError("No islands-summary-FDR file found in {1}: {0}".format(os.listdir(tmpdir), tmpdir)) - -# "summary graph for [the run] in bedGraph format" -summary_graph = glob.glob(os.path.join(tmpdir, '*-W{0}.graph*'.format(window_size))) -if len(summary_graph) == 1: - summary_graph = summary_graph[0] -else: - raise ValueError("SICER graph output file not found") - -# the bedGraph file above, normalized by library size per million, in wig format -normalized_prefilter_wig = glob.glob(os.path.join(tmpdir, '*-W{0}-normalized.wig'.format(window_size))) -if len(normalized_prefilter_wig) == 1: - normalized_prefilter_wig = normalized_prefilter_wig[0] -else: - raise ValueError("SICER normalized prefilter wig file not found") - -# "summary of all candidate islands with their statistical significance -candidate_islands = glob.glob(os.path.join(tmpdir, '*-W{0}-G{1}-islands-summary'.format(window_size, gap_size))) -if len(candidate_islands) == 1: - candidate_islands = candidate_islands[0] -else: - raise ValueError("SICER candidate islands file not found") - -# "delineation of significant islands" -significant_islands = glob.glob(os.path.join(tmpdir, '*-W{0}-G{1}-FDR*-island.bed'.format(window_size, gap_size))) -if len(significant_islands) == 1: - significant_islands = significant_islands[0] -else: - raise ValueError("SICER significant islands file not found") - -# "library of raw redundancy-removed reads on significant islands -redundancy_removed = glob.glob(os.path.join(tmpdir, '*-W{0}-G{1}-FDR*-islandfiltered.bed'.format(window_size, gap_size))) -if 
len(redundancy_removed) == 1: - redundancy_removed = redundancy_removed[0] -else: - raise ValueError("SICER redundancy removed library file not found") - -# "wig file for the island-filtered redundancy-removed reads -normalized_postfilter_wig = glob.glob(os.path.join(tmpdir, '*-W{0}-G{1}-FDR*-islandfiltered-normalized.wig'.format(window_size, gap_size))) -if len(normalized_postfilter_wig) == 1: - normalized_postfilter_wig = normalized_postfilter_wig[0] -else: - raise ValueError("SICER normalized postfilter wig file not found") - -shell( - "export LC_COLLATE=C; " - # format the output in broadPeak format - # note that SICER can emit p-values of 0 and in that case this file will contain "inf" entries - """awk -F"\\t" -v lab={label} """ - """'{{printf("%s\\t%d\\t%d\\t%s_peak_%d\\t%d\\t.\\t%g\\t%g\\t%g\\n", $1, """ - """$2, $3-1, lab, NR, -10*log($6)/log(10), $7, -log($6)/log(10), -log($8)/log(10))}}' """ - "{hit} > {snakemake.output.bed}.tmp && " - # sort the bed file, just to be sure - "bedSort {snakemake.output.bed}.tmp {snakemake.output.bed} && " - # rename the assorted output files - "mv {resultsfile} {snakemake.output.bed}-islands-summary-significant && " - "mv {summary_graph} {snakemake.output.bed}.graph && " - "wigToBigWig {normalized_prefilter_wig} {snakemake.input.chromsizes} {snakemake.output.bed}-normalized-prefilter.bigWig && " - "wigToBigWig {normalized_postfilter_wig} {snakemake.input.chromsizes} {snakemake.output.bed}-normalized-postfilter.bigWig && " - "mv {candidate_islands} {snakemake.output.bed}-islands-summary && " - "mv {significant_islands} {snakemake.output.bed}-island.bed && " - "mv {redundancy_removed} {snakemake.output.bed}-islandfiltered.bed && " - "mv {tmpdir}/tmp.output {snakemake.output.bed}.log && " - # clean up the temp directory - "rm {snakemake.output.bed}.tmp && rm -Rf {tmpdir}" -) diff --git a/wrappers/wrappers/spp/README.md b/wrappers/wrappers/spp/README.md deleted file mode 100644 index a8eb7c43..00000000 --- 
a/wrappers/wrappers/spp/README.md +++ /dev/null @@ -1,175 +0,0 @@ -# spp - -Wraps the [`spp`](http://compbio.med.harvard.edu/Supplements/ChIP-seq/) peak-caller. - -This is a rather complicated wrapper. See input and output sections below for -details. - - -## Examples - -Minimal usage: - -```python -rule spp: - input: - ip="ip.bam", - control="control.bam", - chromsizes='dm6.chromsizes' - output: "peaks.bed" - wrapper: - 'file://path/to/wrapper' -``` - -Specify parameters (see below for options): - - -```python -rule spp: - input: - ip="ip.bam", - control="control.bam", - chromsizes='dm6.chromsizes' - output: "peaks.bed" - params: block={'fdr': 0.1} - - wrapper: - 'file://path/to/wrapper' -``` - -Specify additional output files: - -```python -rule spp: - input: - ip="ip.bam", - control="control.bam", - chromsizes='dm6.chromsizes' - output: - bed="peaks.bed" - enrichment_estimates="enrichment_est.bedgraph", - smoothed_enrichment_mle="enrichment_mle.bedgraph", - rdata="image.RData" - params: block={'fdr': 0.1} - log: "spp.log" -``` - -The works, with multiple replicate BAMs to be merged, keeping the tempfiles, -increasing the memory available to MarkDuplicates, all the output files, -adjusting spp params, and using 8 threads for merging and duplicates removal: - - -```python -rule spp: - input: - ip=["ip.bam", "ip2.bam"], - control=["control.bam", "control2.bam", "control3.bam"], - chromsizes='dm6.chromsizes' - output: - bed="peaks.bed" - enrichment_estimates="enrichment_est.bedgraph", - smoothed_enrichment_mle="enrichment_mle.bedgraph", - rdata="image.RData" - log: 'spp.log' - threads: 8 - params: - block={'fdr': 0.1, 'bins': 10}, - java_args='-Xmx64g' - keep_tempfiles=True - log: "spp.log" -``` - -## Input - -`ip`, `control`: BAM files. Duplicates should already be removed. - -`chromsizes`: Chromsizes table, used to ensure peak boundaries do not extend -outside of chromosome limits. 
- -SPP itself only supports a single BAM file for IP and a single BAM file for -control. However, to support the common case of pooling replicates to gain -coverage, this wrapper does handle multiple BAMs. - -If more than one BAM is provided for either IP or control, the BAMs are merged -and then duplicates are removed from the merged file (to handle reads that -occur in both replicates, which would otherwise cause spp to complain) are -then removed using MarkDuplicates. This merged, deduped BAM is then provided to -SPP. - -The merged BAM, merged-and-deduped BAM, and metrics file (from MarkDuplicates) -are created as temp files. The temp filenames are indicated in the log. If you -need these for debugging, set `params: keep_tempfiles=True` to keep them. - -## Output - -The only required output is `bed`. Others, if specified, will trigger their -respective creation. - -`bed`: narrowPeak format. - -`smoothed_enrichment_mle`: BEDGRAPH file (even though SPP calls it a "WIG") of -smoothed enrichment using the `smoothed.enrichment.mle` method from SPP. -Optional, if not specified it will not be created. - -`enrichment_estimates`: BEDGRAPH file (even though SPP calls it a "WIG") of -enrichment estimates using the `get.conservative.fold.enrichment.profile` -function from SPP. Optional, if not specified will not be created. - -`rdata`: Saves an image of the workspace. Handy for debugging. Optional, if not -specified will not be created. - -An R script named after the BED file (`{snakemake.output.bed}.R`), will be -written to the output directory. This can be run from the same directory as the -snakefile was run from for debugging purposes. - -## Threads -We do not run SPP in parallel mode due to trouble with running the `snow` -library on clusters (it seems to crash unexpectedly and intermittently). -However, for multiple BAMs, we pass the threads to samtools and MarkDuplicates. 
- -## Params - -### wrapper params - -`keep_tempfiles`: bool; if True then tempfiles created by merging and deduping -replicate BAMs will be retained for debugging purposes. - -`java_args`: str; additional args provided to picard, e.g., `java_args="-Xmx64g"` - -### spp params - -Since SPP doesn't have a command-line interface, we can't use the "extras=" -mechanism to pass params verbatim. Instead, the R script created by the wrapper -supports the following parameters, provided as keys to the `block` param to -make it easier to work with the chipseq config format. For example: - -```python -params: - block={'bins': 5, 'fdr': 0.1}, - java_args='-Xmx64g' -``` - -`srange`: tuple; controls the range of lags over which to calculate -cross-correlation. Default is `(50, 500)` - -`bins`: integer; controls how the binding characteristics will be binned. Default -is `5`. - -`tecfilter`: bool; passed to `find.binding.positions` function. Default is True; -set to False to prevent the exclusion of large regions with higher input than -expected. - -`remove_anomalies`: bool; enable/disable the remove.tag.anomalies step. Defualt -is False (do not remove anomalies). Setting to True can increase the time -dramatically. - -`fdr`: float; false discovery rate when calling peaks. Default is `0.05`. - -`whs`: int. window half-size. Used if the auto-calculated -`binding.characteristics` is NA. Default is `500`. - -`zthr`: float. Z threshold used when adding broad regions. Default is `3`. - -`bandwidth`: int. Bandwith for smoothing WIG file. Default is `200`. - -`step`: int; step size for smoothing WIG file. Default is `100`. 
diff --git a/wrappers/wrappers/spp/environment.yaml b/wrappers/wrappers/spp/environment.yaml deleted file mode 100644 index 42dd8086..00000000 --- a/wrappers/wrappers/spp/environment.yaml +++ /dev/null @@ -1,11 +0,0 @@ -channels: - - conda-forge - - bioconda - - defaults - -dependencies: - - picard - - bedtools - - samtools - - r-spp - - r >=3.5.1 diff --git a/wrappers/wrappers/spp/wrapper.py b/wrappers/wrappers/spp/wrapper.py deleted file mode 100644 index 364c3ba1..00000000 --- a/wrappers/wrappers/spp/wrapper.py +++ /dev/null @@ -1,256 +0,0 @@ -from textwrap import dedent -import tempfile -from snakemake.shell import shell -log = snakemake.log_fmt_shell(append=True) - -# Since we'll be appending the output from multiple commands to the same log, -# we want to ensure that the provided log file is empty to start -if snakemake.log: - shell('cat /dev/null > {snakemake.log}') - -java_args = snakemake.params.get('java_args', '') -keep_tempfiles = snakemake.params.get('keep_tempfiles', False) - -registered_for_deletion = [ - snakemake.output.bed + '.tmp', - snakemake.output.bed + '.tmp.genome', -] - - -def merge_and_dedup(bams): - """ - spp only handles one replicate at a time. To support pooled samples, we - merge and remove duplicates, storing the result in a tempfile. 
- - If only one item is provided, return it immediately - """ - - if len(bams) == 1: - return bams - - merged = tempfile.NamedTemporaryFile(delete=False, prefix='merged', suffix='.bam').name - merged_and_deduped = tempfile.NamedTemporaryFile(delete=False, prefix='merged_and_duped', suffix='.bam').name - metrics = tempfile.NamedTemporaryFile(delete=False, prefix='metrics', suffix='.txt').name - - shell('echo "tempfiles created by merge_and_dedup: {merged} {merged_and_deduped} {metrics}" {log}') - - if not keep_tempfiles: - registered_for_deletion.extend([merged, merged_and_deduped, metrics]) - - bams = ' '.join(bams) - shell( - 'samtools merge ' - '-f ' - '-@ {snakemake.threads} ' - '{merged} ' - '{bams} ' - '{log} ' - ) - shell( - 'picard ' - '{java_args} ' - 'MarkDuplicates ' - 'INPUT={merged} ' - 'OUTPUT={merged_and_deduped} ' - 'METRICS_FILE={metrics} ' - 'REMOVE_DUPLICATES=true ' - '{log} ' - ) - return merged_and_deduped - - -def Rbool(x): - """ - Convert to R boolean string used to fill in a template - """ - if x: - return 'TRUE' - return 'FALSE' - - -# ---------------------------------------------------------------------------- -# DEFAULTS -# -extra = snakemake.params.block.get('extra', {}) - -DEFAULTS = { - # srange controls the range of lags over which to calculate cross-correlation - 'srange': (50, 500), - # bins controls how the binding characteristics will be binned - 'bins': 5, - # enable/disable the remove.tag.anomalies step - 'remove_anomalies': False, - # false discovery rate when calling peaks - 'fdr': 0.05, - # window half-size. Used if binding.characteristics is NA. - 'whs': 500, - # Z threshold used when adding broad regions. 
- 'zthr': 3, - # bandwith for smoothing WIG file - 'bandwidth': 200, - # step for smoothing WIG file - 'step': 100, - # Set to False to disable the filtering of large regions with high input signal - 'tecfilter': True, -} - -params = {} -for k, v in DEFAULTS.items(): - v = extra.get(k, v) - if isinstance(v, bool): - v = Rbool(v) - params[k] = v - -# ---------------------------------------------------------------------------- - -# R_template is incrementally built up so that we can intersperse comments and -# to keep things better organized. It will be filled in with `**locals()` at -# the end. - -ip = merge_and_dedup(snakemake.input.ip) -control = merge_and_dedup(snakemake.input.control) - - -R_template = """ -library(spp) -chip.data <- read.bam.tags("{ip}") -input.data <- read.bam.tags("{control}") -""" - - -# -R_template += """ -for (chrom in names(chip.data$tags)){{ - if (length(chip.data$tags[[chrom]]) < 10){{ - print(paste("Chromosome", chrom, "has <10 reads; removing from analysis")) - chip.data$tags[[chrom]] <- NULL - chip.data$quality[[chrom]] <- NULL - input.data$tags[[chrom]] <- NULL - input.data$quality[[chrom]] <- NULL - }} -}} -""" - -# Use configured srange and bins, if provided. `accept.all.tags=TRUE` is -# hard-coded since we were getting errors if FALSE. -R_template += """ -binding.characteristics <- get.binding.characteristics( - chip.data, - srange=c({params[srange][0]}, {params[srange][1]}), - bin={params[bins]}, - accept.all.tags=TRUE, - remove.tag.anomalies={params[remove_anomalies]} -) -""" - -R_template += """ -# Extract info from binding characteristics -tag.shift <- round(binding.characteristics$peak$x/2) -detection.window.halfsize <- binding.characteristics$whs -if (!is.finite(detection.window.halfsize)){{ - detection.window.halfsize <- {params[whs]} -}} -""" - -R_template += """ -# Reset data to tags, and remove any chromosomes with no data. 
-# (tags is a list, names are chromosomes and values are integer vectors) - -chip.data <- chip.data$tags -input.data <- input.data$tags - -chip.data[sapply(chip.data, is.null)] <- NULL -input.data[sapply(input.data, is.null)] <- NULL -""" - - -if 'smoothed_enrichment_mle' in snakemake.output.keys(): - R_template += dedent(""" - smoothed.enrichment.estimate <- get.smoothed.enrichment.mle( - chip.data, - input.data, - bandwidth={params[bandwidth]}, - step={params[step]}, - tag.shift=tag.shift) - writewig( - smoothed.enrichment.estimate, - "{snakemake.output.smoothed_enrichment_mle}", - feature="" - ) - """) - -if 'enrichment_estimates' in snakemake.output.keys(): - R_template += dedent(""" - enrichment.estimates <- get.conservative.fold.enrichment.profile( - chip.data, input.data, fws=500, step=100, alpha=0.01 - ) - writewig(enrichment.estimates, "{snakemake.output.enrichment_estimates}", feature="") - rm(enrichment.estimates) - """) - -R_template += """ -# Get peaks -bp <- find.binding.positions( - signal.data=chip.data, - control.data=input.data, - fdr={params[fdr]}, - whs=detection.window.halfsize, - tec.filter={params[tecfilter]} -) -""" - -R_template += """ -# Add broad regions to peaks -bp <- add.broad.peak.regions( - chip.data, - input.data, - bp, - window.size=detection.window.halfsize, - z.thr={params[zthr]} -) -write.narrowpeak.binding(bp, "{snakemake.output.bed}.tmp") -""" - -# Save image for later introspection or debugging -if 'rdata' in snakemake.output.keys(): - R_template += dedent(""" - save.image("{snakemake.output.rdata}") - """) - -# write the filled-in template to the output directory for later debugging -script_filename = snakemake.output.bed + '.R' -with open(script_filename, 'w') as fout: - fout.write(R_template.format(**locals())) - -# Run it -shell('Rscript {script_filename} {log}') - -# Fix the output file so that it doesn't have negative numbers and so it fits -# inside the genome -shell( - """awk -F "\\t" '{{OFS="\\t"; print $1, "0", 
$2}}' """ - "{snakemake.input.chromsizes} " - "> {snakemake.output.bed}.tmp.genome" -) -shell( - "sort -k1,1 -k2,2n {snakemake.output.bed}.tmp | " - """awk -F "\\t" '{{OFS="\\t"; if (($2>0) && ($3>0)) print $0}}' | """ - "bedtools intersect -a - -b {snakemake.output.bed}.tmp.genome > {snakemake.output.bed}" -) - -# SPP's writewig() adds a header and is space-separated, so this turns it into -# a proper bedGraph file ready for conversion to bigwig. -if 'enrichment_estimates' in snakemake.output.keys(): - shell('grep -v "track" {snakemake.output.enrichment_estimates} ' - '| sed "s/ /\\t/g" > {snakemake.output.enrichment_estimates}.tmp ' - '&& mv {snakemake.output.enrichment_estimates}.tmp ' - '{snakemake.output.enrichment_estimates}') - -if 'smoothed_enrichment_mle' in snakemake.output.keys(): - shell('grep -v "track" {snakemake.output.smoothed_enrichment_mle} ' - '| sed "s/ /\\t/g" > {snakemake.output.smoothed_enrichment_mle}.tmp ' - '&& mv {snakemake.output.smoothed_enrichment_mle}.tmp ' - '{snakemake.output.smoothed_enrichment_mle}') - -for fn in registered_for_deletion: - shell('rm -v {fn} {log}') From 9f0036654e0dbb7f1e0973d30995cd6d16399710 Mon Sep 17 00:00:00 2001 From: Ryan Dale Date: Mon, 20 Jan 2025 10:15:48 -0500 Subject: [PATCH 073/196] resources to strings --- workflows/chipseq/Snakefile | 97 +++++++++++++++++++------------------ 1 file changed, 51 insertions(+), 46 deletions(-) diff --git a/workflows/chipseq/Snakefile b/workflows/chipseq/Snakefile index 9c8a2f37..98152f5f 100644 --- a/workflows/chipseq/Snakefile +++ b/workflows/chipseq/Snakefile @@ -6,7 +6,6 @@ import pandas as pd sys.path.insert(0, os.path.dirname(workflow.snakefile) + "/../..") from lib import utils from lib import chipseq -from lib.utils import autobump, gb, hours configfile: "config/config.yaml" @@ -59,9 +58,9 @@ if utils.detect_sra(sampletable): is_paired=is_paired, # extra="-X 100000", # [enable for test] resources: - mem_mb=gb(1), - disk_mb=autobump(gb=1), - 
runtime=autobump(hours=2) + mem="1g", + disk="1g", + runtime="2h", run: srr = sampletable.loc[wildcards.sample, "Run"] extra = params.get("extra", "") @@ -85,8 +84,8 @@ rule symlinks: expand(patterns["fastq"], n=n, allow_missing=True), threads: 1 resources: - mem_mb=100, - runtime=10, + mem="1g", + runtime="10m", run: assert len(output) == len(input), (input, output) for src, linkname in zip(input, output): @@ -109,8 +108,8 @@ rule cutadapt: "data/chipseq_samples/{sample}/{sample}_cutadapt.fastq.gz.log", threads: 6 resources: - mem_mb=gb(2), - runtime=autobump(hours=2), + mem="2g", + runtime="2h", params: extra=( ( @@ -154,8 +153,8 @@ rule fastqc: html="{sample_dir}/{sample}/fastqc/{sample}{suffix}_fastqc.html", zip="{sample_dir}/{sample}/fastqc/{sample}{suffix}_fastqc.zip", resources: - mem_mb=gb(8), - runtime=autobump(hours=2), + mem="8g", + runtime="2h", log: "{sample_dir}/{sample}/fastqc/{sample}{suffix}_fastqc.log", run: @@ -198,8 +197,8 @@ rule bowtie2: patterns["bam"] + ".log", threads: 16 resources: - mem_mb=gb(32), - runtime=autobump(hours=2), + mem="32g", + runtime="2h", params: extra="", run: @@ -235,8 +234,8 @@ rule unique: patterns["unique"], threads: 1 resources: - mem_mb=gb(1), - runtime=autobump(hours=2), + mem="1g", + runtime="2h", params: # NOTE: the quality score chosen here should reflect the scores output # by the aligner used. 
For example, STAR uses 255 as max mapping @@ -253,8 +252,8 @@ rule fastq_count: "{sample_dir}/{sample}/{sample}{suffix}.fastq.gz.libsize", threads: 1 resources: - mem_mb=gb(1), - runtime=autobump(hours=2), + mem="1g", + runtime="2h", shell: "zcat {input} | echo $((`wc -l`/4)) > {output}" @@ -266,8 +265,8 @@ rule bam_count: "{sample_dir}/{sample}/{suffix}.bam.libsize", threads: 1 resources: - mem_mb=gb(2), - runtime=autobump(hours=2), + mem="2g", + runtime="2h", shell: "samtools view -c {input} > {output}" @@ -279,8 +278,8 @@ rule bam_index: bai="{prefix}.bam.bai", threads: 1 resources: - mem_mb=gb(2), - runtime=autobump(hours=2), + mem="2g", + runtime="2h", shell: "samtools index {input} {output}" @@ -295,9 +294,9 @@ rule markduplicates: patterns["markduplicates"]["bam"] + ".log", threads: 1 resources: - mem_mb=gb(32), - runtime=autobump(hours=2), - disk_mb=gb(100), + mem="32g", + disk="100g", + runtime="2h", params: java_args="-Xmx20g", # [disable for test] # java_args='-Xmx2g' # [enable for test] @@ -326,9 +325,9 @@ rule merge_techreps: patterns["merged_techreps"] + ".log", threads: 1 resources: - mem_mb=gb(32), - runtime=autobump(hours=2), - disk_mb=gb(100), + mem="32g", + disk="100g", + runtime="2h", params: java_args="-Xmx32g", # [disable for test] # java_args='-Xmx2g' # [enable for test] @@ -348,8 +347,8 @@ if is_paired: patterns["collectinsertsizemetrics"]["metrics"] + ".log", threads: 1 resources: - mem_mb=gb(32), - runtime=autobump(hours=2), + mem="32g", + runtime="2h", params: java_args="-Xmx20g", # [disable for test] # java_args='-Xmx2g' # [enable for test] @@ -373,8 +372,8 @@ rule bigwig: patterns["bigwig"] + ".log", threads: 1 resources: - mem_mb=gb(16), - runtime=autobump(hours=2), + mem="16g", + runtime="2h", shell: "bamCoverage " "--bam {input.bam} " @@ -416,8 +415,8 @@ rule fingerprint: patterns["fingerprint"]["metrics"] + ".log", threads: 1 resources: - mem_mb=gb(32), - runtime=autobump(hours=2), + mem="32g", + runtime="2h", run: if 
len(input.control) == 0: jsdsample_arg = "" @@ -461,8 +460,8 @@ rule macs2: output: bed=patterns["peaks"]["macs2"], resources: - mem_mb=gb(16), - runtime=autobump(hours=2), + mem="16g", + runtime="2h", log: patterns["peaks"]["macs2"] + ".log", params: @@ -496,8 +495,8 @@ rule epic2: output: bed=patterns["peaks"]["epic2"], resources: - mem_mb=gb(16), - runtime=autobump(hours=2), + mem="16g", + runtime="2h", log: patterns["peaks"]["epic2"] + ".log", params: @@ -517,8 +516,8 @@ rule bed_to_bigbed: output: "{prefix}.bigbed", resources: - mem_mb=gb(2), - runtime=autobump(hours=2), + mem="2g", + runtime="2h", log: "{prefix}.bigbed.log", script: @@ -536,8 +535,8 @@ rule multibigwigsummary: tab=patterns["multibigwigsummary"]["tab"], threads: 16 resources: - mem_mb=gb(16), - runtime=autobump(hours=2), + mem="16g", + runtime="2h", run: # from the input files, figure out the sample name. labels = " ".join([i.split("/")[-2] for i in input]) @@ -562,8 +561,8 @@ rule plotcorrelation: heatmap=patterns["plotcorrelation"]["heatmap"], tab=patterns["plotcorrelation"]["tab"], resources: - mem_mb=gb(2), - runtime=autobump(hours=2), + mem="2g", + runtime="2h", shell: "plotCorrelation " "--corData {input} " @@ -587,8 +586,8 @@ rule idxstats: output: txt=patterns["samtools"]["idxstats"], resources: - mem_mb=gb(16), - runtime=autobump(hours=2), + mem="16g", + runtime="2h", log: patterns["samtools"]["idxstats"] + ".log", shell: @@ -601,6 +600,9 @@ rule flagstat: bai=patterns["markduplicates"]["bam"] + ".bai", output: patterns["samtools"]["flagstat"], + resources: + mem="8g", + runtime="2h", log: patterns["samtools"]["flagstat"] + ".log", shell: @@ -613,6 +615,9 @@ rule samtools_stats: bai=patterns["markduplicates"]["bam"] + ".bai", output: patterns["samtools"]["stats"], + resources: + mem="8g", + runtime="2h", log: patterns["samtools"]["stats"] + ".log", shell: @@ -647,8 +652,8 @@ rule multiqc: patterns["multiqc"] + ".log", threads: 1 resources: - mem_mb=gb(2), - runtime=autobump(hours=2), 
+ mem="2g", + runtime="2h", run: analysis_directory = "data" outdir = os.path.dirname(output[0]) From 65d2e3ba9b4281fb029a57fbc8367629aa25c952 Mon Sep 17 00:00:00 2001 From: Ryan Dale Date: Mon, 20 Jan 2025 14:23:26 -0500 Subject: [PATCH 074/196] rm chipseq patterns --- workflows/chipseq/Snakefile | 179 +++++++++--------- .../chipseq/config/chipseq_patterns.yaml | 75 -------- 2 files changed, 91 insertions(+), 163 deletions(-) delete mode 100644 workflows/chipseq/config/chipseq_patterns.yaml diff --git a/workflows/chipseq/Snakefile b/workflows/chipseq/Snakefile index 98152f5f..4dabbb52 100644 --- a/workflows/chipseq/Snakefile +++ b/workflows/chipseq/Snakefile @@ -20,7 +20,7 @@ sampletable = sampletable.set_index(sampletable.columns[0], drop=False) is_paired = utils.detect_layout(sampletable) == "PE" n = ["1", "2"] if is_paired else ["1"] SAMPLES = sampletable.iloc[:, 0].values -patterns = yaml.safe_load(open("config/chipseq_patterns.yaml"))["patterns_by_sample"] +LABELS = sampletable.label.values peaks = chipseq.add_bams_to_peak_calling(config) @@ -36,8 +36,8 @@ localrules: rule targets: input: - patterns["multiqc"], - expand(patterns["bigwig"], label=sampletable.label), + "data/chipseq_aggregation/multiqc.html", + expand("data/chipseq_merged/{label}/{label}.cutadapt.unique.nodups.bam.bigwig", label=LABELS), [v["bed"] for k, v in peaks.items()], @@ -81,7 +81,8 @@ rule symlinks: else sampletable.loc[wc.sample, ["orig_filename"]] ), output: - expand(patterns["fastq"], n=n, allow_missing=True), + expand("data/chipseq_samples/{sample}/{sample}_R{n}.fastq.gz", n=n, + allow_missing=True), threads: 1 resources: mem="1g", @@ -95,15 +96,19 @@ rule symlinks: rule symlink_targets: input: expand( - "data/rnaseq_samples/{sample}/{sample}_R{n}.fastq.gz", sample=SAMPLES, n=n + "data/chipseq_samples/{sample}/{sample}_R{n}.fastq.gz", sample=SAMPLES, n=n ), rule cutadapt: input: - fastq=expand(patterns["fastq"], n=n, allow_missing=True), + fastq=expand( + 
"data/chipseq_samples/{sample}/{sample}_R{n}.fastq.gz", + n=n, allow_missing=True), output: - fastq=expand(patterns["cutadapt"], n=n, allow_missing=True), + fastq=expand( + "data/chipseq_samples/{sample}/{sample}_R{n}.cutadapt.fastq.gz", + n=n, allow_missing=True), log: "data/chipseq_samples/{sample}/{sample}_cutadapt.fastq.gz.log", threads: 6 @@ -180,7 +185,8 @@ rule fastqc: rule bowtie2: input: - fastq=expand(patterns["cutadapt"], n=n, allow_missing=True), + fastq=expand( + "data/chipseq_samples/{sample}/{sample}_R{n}.cutadapt.fastq.gz", n=n, allow_missing=True), index=multiext( f"{REFERENCES}/bowtie2/genome", ".1.bt2", @@ -192,9 +198,9 @@ rule bowtie2: ".fa", ), output: - bam=temporary(patterns["bam"]), + bam=temporary("data/chipseq_samples/{sample}/{sample}.cutadapt.bam"), log: - patterns["bam"] + ".log", + "data/chipseq_samples/{sample}/{sample}.cutadapt.bam.log", threads: 16 resources: mem="32g", @@ -229,9 +235,9 @@ rule bowtie2: rule unique: input: - patterns["bam"], + "data/chipseq_samples/{sample}/{sample}.cutadapt.bam", output: - patterns["unique"], + "data/chipseq_samples/{sample}/{sample}.cutadapt.unique.bam", threads: 1 resources: mem="1g", @@ -286,12 +292,12 @@ rule bam_index: rule markduplicates: input: - bam=patterns["unique"], + bam="data/chipseq_samples/{sample}/{sample}.cutadapt.unique.bam", output: - bam=patterns["markduplicates"]["bam"], - metrics=patterns["markduplicates"]["metrics"], + bam="data/chipseq_samples/{sample}/{sample}.cutadapt.unique.nodups.bam", + metrics="data/chipseq_samples/{sample}/{sample}.cutadapt.unique.nodups.bam.metrics" log: - patterns["markduplicates"]["bam"] + ".log", + "data/chipseq_samples/{sample}/{sample}.cutadapt.unique.nodups.bam.log" threads: 1 resources: mem="32g", @@ -315,14 +321,14 @@ rule markduplicates: rule merge_techreps: input: lambda wc: expand( - patterns["markduplicates"]["bam"], + "data/chipseq_samples/{sample}/{sample}.cutadapt.unique.nodups.bam", sample=utils.get_techreps(sampletable, wc.label), ), 
output: - bam=patterns["merged_techreps"], - metrics=patterns["merged_techreps"] + ".metrics", + bam="data/chipseq_merged/{label}/{label}.cutadapt.unique.nodups.merged.bam", + metrics="data/chipseq_merged/{label}/{label}.cutadapt.unique.nodups.merged.bam.metrics", log: - patterns["merged_techreps"] + ".log", + "data/chipseq_merged/{label}/{label}.cutadapt.unique.nodups.merged.bam.log" threads: 1 resources: mem="32g", @@ -339,12 +345,12 @@ if is_paired: rule collectinsertsizemetrics: input: - bam=patterns["markduplicates"]["bam"], + bam="data/chipseq_merged/{label}/{label}.cutadapt.unique.nodups.merged.bam", output: - pdf=patterns["collectinsertsizemetrics"]["pdf"], - metrics=patterns["collectinsertsizemetrics"]["metrics"], + pdf="data/chipseq_samples/{sample}/{sample}.cutadapt.unique.nodups.bam.collectinsertsizemetrics.pdf", + metrics="data/chipseq_samples/{sample}/{sample}.cutadapt.unique.nodups.bam.collectinsertsizemetrics.metrics", log: - patterns["collectinsertsizemetrics"]["metrics"] + ".log", + "data/chipseq_samples/{sample}/{sample}.cutadapt.unique.nodups.bam.collectinsertsizemetrics.metrics.log" threads: 1 resources: mem="32g", @@ -364,12 +370,12 @@ if is_paired: rule bigwig: input: - bam=patterns["merged_techreps"], - bai=patterns["merged_techreps"] + ".bai", + bam="data/chipseq_merged/{label}/{label}.cutadapt.unique.nodups.merged.bam", + bai="data/chipseq_merged/{label}/{label}.cutadapt.unique.nodups.merged.bam.bai", output: - patterns["bigwig"], + "data/chipseq_merged/{label}/{label}.cutadapt.unique.nodups.bam.bigwig", log: - patterns["bigwig"] + ".log", + "data/chipseq_merged/{label}/{label}.cutadapt.unique.nodups.bam.bigwig.log", threads: 1 resources: mem="16g", @@ -382,7 +388,7 @@ rule bigwig: "--minMappingQuality 20 " "--ignoreDuplicates " # Can't use the CPM normalization for testing due to <1000 reads total - # in example data; keep uncommented when running in production + # in example data "--normalizeUsing CPM " # [disable for test] 
"--extendReads 300 " "&> {log}" @@ -396,23 +402,25 @@ rule fingerprint: Note: uses the merged techreps. """ input: - bams=lambda wc: expand(patterns["merged_techreps"], label=wc.ip_label), + bams=lambda wc: expand("data/chipseq_merged/{label}/{label}.cutadapt.unique.nodups.merged.bam", label=wc.ip_label), control=lambda wc: expand( - patterns["merged_techreps"], + "data/chipseq_merged/{label}/{label}.cutadapt.unique.nodups.merged.bam", label=chipseq.merged_input_for_ip(sampletable, wc.ip_label), ), - bais=lambda wc: expand(patterns["merged_techreps"] + ".bai", label=wc.ip_label), + bais=lambda wc: expand( + "data/chipseq_merged/{label}/{label}.cutadapt.unique.nodups.merged.bam.bai", + label=wc.ip_label), control_bais=lambda wc: expand( - patterns["merged_techreps"] + ".bai", + "data/chipseq_merged/{label}/{label}.cutadapt.unique.nodups.merged.bam.bai", label=chipseq.merged_input_for_ip(sampletable, wc.ip_label), ), output: - plot=patterns["fingerprint"]["plot"], - raw_counts=patterns["fingerprint"]["raw_counts"], - metrics=patterns["fingerprint"]["metrics"], + plot="data/chipseq_aggregation/fingerprints/{ip_label}/{ip_label}_fingerprint.png", + raw_counts="data/chipseq_aggregation/fingerprints/{ip_label}/{ip_label}_fingerprint.tab", + metrics="data/chipseq_aggregation/fingerprints/{ip_label}/{ip_label}_fingerprint.metrics", threads: 8 log: - patterns["fingerprint"]["metrics"] + ".log", + "data/chipseq_aggregation/fingerprints/{ip_label}/{ip_label}_fingerprint.metrics.log", threads: 1 resources: mem="32g", @@ -444,26 +452,23 @@ rule fingerprint: rule macs2: - """ - Run the macs2 peak caller - """ input: ip=lambda wc: expand( - patterns["merged_techreps"], + "data/chipseq_merged/{label}/{label}.cutadapt.unique.nodups.merged.bam", label=chipseq.samples_for_run(config, wc.macs2_run, "macs2", "ip"), ), control=lambda wc: expand( - patterns["merged_techreps"], + "data/chipseq_merged/{label}/{label}.cutadapt.unique.nodups.merged.bam", label=chipseq.samples_for_run(config, 
wc.macs2_run, "macs2", "control"), ), chromsizes=rules.chromsizes.output, output: - bed=patterns["peaks"]["macs2"], + bed="data/chipseq_peaks/macs2/{macs2_run}/peaks.bed", resources: mem="16g", runtime="2h", log: - patterns["peaks"]["macs2"] + ".log", + "data/chipseq_peaks/macs2/{macs2_run}/peaks.bed.log", params: block=lambda wc: chipseq.block_for_run(config, wc.macs2_run, "macs2"), script: @@ -471,34 +476,31 @@ rule macs2: rule epic2: - """ - Run the epic2 peak caller - """ input: ip=lambda wc: expand( - patterns["merged_techreps"], + "data/chipseq_merged/{label}/{label}.cutadapt.unique.nodups.merged.bam", label=chipseq.samples_for_run(config, wc.epic2_run, "epic2", "ip"), ), control=lambda wc: expand( - patterns["merged_techreps"], + "data/chipseq_merged/{label}/{label}.cutadapt.unique.nodups.merged.bam", label=chipseq.samples_for_run(config, wc.epic2_run, "epic2", "control"), ), bai=lambda wc: expand( - patterns["merged_techreps"] + ".bai", + "data/chipseq_merged/{label}/{label}.cutadapt.unique.nodups.merged.bam.bai", label=chipseq.samples_for_run(config, wc.epic2_run, "epic2", "ip"), ) + expand( - patterns["merged_techreps"] + ".bai", + "data/chipseq_merged/{label}/{label}.cutadapt.unique.nodups.merged.bam.bai", label=chipseq.samples_for_run(config, wc.epic2_run, "epic2", "control"), ), chromsizes=rules.chromsizes.output, output: - bed=patterns["peaks"]["epic2"], + bed="data/chipseq_peaks/epic2/{epic2_run}/peaks.bed", resources: mem="16g", runtime="2h", log: - patterns["peaks"]["epic2"] + ".log", + "data/chipseq_peaks/epic2/{epic2_run}/peaks.bed.log" params: block=lambda wc: chipseq.block_for_run(config, wc.epic2_run, "epic2"), is_paired=is_paired, @@ -529,10 +531,10 @@ rule multibigwigsummary: Summarize the bigWigs across genomic bins """ input: - expand(patterns["bigwig"], label=sampletable.label), + expand("data/chipseq_merged/{label}/{label}.cutadapt.unique.nodups.bam.bigwig", label=sampletable.label), output: - npz=patterns["multibigwigsummary"]["npz"], - 
tab=patterns["multibigwigsummary"]["tab"], + npz="data/chipseq_aggregation/deeptools/multibigwigsummary_matrix.npz", + tab="data/chipseq_aggregation/deeptools/multibigwigsummary.tab", threads: 16 resources: mem="16g", @@ -556,10 +558,10 @@ rule plotcorrelation: Plot a heatmap of correlations across all samples """ input: - patterns["multibigwigsummary"]["npz"], + npz="data/chipseq_aggregation/deeptools/multibigwigsummary_matrix.npz", output: - heatmap=patterns["plotcorrelation"]["heatmap"], - tab=patterns["plotcorrelation"]["tab"], + tab="data/chipseq_aggregation/deeptools/plotcorrelation.tab", + heatmap="data/chipseq_aggregation/deeptools/correlation_heatmap.png", resources: mem="2g", runtime="2h", @@ -581,75 +583,76 @@ rule plotcorrelation: rule idxstats: input: - bam=patterns["markduplicates"]["bam"], - bai=patterns["markduplicates"]["bam"] + ".bai", + bam="data/chipseq_samples/{sample}/{sample}.cutadapt.unique.nodups.bam", + bai="data/chipseq_samples/{sample}/{sample}.cutadapt.unique.nodups.bam.bai", output: - txt=patterns["samtools"]["idxstats"], + txt="data/chipseq_samples/{sample}/samtools_idxstats_{sample}.txt", resources: mem="16g", runtime="2h", log: - patterns["samtools"]["idxstats"] + ".log", + "data/chipseq_samples/{sample}/samtools_idxstats_{sample}.txt.log" shell: "samtools idxstats {input.bam} 2> {log} 1> {output.txt}" rule flagstat: input: - bam=patterns["markduplicates"]["bam"], - bai=patterns["markduplicates"]["bam"] + ".bai", + bam="data/chipseq_samples/{sample}/{sample}.cutadapt.unique.nodups.bam", + bai="data/chipseq_samples/{sample}/{sample}.cutadapt.unique.nodups.bam.bai", output: - patterns["samtools"]["flagstat"], + "data/chipseq_samples/{sample}/samtools_flagstat_{sample}.txt", resources: mem="8g", runtime="2h", log: - patterns["samtools"]["flagstat"] + ".log", + "data/chipseq_samples/{sample}/samtools_flagstat_{sample}.txt.log" shell: "samtools flagstat {input.bam} > {output}" rule samtools_stats: input: - 
bam=patterns["markduplicates"]["bam"], - bai=patterns["markduplicates"]["bam"] + ".bai", + bam="data/chipseq_samples/{sample}/{sample}.cutadapt.unique.nodups.bam", + bai="data/chipseq_samples/{sample}/{sample}.cutadapt.unique.nodups.bam.bai", output: - patterns["samtools"]["stats"], + "data/chipseq_samples/{sample}/samtools_stats_{sample}.txt", resources: mem="8g", runtime="2h", log: - patterns["samtools"]["stats"] + ".log", + "data/chipseq_samples/{sample}/samtools_stats_{sample}.txt.log" shell: "samtools stats {input.bam} > {output}" rule multiqc: input: - expand(patterns["bam"], sample=SAMPLES), - expand(patterns["fastqc"]["raw"], sample=SAMPLES), - expand(patterns["fastqc"]["cutadapt"], sample=SAMPLES), - expand(patterns["fastqc"]["bam"], sample=SAMPLES), - expand(patterns["bigwig"], label=sampletable.label), - expand(patterns["samtools"]["idxstats"], sample=SAMPLES), - expand(patterns["samtools"]["flagstat"], sample=SAMPLES), - expand(patterns["samtools"]["stats"], sample=SAMPLES), - expand(patterns["merged_techreps"], label=sampletable.label), + expand("data/chipseq_samples/{sample}/{sample}.cutadapt.bam", sample=SAMPLES), + expand("data/chipseq_samples/{sample}/fastqc/{sample}_R1.fastq.gz_fastqc.zip", sample=SAMPLES), + expand("data/chipseq_samples/{sample}/fastqc/{sample}_R1.cutadapt.fastq.gz_fastqc.zip", sample=SAMPLES), + expand("data/chipseq_samples/{sample}/fastqc/{sample}.cutadapt.unique.nodups.bam_fastqc.zip", sample=SAMPLES), + expand("data/chipseq_merged/{label}/{label}.cutadapt.unique.nodups.bam.bigwig", label=sampletable.label), + expand("data/chipseq_samples/{sample}/samtools_stats_{sample}.txt", sample=SAMPLES), + expand("data/chipseq_samples/{sample}/samtools_flagstat_{sample}.txt", sample=SAMPLES), + expand("data/chipseq_samples/{sample}/samtools_idxstats_{sample}.txt", sample=SAMPLES), + expand("data/chipseq_merged/{label}/{label}.cutadapt.unique.nodups.merged.bam", label=sampletable.label), expand( - patterns["fingerprint"]["metrics"], + 
"data/chipseq_aggregation/fingerprints/{ip_label}/{ip_label}_fingerprint.metrics", ip_label=sampletable.loc[sampletable.antibody != "input", "label"], ), - expand(patterns["collectinsertsizemetrics"], sample=SAMPLES) - if is_paired - else [], + expand( + "data/chipseq_samples/{sample}/{sample}.cutadapt.unique.nodups.bam.collectinsertsizemetrics.metrics", + sample=SAMPLES + ) if is_paired else [], [v["bigbed"] for v in peaks.values()], - patterns["multibigwigsummary"]["tab"], - patterns["plotcorrelation"]["tab"], + "data/chipseq_aggregation/deeptools/plotcorrelation.tab", + "data/chipseq_aggregation/deeptools/multibigwigsummary.tab", config="config/multiqc_config.yaml", output: - patterns["multiqc"], + "data/chipseq_aggregation/multiqc.html", log: - patterns["multiqc"] + ".log", + "data/chipseq_aggregation/multiqc.html.log", threads: 1 resources: mem="2g", diff --git a/workflows/chipseq/config/chipseq_patterns.yaml b/workflows/chipseq/config/chipseq_patterns.yaml deleted file mode 100644 index 90b511c9..00000000 --- a/workflows/chipseq/config/chipseq_patterns.yaml +++ /dev/null @@ -1,75 +0,0 @@ -patterns_by_sample: - - fastq: 'data/chipseq_samples/{sample}/{sample}_R{n}.fastq.gz' - cutadapt: 'data/chipseq_samples/{sample}/{sample}_R{n}.cutadapt.fastq.gz' - bam: 'data/chipseq_samples/{sample}/{sample}.cutadapt.bam' - - fastqc: - raw: 'data/chipseq_samples/{sample}/fastqc/{sample}_R1.fastq.gz_fastqc.zip' - cutadapt: 'data/chipseq_samples/{sample}/fastqc/{sample}_R1.cutadapt.fastq.gz_fastqc.zip' - bam: 'data/chipseq_samples/{sample}/fastqc/{sample}.cutadapt.unique.nodups.bam_fastqc.zip' - - libsizes: - fastq: 'data/chipseq_samples/{sample}/{sample}_R1.fastq.gz.libsize' - cutadapt: 'data/chipseq_samples/{sample}/{sample}_R1.cutadapt.fastq.gz.libsize' - bam: 'data/chipseq_samples/{sample}/{sample}.cutadapt.bam.libsize' - unique: 'data/chipseq_samples/{sample}/{sample}.cutadapt.unique.bam.libsize' - nodups: 
'data/chipseq_samples/{sample}/{sample}.cutadapt.unique.nodups.bam.libsize' - - fastq_screen: 'data/chipseq_samples/{sample}/{sample}.cutadapt.screen.txt' - libsizes_table: 'data/chipseq_aggregation/libsizes_table.tsv' - libsizes_yaml: 'data/chipseq_aggregation/libsizes_table_mqc.yaml' - multiqc: 'data/chipseq_aggregation/multiqc.html' - unique: 'data/chipseq_samples/{sample}/{sample}.cutadapt.unique.bam' - - markduplicates: - bam: 'data/chipseq_samples/{sample}/{sample}.cutadapt.unique.nodups.bam' - bai: 'data/chipseq_samples/{sample}/{sample}.cutadapt.unique.nodups.bam.bai' - metrics: 'data/chipseq_samples/{sample}/{sample}.cutadapt.unique.nodups.bam.metrics' - - merged_techreps: 'data/chipseq_merged/{label}/{label}.cutadapt.unique.nodups.merged.bam' - - bigwig: 'data/chipseq_merged/{label}/{label}.cutadapt.unique.nodups.bam.bigwig' - - fingerprint: - plot: 'data/chipseq_aggregation/fingerprints/{ip_label}/{ip_label}_fingerprint.png' - raw_counts: 'data/chipseq_aggregation/fingerprints/{ip_label}/{ip_label}_fingerprint.tab' - metrics: 'data/chipseq_aggregation/fingerprints/{ip_label}/{ip_label}_fingerprint.metrics' - - multibigwigsummary: - npz: 'data/chipseq_aggregation/deeptools/multibigwigsummary_matrix.npz' - tab: 'data/chipseq_aggregation/deeptools/multibigwigsummary.tab' - - plotcorrelation: - tab: 'data/chipseq_aggregation/deeptools/plotcorrelation.tab' - heatmap: 'data/chipseq_aggregation/deeptools/correlation_heatmap.png' - - collectinsertsizemetrics: - pdf: 'data/chipseq_samples/{sample}/{sample}.cutadapt.unique.nodups.bam.collectinsertsizemetrics.pdf' - metrics: 'data/chipseq_samples/{sample}/{sample}.cutadapt.unique.nodups.bam.collectinsertsizemetrics.metrics' - - samtools: - idxstats: 'data/rnaseq_samples/{sample}/samtools_idxstats_{sample}.txt' - flagstat: 'data/rnaseq_samples/{sample}/samtools_flagstat_{sample}.txt' - stats: 'data/rnaseq_samples/{sample}/samtools_stats_{sample}.txt' - - peaks: - macs2: 
'data/chipseq_peaks/macs2/{macs2_run}/peaks.bed' - spp: 'data/chipseq_peaks/spp/{spp_run}/peaks.bed' - sicer: 'data/chipseq_peaks/sicer/{sicer_run}/peaks.bed' - epic2: 'data/chipseq_peaks/epic2/{epic2_run}/peaks.bed' - -patterns_by_peaks: - peaks: - macs2: 'data/chipseq_peaks/macs2/{macs2_run}/peaks.bed' - spp: 'data/chipseq_peaks/spp/{spp_run}/peaks.bed' - sicer: 'data/chipseq_peaks/sicer/{sicer_run}/peaks.bed' - epic2: 'data/chipseq_peaks/epic2/{epic2_run}/peaks.bed' - bigbed: - macs2: 'data/chipseq_peaks/macs2/{macs2_run}/peaks.bigbed' - spp: 'data/chipseq_peaks/spp/{spp_run}/peaks.bigbed' - sicer: 'data/chipseq_peaks/sicer/{sicer_run}/peaks.bigbed' - epic2: 'data/chipseq_peaks/epic2/{epic2_run}/peaks.bigbed' - -patterns_by_aggregate: - merged_bigwig: 'data/chipseq_aggregation/merged_bigwigs/{merged_bigwig_label}.bigwig' From 3b57a27e1288c841c11b5c7f256beef257df8dd9 Mon Sep 17 00:00:00 2001 From: Ryan Dale Date: Mon, 20 Jan 2025 14:37:00 -0500 Subject: [PATCH 075/196] update chipseq_trackhub.py --- workflows/chipseq/chipseq_trackhub.py | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/workflows/chipseq/chipseq_trackhub.py b/workflows/chipseq/chipseq_trackhub.py index e2bf9ecb..d069b015 100644 --- a/workflows/chipseq/chipseq_trackhub.py +++ b/workflows/chipseq/chipseq_trackhub.py @@ -25,7 +25,6 @@ from trackhub.upload import upload_hub, stage_hub from lib import chipseq -from lib.patterns_targets import ChIPSeqConfig ap = argparse.ArgumentParser() ap.add_argument('config', help='Main config.yaml file') @@ -53,8 +52,6 @@ genome=hub_config['hub']['genome'] ) -c = ChIPSeqConfig(config, os.path.join(os.path.dirname(args.config), 'chipseq_patterns.yaml')) - # Set up subgroups based on unique values from columns specified in the config df = pandas.read_csv(config['sampletable'], comment='#', sep='\t') cols = hub_config['subgroups']['columns'] @@ -82,8 +79,7 @@ SubGroupDefinition( name='algorithm', label='algorithm', mapping={ 'macs2': 'macs2', 
- 'spp': 'spp', - 'sicer': 'sicer', + 'epic2': 'epic2', 'NA': 'NA', })) @@ -146,8 +142,7 @@ def decide_color(samplename): for label in df['label'].unique(): - # ASSUMPTION: bigwig filename pattern - bigwig = c.patterns['bigwig'].format(label=label) + bigwig = f"data/chipseq_merged/{label}/{label}.cutadapt.unique.nodups.bam.bigwig" subgroup = df[df.loc[:, 'label'] == label].to_dict('records')[0] subgroup = { From 4e86e1668141691a0b20fde3e02a90518d4dc9d0 Mon Sep 17 00:00:00 2001 From: Ryan Dale Date: Mon, 20 Jan 2025 14:41:00 -0500 Subject: [PATCH 076/196] update rnaseq_trackhub.py --- workflows/rnaseq/rnaseq_trackhub.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/workflows/rnaseq/rnaseq_trackhub.py b/workflows/rnaseq/rnaseq_trackhub.py index 91273574..6fe17f80 100644 --- a/workflows/rnaseq/rnaseq_trackhub.py +++ b/workflows/rnaseq/rnaseq_trackhub.py @@ -9,8 +9,6 @@ """ import os -import sys -sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', '..')) import re from pprint import pprint import pandas @@ -22,8 +20,6 @@ from trackhub.upload import upload_hub, stage_hub import argparse -from lib.patterns_targets import RNASeqConfig - ap = argparse.ArgumentParser() ap.add_argument('config', help='Main config.yaml file') ap.add_argument('hub_config', help='Track hub config YAML file') @@ -41,7 +37,6 @@ for cfg in args.additional_configs: update_config(config, yaml.load(open(cfg), Loader=yaml.FullLoader)) -c = RNASeqConfig(config, os.path.join(os.path.dirname(args.config), 'rnaseq_patterns.yaml')) hub, genomes_file, genome, trackdb = default_hub( hub_name=hub_config['hub']['name'], @@ -126,7 +121,7 @@ def decide_color(samplename): for direction in 'pos', 'neg': # ASSUMPTION: bigwig filename pattern - bigwig = c.patterns['bigwig'][direction].format(sample=sample) + bigwig = f"data/rnaseq_samples/{sample}/{sample}.cutadapt.bam.{direction}.bigwig" subgroup = df[df.iloc[:, 0] == sample].to_dict('records')[0] subgroup = { From 
69376c919ce561dfa3d7ba90d4d34aa068173d82 Mon Sep 17 00:00:00 2001 From: Ryan Dale <115406+daler@users.noreply.github.com> Date: Sat, 29 Mar 2025 14:27:32 +0000 Subject: [PATCH 077/196] rm colocalization workflow --- workflows/colocalization/Snakefile | 230 --------------- workflows/colocalization/config/config.yaml | 8 - workflows/colocalization/run_test.sh | 3 - .../scripts/colocalization_heatmap.py | 267 ------------------ .../colocalization/scripts/heatmap_env.yaml | 7 - 5 files changed, 515 deletions(-) delete mode 100644 workflows/colocalization/Snakefile delete mode 100644 workflows/colocalization/config/config.yaml delete mode 100755 workflows/colocalization/run_test.sh delete mode 100644 workflows/colocalization/scripts/colocalization_heatmap.py delete mode 100644 workflows/colocalization/scripts/heatmap_env.yaml diff --git a/workflows/colocalization/Snakefile b/workflows/colocalization/Snakefile deleted file mode 100644 index cb5a7991..00000000 --- a/workflows/colocalization/Snakefile +++ /dev/null @@ -1,230 +0,0 @@ -import sys -sys.path.insert(0, srcdir('../..')) -import os -from textwrap import dedent -import yaml -import tempfile -import pandas as pd -from lib import helpers, aligners -from lib import utils -from lib import common -from lib.patterns_targets import RNASeqConfig, ChIPSeqConfig -import os -from snakemake.utils import makedirs -import pandas -import yaml -import numpy as np - -configfile: 'config/config.yaml' - -chipseq_config = ChIPSeqConfig('config/config.yaml', 'config/chipseq_patterns.yaml', workdir='../chipseq') - -subworkflow chipseq: - configfile: chipseq_config.path - workdir: '../chipseq' - -subworkflow references: - configfile: chipseq_config.path - workdir: '../chipseq' - -subworkflow external: - workdir: '../external' - -chipseq_refdict, chipseq_args = common.references_dict(chipseq_config.config) - -# The rule to create the chromsizes file is in the references workflow; the -# path to it can be determined from the config file 
(though it is awkwardly -# nested) -chromsizes = references( - chipseq_refdict[ - chipseq_config.config['organism'] - ][ - chipseq_config.config['aligner']['tag'] - ]['chromsizes'] -) - -# In the existing config file, we assume that all BED files are from the -# `external` workflow. - -for k, v in config['beds'].items(): - config['beds'][k] = external(v) - -# If ADD_CHIPSEQ_PEAKS is True, we will addn all the called peaks to the bed -# files to check for colocalization. -ADD_CHIPSEQ_PEAKS = True -# ADD_CHIPSEQ_PEAKS = False # [TEST SETTINGS -1] - -if ADD_CHIPSEQ_PEAKS: - peaks = chipseq(utils.flatten(chipseq_config.targets['peaks'])) - for fn in peaks: - toks = fn.split('/') - peakcaller = toks[-3] - label = toks[-2] - key = peakcaller + '_' + label - config['beds'][key] = fn - - -targets = expand( - '{outdir}/{algorithm}/{domain}/{query}/{query}_vs_{reference}.txt', - outdir=config['output'], - domain=config['domains'].keys(), - query=config['beds'].keys(), - reference=config['beds'].keys(), - algorithm=['IntervalStats', 'jaccard', 'fisher'], -) - -# Currently-supported options {algorithm: (possible values)} -# IntervalStats: (f_05, f_01, f_001) -# jaccard: (jaccard) -# fisher: (pval) -pattern = '{outdir}/{algorithm}/{domain}/{value}_heatmap.pdf' -targets += expand(pattern, outdir=config['output'], domain=config['domains'], - algorithm='IntervalStats', value=['f_01']) -targets += expand(pattern, outdir=config['output'], domain=config['domains'], - algorithm='jaccard', value=['jaccard']) -targets += expand(pattern, outdir=config['output'], domain=config['domains'], - algorithm='fisher', value=['pval']) - -rule targets: - input: targets - - -rule sorted_chromsizes: - input: chromsizes - output: os.path.join(config['output'], config['organism'] + '.sorted.chromsizes') - shell: - 'sort -k1,1 {input} > {output}' - -rule chromsizes_bed: - input: rules.sorted_chromsizes.output - output: os.path.join(config['output'], config['organism'] + '.bed') - shell: - """awk 
'{{OFS="\\t"; print $1,"0",$2}}' {input} > {output}""" - - -rule jaccard: - input: - domain=lambda wc: config['domains'][getattr(wc, 'domain')], - query=lambda wc: config['beds'][getattr(wc, 'query')], - reference=lambda wc: config['beds'][getattr(wc, 'reference')], - chromsizes=rules.sorted_chromsizes.output - output: '{outdir}/jaccard/{domain}/{query}/{query}_vs_{reference}.txt' - shell: - """ - bedtools intersect -a {input.query} -b {input.domain} | sort -k1,1 -k2n > {output}.query.jaccard - bedtools intersect -a {input.reference} -b {input.domain} | sort -k1,1 -k2n > {output}.reference.jaccard - bedtools jaccard -a {output}.query.jaccard -b {output}.reference.jaccard -g {input.chromsizes} > {output} - rm {output}.query.jaccard {output}.reference.jaccard - """ - - -rule fisher: - input: - domain=lambda wc: config['domains'][getattr(wc, 'domain')], - query=lambda wc: config['beds'][getattr(wc, 'query')], - reference=lambda wc: config['beds'][getattr(wc, 'reference')], - chromsizes=rules.sorted_chromsizes.output - output: '{outdir}/fisher/{domain}/{query}/{query}_vs_{reference}.txt' - shell: - """ - bedtools intersect -a {input.query} -b {input.domain} | sort -k1,1 -k2n > {output}.query.fisher - bedtools intersect -a {input.reference} -b {input.domain} | sort -k1,1 -k2n > {output}.reference.fisher - bedtools fisher -a {output}.query.fisher -b {output}.reference.fisher -g {input.chromsizes} > {output} - rm {output}.query.fisher {output}.reference.fisher - """ - - -rule intervalstats: - input: - domain=lambda wc: config['domains'][getattr(wc, 'domain')], - query=lambda wc: config['beds'][getattr(wc, 'query')], - reference=lambda wc: config['beds'][getattr(wc, 'reference')], - output: '{outdir}/IntervalStats/{domain}/{query}/{query}_vs_{reference}.txt' - log: '{outdir}/IntervalStats/{domain}/{query}/{query}_vs_{reference}.log' - run: - if input.query == input.reference: - run_self = '--self' - else: - run_self = '' - shell( - 'IntervalStats ' - '--query {input.query} 
' - '--reference {input.reference} ' - '--output {output}.full ' - '--domain {input.domain} ' - '{run_self} &> {log}' - ) - - # Summarize the output into a faster-to-parse file used by downstream - # analysis code. - # - # Output has columns: - # - # - n_{05,01,001}: number of significant associations at {0.05, 0.01, - # 0.001} respectively - # - # - f_{05,01,001}: fraction of total that are signficant - # - # - n: number of features - # - # - query, reference: labels - # - # - filename: "all" filename containing the details in case anything - # needs re-calculation. - _df = pandas.read_table( - str(output[0]) + '.full', - names=['query', 'closest_ref', 'length', 'distance', - 'numerator', 'denominator', 'pval']) - - n = float(len(_df)) - - def frac(x): - if n == 0: - return np.nan - return x / n - - n_05 = sum(_df.pval < 0.05) - n_01 = sum(_df.pval < 0.01) - n_001 = sum(_df.pval < 0.001) - f_05 = frac(n_05) - f_01 = frac(n_01) - f_001 = frac(n_001) - - df = pandas.DataFrame( - [ - dict( - query=wildcards.query, - filename=str(output[0]) + '.full', - reference=wildcards.reference, - n=float(n), - n_05=n_05, - n_01=n_01, - n_001=n_001, - f_05=f_05, - f_01=f_01, - f_001=f_001, - ) - ] - ) - df.to_csv(str(output[0]), sep='\t', index=False) - - -rule heatmap: - input: - expand( - '{{outdir}}/{{algorithm}}/{{domain}}/{query}/{query}_vs_{reference}.txt', - query=list(config['beds'].keys()), - reference=list(config['beds'].keys()) - ) - output: - '{outdir}/{algorithm}/{domain}/{value}_heatmap.pdf' - - shell: - 'python scripts/colocalization_heatmap.py ' - '--domain {wildcards.domain} ' - '--algorithm {wildcards.algorithm} ' - '--value {wildcards.value} ' - '--outdir {config[output]} ' - '--output {output}' - -# vim: ft=python diff --git a/workflows/colocalization/config/config.yaml b/workflows/colocalization/config/config.yaml deleted file mode 100644 index 40734704..00000000 --- a/workflows/colocalization/config/config.yaml +++ /dev/null @@ -1,8 +0,0 @@ -beds: - # from 
the external workflow - SuHw_Kc: data/suhw_kc.bed - CTCF_Kc: data/ctcf_kc.bed -domains: - dm6: results/dm6.bed -output: results -organism: dm6 diff --git a/workflows/colocalization/run_test.sh b/workflows/colocalization/run_test.sh deleted file mode 100755 index 7aacb413..00000000 --- a/workflows/colocalization/run_test.sh +++ /dev/null @@ -1,3 +0,0 @@ -set -e -python -m doctest ../../ci/preprocessor.py -python ../../ci/preprocessor.py Snakefile > Snakefile.test && snakemake -s Snakefile.test "$@" diff --git a/workflows/colocalization/scripts/colocalization_heatmap.py b/workflows/colocalization/scripts/colocalization_heatmap.py deleted file mode 100644 index b337fbb4..00000000 --- a/workflows/colocalization/scripts/colocalization_heatmap.py +++ /dev/null @@ -1,267 +0,0 @@ -import matplotlib -matplotlib.use('agg') -import os -import glob -import pandas as pd -import numpy as np -import seaborn as sns -from scipy.spatial import distance -from scipy.cluster import hierarchy -from matplotlib import pyplot as plt -import argparse - -ap = argparse.ArgumentParser() -ap.add_argument('--domain') -ap.add_argument('--algorithm') -ap.add_argument('--value') -ap.add_argument('--outdir') -ap.add_argument('--output') -args = ap.parse_args() - -domain = args.domain -algorithm = args.algorithm -value = args.value -outdir = args.outdir -output = args.output - - -def dataframe_for_domain(domain, algorithm): - """ - Read all files within a directory and build the dataframe. - - Empty files are listed as NaNs in the dataframe. 
- """ - df = [] - files = glob.glob(os.path.join(outdir, algorithm, domain, '*', '*.txt')) - for filename in files: - query, reference = os.path.basename(filename).replace('.txt', '').split('_vs_') - try: - _df = pd.read_csv(filename, comment='#', sep='\t') - except pd.errors.EmptyDataError: - _df = pd.DataFrame([dict(value=np.nan)]) - - _df['query'] = query - _df['reference'] = reference - df.append( - _df.iloc[0].to_dict() - ) - return pd.DataFrame(df) - - -# Cluster methods -METRIC = 'correlation' -METHOD = 'average' - - -def dataframe_for_value(domain, algorithm, value): - - df = dataframe_for_domain(domain, algorithm) - - vmin, vmax = None, None - - # For IntervalStats, Use the "fraction of intervals with p<0.01" as the - # value. - # - # These are all positive values. NaNs are set to 0, and the diagonal is - # set to 1.0 (i.e., 100% of intervals are significant with respect to - # each other) - if algorithm == 'IntervalStats': - piv = df.pivot(index='query', columns='reference', values=value) - fill_piv = piv.fillna(0) - vmax = fill_piv.max().max() - np.fill_diagonal(fill_piv.values, 1) - units = 'fraction pvals < 0.%s' % (value.split('_')[-1]) - title = 'IntervalStats' - - # For GAT log2foldchange, set anything with qval > 0.05 to - # logfoldchange = 0. Diagonal is filled with 0 (log2foldchange of 1). - # NaNs are also set to 0. - elif algorithm == 'GAT' and value == 'l2fold': - piv = df.pivot(index='query', columns='reference', values='l2fold') - - # used for checking - mask = df.pivot(index='query', columns='reference', - values='qvalue') - title = 'GAT foldchange' - piv[mask > 0.05] = 0 - piv = piv.fillna(0) - fill_piv = piv - np.fill_diagonal(fill_piv.values, 0) - units = 'log2fold' - - # For GAT fractions, we set the upper and lower triangles of the matrix - # to the "track" and "annotation" overlaps in GAT terminology. We also - # get a significance value here (qval) so we set the fraction overlap - # to zero for anything with qval > 0.05. 
- elif algorithm == 'GAT' and value == 'fractions': - segment_frac = df.pivot(index='query', columns='reference', - values='percent_overlap_size_track') - annotation_frac = df.pivot(index='query', columns='reference', - values='percent_overlap_size_annotation') - mask = df.pivot(index='query', columns='reference', values='qvalue') - piv = segment_frac - lower_tri_mask = np.ones(piv.shape, dtype='bool') - lower_tri_mask[np.tril_indices(len(piv))] = False - piv[lower_tri_mask] = annotation_frac[lower_tri_mask] - piv[mask > 0.05] = 0 - fill = 0 - fill_piv = piv - units = 'percentage overlap' - title = 'GAT percentage nucleotide overlap' - - # For fisher, we want to plot the -log10(two-tail pval). - # - # So we keep track of the ratio, flip pvals where ratio <1, and replace - # inf and -inf with the otherwise max and min values respectively. NaNs - # are given a -log10(pval) = 0 (so a pval of 1.0). - elif algorithm == 'fisher' and value == 'pval': - piv = df.pivot(index='query', columns='reference', - values='two-tail') - mask_left = df.pivot(index='query', columns='reference', - values='left') - mask_right = df.pivot(index='query', columns='reference', - values='right') - mask_ratio = df.pivot(index='query', columns='reference', - values='ratio') - flip = mask_ratio < 1 - piv = -np.log10(piv) - piv[flip] *= -1 - mx = piv.replace([np.inf], 0).max().max() - mn = piv.replace([-np.inf], 0).min().min() - piv = piv.replace([np.inf], mx) - piv = piv.replace([-np.inf], mn) - fill_piv = piv.fillna(0) - units = '-log10(pval)' - title = 'Fisher' - - #################################################### - # TODO: also plot fisher ratio - #################################################### - - # For jaccard, we plot the value directly. While the value can range - # [0, 1], in practice we rarely find such good overlap. 
- elif algorithm == 'jaccard' and value == 'jaccard': - piv = df.pivot(index='query', columns='reference', values='jaccard') - fill_piv = piv - units = 'Jaccard statistic' - vmin, vmax = (0, .3) - title = 'Jaccard' - - return dict( - fill_piv=fill_piv, - vmin=vmin, - vmax=vmax, - units=units, - title=title - ) - - -def plot_heatmap(fill_piv, vmin, vmax, title, units, metric='euclidean', - method='average', idx=None, clustermap_kwargs=dict()): - """ - Plot a clustered heatmap of the provided values. Rows are clustered - identically as columns so that the diagonal represents the self-self - comparisons. - - Parameters - ---------- - - fill_piv : pandas.DataFrame - A prepared dataframe where rownames == colnames and where -inf, inf, - and NaN have been filled in with finite values. - - vmin, vmax : float - Colormap limits. NOT CURRENTLY USED. - - title : str - Title for plot - - units : str - Units to use in colorbar - - metric : str - Clustering metric. See `scipy.distance` for available options. - - method : clustering method - Hierarchical clustering linkage method. See `scipy.hierarchy` for - available options. - - idx : None or index - If not None, then this index is used to subset `fill_piv`. - - clustermap_kwargs : dict - Additional arguments passed to seaborn.clustermap. - """ - - - fill_piv = fill_piv.astype(float) - # subset if requested - if idx is not None: - fill_piv = fill_piv.loc[idx, idx] - - # Distance matrix, setting NaN to zero if necessary - dist = distance.pdist(fill_piv.values, metric=metric) - dist[np.isnan(dist)] = 0 - dist[dist < 0] = 0 - - # ward actually uses values directly rather than using the distance matrix. - if method == 'ward': - vals = fill_piv.values - else: - vals = dist - - # Here we compute the row linkage and provide that to sns.clustermap as - # both row and column linkages so that the same clustering is used. This - # gets us the self-self colocalization on the diagonal. 
- row_linkage = hierarchy.linkage(vals, method=method) - - # catch and fix errors in dendrogram before sending to clustermap - mx = row_linkage[np.isfinite(row_linkage)].max() - mn = row_linkage[np.isfinite(row_linkage)].min() - # row_linkage[np.isinf(row_linkage)] = mx - # scipy.clip(row_linkage, 0, mx, row_linkage) - ind = hierarchy.dendrogram(row_linkage, no_plot=True)['leaves'] - - - a = sns.clustermap(fill_piv, row_linkage=row_linkage, - col_linkage=row_linkage, **clustermap_kwargs) - - # Fix labels - for txt in a.ax_heatmap.get_xticklabels(): - txt.set_rotation(90) - for txt in a.ax_heatmap.get_yticklabels(): - txt.set_rotation(0) - - # Use the provided units to label the colorbar - a.cax.set_ylabel(units) - - # Add figure-level title and tweak margins. - fig = plt.gcf() - fig.suptitle(title, weight='bold', size=20) - fig.subplots_adjust(right=0.8, bottom=0.2) - return a - - -v = dataframe_for_value(domain, algorithm, value) - -if (v['fill_piv'] < 0).values.any() & (v['fill_piv'] > 0).values.any(): - center = 0 - cmap = 'RdBu_r' -else: - center = None - cmap = sns.cubehelix_palette(as_cmap=True) - - -fig = plot_heatmap( - fill_piv=v['fill_piv'], - vmin=v['vmin'], - vmax=v['vmax'], - title=v['title'], - units=v['units'], - metric='euclidean', - method='average', - idx=None, - clustermap_kwargs=dict(center=center, cmap=cmap) -) - -fig.savefig(output) diff --git a/workflows/colocalization/scripts/heatmap_env.yaml b/workflows/colocalization/scripts/heatmap_env.yaml deleted file mode 100644 index 668a0d76..00000000 --- a/workflows/colocalization/scripts/heatmap_env.yaml +++ /dev/null @@ -1,7 +0,0 @@ -channels: - - conda-forge -dependencies: - - matplotlib - - pandas - - seaborn - - scipy From 1e50d55a40b3e8446b81d8fca0a1f620b4050c17 Mon Sep 17 00:00:00 2001 From: Ryan Dale <115406+daler@users.noreply.github.com> Date: Sat, 29 Mar 2025 14:40:37 +0000 Subject: [PATCH 078/196] rm references and colocalization tests --- .circleci/config.yml | 39 
--------------------------------------- 1 file changed, 39 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 16e5b5f0..66de1446 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -140,7 +140,6 @@ variables: cp $ORIG/workflows/rnaseq/run_test.sh $DEPLOY/workflows/rnaseq/run_test.sh cp $ORIG/workflows/rnaseq/run_downstream_test.sh $DEPLOY/workflows/rnaseq/run_downstream_test.sh cp $ORIG/workflows/references/run_test.sh $DEPLOY/workflows/references/run_test.sh - # cp $ORIG/workflows/colocalization/run_test.sh $DEPLOY/workflows/colocalization/run_test.sh mkdir $DEPLOY/ci mkdir $DEPLOY/test @@ -271,17 +270,6 @@ variables: - # -------------------------------------------------------------------------- - # Standard colocalization workflow - colocalization-step: &colocalization-step - run: - name: colocalization workflow - command: | - cd $DEPLOY/workflows/colocalization - source /opt/miniforge/etc/profile.d/conda.sh - conda activate $LCDBWF_ENV - $DEPLOY/test/lcdb-wf-test colocalization --run-workflow -k -p -j2 --use-conda --orig $ORIG - # -------------------------------------------------------------------------- # Syntax note: All of the steps above, with their "&step-name" labels, can be # referred to by a corresponding "*step-name" below. 
The "<<: *defaults" @@ -389,23 +377,6 @@ jobs: - *get-data - *rnaseq-misc-step - # colocalization: - # <<: *defaults - # steps: - # - checkout - # - *restore_cache - # - *set-path - # - *get-data - # - *colocalization-step - - # references: - # <<: *defaults - # steps: - # - checkout - # - *restore_cache - # - *set-path - # - *get-data - # - *references-step build-docs: <<: *defaults @@ -479,14 +450,6 @@ workflows: requires: - initial-setup - pytest - # - references: - # requires: - # - initial-setup - # - pytest - # - colocalization: - # requires: - # - initial-setup - # - pytest - build-docs: requires: - initial-setup @@ -496,5 +459,3 @@ workflows: - rnaseq-misc - chipseq - chipseq-misc - - references - # - colocalization From f37e666fee4e5e979dec6306fc856ec5a2473c9f Mon Sep 17 00:00:00 2001 From: Ryan Dale <115406+daler@users.noreply.github.com> Date: Wed, 2 Apr 2025 01:57:10 +0000 Subject: [PATCH 079/196] rm rnaseq_patterns --- workflows/rnaseq/config/rnaseq_patterns.yaml | 51 -------------------- 1 file changed, 51 deletions(-) delete mode 100644 workflows/rnaseq/config/rnaseq_patterns.yaml diff --git a/workflows/rnaseq/config/rnaseq_patterns.yaml b/workflows/rnaseq/config/rnaseq_patterns.yaml deleted file mode 100644 index 35681125..00000000 --- a/workflows/rnaseq/config/rnaseq_patterns.yaml +++ /dev/null @@ -1,51 +0,0 @@ -strand_check: - fastq: 'strand_check/{sample}/{sample}_R{n}.strandedness.fastq' - bam: 'strand_check/{sample}/{sample}.strandedness.bam' - tsv: 'strand_check/{sample}/{sample}.strandedness' -fastq: 'data/rnaseq_samples/{sample}/{sample}_R{n}.fastq.gz' -sra_fastq: 'original_data/sra_samples/{sample}/{sample}_R{n}.fastq.gz' -cutadapt: 'data/rnaseq_samples/{sample}/{sample}_R{n}.cutadapt.fastq.gz' -bam: 'data/rnaseq_samples/{sample}/{sample}.cutadapt.bam' -fastqc: - raw: 'data/rnaseq_samples/{sample}/fastqc/{sample}_R1.fastq.gz_fastqc.zip' - cutadapt: 'data/rnaseq_samples/{sample}/fastqc/{sample}_R1.cutadapt.fastq.gz_fastqc.zip' - bam: 
'data/rnaseq_samples/{sample}/fastqc/{sample}.cutadapt.bam_fastqc.zip' -libsizes: - fastq: 'data/rnaseq_samples/{sample}/{sample}_R1.fastq.gz.libsize' - cutadapt: 'data/rnaseq_samples/{sample}/{sample}_R1.cutadapt.fastq.gz.libsize' - bam: 'data/rnaseq_samples/{sample}/{sample}.cutadapt.bam.libsize' -fastq_screen: 'data/rnaseq_samples/{sample}/{sample}.cutadapt.screen.txt' -featurecounts: - per_sample: 'data/rnaseq_samples/{sample}/{sample}_featurecounts.txt' - aggregated: 'data/rnaseq_aggregation/featurecounts.txt' -libsizes_table: 'data/rnaseq_aggregation/libsizes_table.tsv' -libsizes_yaml: 'data/rnaseq_aggregation/libsizes_table_mqc.yaml' -rrna_percentages_table: 'data/rnaseq_aggregation/rrna_percentages_table.tsv' -rrna_percentages_yaml: 'data/rnaseq_aggregation/rrna_percentages_table_mqc.yaml' -rrna: - bam: 'data/rnaseq_samples/{sample}/rRNA/{sample}.cutadapt.rrna.bam' - libsize: 'data/rnaseq_samples/{sample}/rRNA/{sample}.cutadapt.rrna.bam.libsize' -multiqc: 'data/rnaseq_aggregation/multiqc.html' -markduplicates: - bam: 'data/rnaseq_samples/{sample}/{sample}.cutadapt.markdups.bam' - bai: 'data/rnaseq_samples/{sample}/{sample}.cutadapt.markdups.bam.bai' - metrics: 'data/rnaseq_samples/{sample}/{sample}.cutadapt.markdups.bam.metrics' -collectrnaseqmetrics: - metrics: 'data/rnaseq_samples/{sample}/{sample}.collectrnaseqmetrics.metrics' -preseq: 'data/rnaseq_samples/{sample}/{sample}_preseq_c_curve.txt' -salmon: 'data/rnaseq_samples/{sample}/{sample}.salmon/quant.sf' -kallisto: 'data/rnaseq_samples/{sample}/{sample}.kallisto/abundance.h5' -rseqc: - infer_experiment: 'data/rnaseq_samples/{sample}/rseqc/{sample}_infer_experiment.txt' - read_distribution: 'data/rnaseq_samples/{sample}/rseqc/{sample}_read_distribution.txt' -bigwig: - pos: 'data/rnaseq_samples/{sample}/{sample}.cutadapt.bam.pos.bigwig' - neg: 'data/rnaseq_samples/{sample}/{sample}.cutadapt.bam.neg.bigwig' -downstream: - rnaseq: 'downstream/rnaseq.html' -patterns_by_aggregate: - merged_bigwig: 
'data/rnaseq_aggregation/merged_bigwigs/{merged_bigwig_label}.bigwig' -samtools: - idxstats: 'data/rnaseq_samples/{sample}/idxstat_{sample}.txt' - flagstat: 'data/rnaseq_samples/{sample}/{sample}.cutadapt.markdups.bam.flagstat' - stats: 'data/rnaseq_samples/{sample}/{sample}.cutadapt.markdups.bam.stats' From 6ab28076a7fc4724e6477586fdd836da5dd76b47 Mon Sep 17 00:00:00 2001 From: Ryan Dale <115406+daler@users.noreply.github.com> Date: Wed, 2 Apr 2025 01:57:47 +0000 Subject: [PATCH 080/196] move sra rule to separate file --- workflows/rnaseq/Snakefile | 43 ++------------------------------------ workflows/rnaseq/sra.smk | 40 +++++++++++++++++++++++++++++++++++ 2 files changed, 42 insertions(+), 41 deletions(-) create mode 100644 workflows/rnaseq/sra.smk diff --git a/workflows/rnaseq/Snakefile b/workflows/rnaseq/Snakefile index 7247bbc2..470afbb8 100644 --- a/workflows/rnaseq/Snakefile +++ b/workflows/rnaseq/Snakefile @@ -36,47 +36,8 @@ rule all: include: "../references/Snakefile" -if utils.detect_sra(sampletable): - sampletable["orig_filename"] = expand( - "original_data/sra_samples/{sample}/{sample}_R{n}.fastq.gz", sample=SAMPLES, n=1 - ) - - if is_paired: - sampletable["orig_filename_R2"] = expand( - "original_data/sra_samples/{sample}/{sample}_R{n}.fastq.gz", - sample=SAMPLES, - n=2, - ) - - rule fastq_dump: - output: - fastq=expand( - "original_data/sra_samples/{sample}/{sample}_R{n}.fastq.gz", - n=n, - allow_missing=True, - ), - log: - "original_data/sra_samples/{sample}/{sample}.fastq.gz.log", - params: - is_paired=is_paired, - # extra="-X 100000", # [enable for test] - resources: - mem="1g", - disk="1g", - runtime="2h", - run: - srr = sampletable.loc[wildcards.sample, "Run"] - extra = params.get("extra", "") - if is_paired: - shell("fastq-dump {srr} --gzip --split-files {extra} &> {log}") - shell("mv {srr}_1.fastq.gz {output[0]}") - shell("mv {srr}_2.fastq.gz {output[1]}") - else: - shell( - "fastq-dump {srr} -Z {extra} 2> {log} | gzip -c > {output[0]}.tmp" - ) 
- shell("mv {output[0]}.tmp {output[0]}") - +# If the sampletable is from SRA, handle it here. +include: "sra.smk" rule symlinks: diff --git a/workflows/rnaseq/sra.smk b/workflows/rnaseq/sra.smk new file mode 100644 index 00000000..5ee5f53b --- /dev/null +++ b/workflows/rnaseq/sra.smk @@ -0,0 +1,40 @@ +if utils.detect_sra(sampletable): + sampletable["orig_filename"] = expand( + "original_data/sra_samples/{sample}/{sample}_R{n}.fastq.gz", sample=SAMPLES, n=1 + ) + + if is_paired: + sampletable["orig_filename_R2"] = expand( + "original_data/sra_samples/{sample}/{sample}_R{n}.fastq.gz", + sample=SAMPLES, + n=2, + ) + + rule fastq_dump: + output: + fastq=expand( + "original_data/sra_samples/{sample}/{sample}_R{n}.fastq.gz", + n=n, + allow_missing=True, + ), + log: + "original_data/sra_samples/{sample}/{sample}.fastq.gz.log", + params: + is_paired=is_paired, + # extra="-X 100000", # [enable for test] + resources: + mem="1g", + disk="1g", + runtime="2h", + run: + srr = sampletable.loc[wildcards.sample, "Run"] + extra = params.get("extra", "") + if is_paired: + shell("fastq-dump {srr} --gzip --split-files {extra} &> {log}") + shell("mv {srr}_1.fastq.gz {output[0]}") + shell("mv {srr}_2.fastq.gz {output[1]}") + else: + shell( + "fastq-dump {srr} -Z {extra} 2> {log} | gzip -c > {output[0]}.tmp" + ) + shell("mv {output[0]}.tmp {output[0]}") From 091e21538d03e6c6800f67cc8fdf121e30c9c22d Mon Sep 17 00:00:00 2001 From: Ryan Dale <115406+daler@users.noreply.github.com> Date: Wed, 2 Apr 2025 01:58:12 +0000 Subject: [PATCH 081/196] move strand check to separate file --- workflows/rnaseq/Snakefile | 81 ++----------------------------- workflows/rnaseq/strand_check.smk | 75 ++++++++++++++++++++++++++++ 2 files changed, 78 insertions(+), 78 deletions(-) create mode 100644 workflows/rnaseq/strand_check.smk diff --git a/workflows/rnaseq/Snakefile b/workflows/rnaseq/Snakefile index 470afbb8..91c036d4 100644 --- a/workflows/rnaseq/Snakefile +++ b/workflows/rnaseq/Snakefile @@ -35,6 +35,9 
@@ rule all: include: "../references/Snakefile" +# Optionally run `snakemake strand_check` to do a preliminary run on +# automatically-subset data to evaluate strandedness. +include: "strand_check.smk" # If the sampletable is from SRA, handle it here. include: "sra.smk" @@ -66,84 +69,6 @@ rule symlink_targets: ), -# Optionally run ``snakemake strand_check`` to do a preliminary run on -# automatically-subset data to evaluate strandedness. -rule sample_strand_check: - input: - fastq=expand("data/rnaseq_samples/{{sample}}/{{sample}}_R{n}.fastq.gz", n=n), - index=expand(rules.bowtie2_index.output, label="genome"), - bed12=rules.conversion_bed12.output, - output: - strandedness="strand_check/{sample}/{sample}.strandedness", - bam=temporary("strand_check/{sample}/{sample}.strandedness.bam"), - bai=temporary("strand_check/{sample}/{sample}.strandedness.bam.bai"), - fastqs=temporary( - expand( - "strand_check/{sample}/{sample}_R{n}.strandedness.fastq", - n=n, - allow_missing=True, - ) - ), - log: - "strand_check/{sample}/{sample}.strandedness.log", - threads: 6 - resources: - mem="8g", - runtime="2h", - run: - prefix = os.path.commonprefix(input.index).rstrip(".") - nreads = int(1e5 * 4) - if is_paired: - shell( - "set +o pipefail; zcat {input.fastq[0]} | head -n {nreads} > {output.fastqs[0]}" - ) - shell( - "set +o pipefail; zcat {input.fastq[0]} | head -n {nreads} > {output.fastqs[1]}" - ) - fastqs = f"-1 {output.fastqs[0]} -2 {output.fastqs[1]} " - else: - shell( - "set +o pipefail; zcat {input.fastq[0]} | head -n {nreads} > {output.fastqs[0]}" - ) - fastqs = f"-U {output.fastqs[0]} " - shell( - "bowtie2 " - "-x {prefix} " - "{fastqs} " - "--no-unal " - "--threads {threads} 2> {log} " - "| samtools view -Sb - " - "| samtools sort - -o {output.bam} " - ) - shell("samtools index {output.bam}") - shell( - "infer_experiment.py -r {input.bed12} -i {output.bam} > {output} 2> {log}" - ) - - -rule strand_check: - input: - expand("strand_check/{sample}/{sample}.strandedness", 
sample=SAMPLES), - output: - html="strand_check/strandedness.html", - filelist=temporary("strand_check/filelist"), - log: - "strand_check/strandedness.log", - resources: - mem="1g", - runtime="2h", - run: - with open(output.filelist, "w") as fout: - for i in input: - fout.write(i + "\n") - shell( - "multiqc " - "--force " - "--module rseqc " - "--file-list {output.filelist} " - "--filename {output.html} &> {log}" - ) - rule cutadapt: input: diff --git a/workflows/rnaseq/strand_check.smk b/workflows/rnaseq/strand_check.smk new file mode 100644 index 00000000..9c8a3467 --- /dev/null +++ b/workflows/rnaseq/strand_check.smk @@ -0,0 +1,75 @@ +rule sample_strand_check: + input: + fastq=expand("data/rnaseq_samples/{{sample}}/{{sample}}_R{n}.fastq.gz", n=n), + index=expand(rules.bowtie2_index.output, label="genome"), + bed12=rules.conversion_bed12.output, + output: + strandedness="strand_check/{sample}/{sample}.strandedness", + bam=temporary("strand_check/{sample}/{sample}.strandedness.bam"), + bai=temporary("strand_check/{sample}/{sample}.strandedness.bam.bai"), + fastqs=temporary( + expand( + "strand_check/{sample}/{sample}_R{n}.strandedness.fastq", + n=n, + allow_missing=True, + ) + ), + log: + "strand_check/{sample}/{sample}.strandedness.log", + threads: 6 + resources: + mem="8g", + runtime="2h", + run: + prefix = os.path.commonprefix(input.index).rstrip(".") + nreads = int(1e5 * 4) + if is_paired: + shell( + "set +o pipefail; zcat {input.fastq[0]} | head -n {nreads} > {output.fastqs[0]}" + ) + shell( + "set +o pipefail; zcat {input.fastq[0]} | head -n {nreads} > {output.fastqs[1]}" + ) + fastqs = f"-1 {output.fastqs[0]} -2 {output.fastqs[1]} " + else: + shell( + "set +o pipefail; zcat {input.fastq[0]} | head -n {nreads} > {output.fastqs[0]}" + ) + fastqs = f"-U {output.fastqs[0]} " + shell( + "bowtie2 " + "-x {prefix} " + "{fastqs} " + "--no-unal " + "--threads {threads} 2> {log} " + "| samtools view -Sb - " + "| samtools sort - -o {output.bam} " + ) + shell("samtools 
index {output.bam}") + shell( + "infer_experiment.py -r {input.bed12} -i {output.bam} > {output} 2> {log}" + ) + + +rule strand_check: + input: + expand("strand_check/{sample}/{sample}.strandedness", sample=SAMPLES), + output: + html="strand_check/strandedness.html", + filelist=temporary("strand_check/filelist"), + log: + "strand_check/strandedness.log", + resources: + mem="1g", + runtime="2h", + run: + with open(output.filelist, "w") as fout: + for i in input: + fout.write(i + "\n") + shell( + "multiqc " + "--force " + "--module rseqc " + "--file-list {output.filelist} " + "--filename {output.html} &> {log}" + ) From baf15c923953d5d443c48fc7dec7a07b3afe6a0f Mon Sep 17 00:00:00 2001 From: Ryan Dale <115406+daler@users.noreply.github.com> Date: Wed, 2 Apr 2025 01:58:28 +0000 Subject: [PATCH 082/196] move params that don't depend on config back into rules more params inside rules more params in rule --- workflows/chipseq/Snakefile | 26 ++++++---------- workflows/rnaseq/Snakefile | 60 +++++++++++++------------------------ 2 files changed, 30 insertions(+), 56 deletions(-) diff --git a/workflows/chipseq/Snakefile b/workflows/chipseq/Snakefile index 4dabbb52..5fbdbe1c 100644 --- a/workflows/chipseq/Snakefile +++ b/workflows/chipseq/Snakefile @@ -115,18 +115,6 @@ rule cutadapt: resources: mem="2g", runtime="2h", - params: - extra=( - ( - "--nextseq-trim 20 " - "--overlap 6 " - "--minimum-length 25 " - "-a AGATCGGAAGAGCACACGTCTGAACTCCAGTCA " - ) - + "-A AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGT " - if is_paired - else "" - ), run: if is_paired: shell( @@ -134,7 +122,11 @@ rule cutadapt: "-o {output[0]} " "-p {output[1]} " "-j {threads} " - "{params.extra} " + "--nextseq-trim 20 " + "--overlap 6 " + "--minimum-length 25 " + "-a AGATCGGAAGAGCACACGTCTGAACTCCAGTCA " + "-A AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGT " "{input.fastq[0]} " "{input.fastq[1]} " "&> {log}" @@ -144,7 +136,10 @@ rule cutadapt: "cutadapt " "-o {output[0]} " "-j {threads} " - "{params.extra} " + "--nextseq-trim 20 " + 
"--overlap 6 " + "--minimum-length 25 " + "-a AGATCGGAAGAGCACACGTCTGAACTCCAGTCA " "{input.fastq[0]} " "&> {log}" ) @@ -205,8 +200,6 @@ rule bowtie2: resources: mem="32g", runtime="2h", - params: - extra="", run: prefix = os.path.commonprefix(input.index).rstrip(".") sam = output.bam.replace(".bam", ".sam") @@ -222,7 +215,6 @@ rule bowtie2: "--no-unal " "--threads {threads} " "-S {sam} " - "{params.extra} " "> {log} 2>&1" ) diff --git a/workflows/rnaseq/Snakefile b/workflows/rnaseq/Snakefile index 91c036d4..d8239552 100644 --- a/workflows/rnaseq/Snakefile +++ b/workflows/rnaseq/Snakefile @@ -83,18 +83,6 @@ rule cutadapt: resources: mem="2g", runtime="2h", - params: - extra=( - ( - "--nextseq-trim 20 " - "--overlap 6 " - "--minimum-length 25 " - "-a AGATCGGAAGAGCACACGTCTGAACTCCAGTCA " - ) - + "-A AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGT " - if is_paired - else "" - ), run: if is_paired: shell( @@ -102,6 +90,11 @@ rule cutadapt: "-o {output[0]} " "-p {output[1]} " "-j {threads} " + "--nextseq-trim 20 " + "--overlap 6 " + "--minimum-length 25 " + "-a AGATCGGAAGAGCACACGTCTGAACTCCAGTCA " + "-A AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGT " "{params.extra} " "{input.fastq[0]} " "{input.fastq[1]} " @@ -112,6 +105,10 @@ rule cutadapt: "cutadapt " "-o {output[0]} " "-j {threads} " + "--nextseq-trim 20 " + "--overlap 6 " + "--minimum-length 25 " + "-a AGATCGGAAGAGCACACGTCTGAACTCCAGTCA " "{params.extra} " "{input.fastq[0]} " "&> {log}" @@ -390,21 +387,16 @@ rule rRNA: resources: mem="2g", runtime="2h", - params: - extra=( - "-k 1 " - "--no-unal " - ), run: prefix = os.path.commonprefix(input.index).rstrip(".") sam = output.bam.replace(".bam", ".sam") - shell( "bowtie2 " "-x {prefix} " "-U {input.fastq} " "--threads {threads} " - "{params.extra} " + "-k 1 " + "--no-unal " "-S {sam} " "> {log} 2>&1" ) @@ -630,13 +622,6 @@ rule salmon: resources: mem="32g", runtime="2h", - params: - extra=( - "--libType=A " - "--gcBias " - "--seqBias " - "--validateMappings " - ), run: outdir = 
os.path.dirname(output[0]) index_dir = os.path.dirname(input.index) @@ -649,7 +634,10 @@ rule salmon: "--index {index_dir} " "--output {outdir} " "--threads {threads} " - "{params.extra} " + "--libType=A " + "--gcBias " + "--seqBias " + "--validateMappings " "{fastq_arg} " "&> {log}" ) @@ -787,19 +775,16 @@ rule bigwig_neg: "fr-firststrand": "--filterRNAstrand reverse ", "fr-secondstrand": "--filterRNAstrand forward ", }[config["stranded"]], - extra=( - "--minMappingQuality 20 " - "--smoothLength 10 " - "--normalizeUsing BPM " # [disable for test] - ), run: shell( "bamCoverage " "--bam {input.bam} " "-o {output} " "-p {threads} " - "{params.extra} " "{params.strand_arg} " + "--minMappingQuality 20 " + "--smoothLength 10 " + "--normalizeUsing BPM " # [disable for test] "&> {log}" ) @@ -822,18 +807,15 @@ rule bigwig_pos: "fr-firststrand": "--filterRNAstrand forward ", "fr-secondstrand": "--filterRNAstrand reverse ", }[config["stranded"]], - extra=( - "--minMappingQuality 20 " - "--smoothLength 10 " - "--normalizeUsing BPM " # [disable for test] - ), run: shell( "bamCoverage " "--bam {input.bam} " "-o {output} " "-p {threads} " - "{params.extra} " + "--minMappingQuality 20 " + "--smoothLength 10 " + "--normalizeUsing BPM " # [disable for test] "{params.strand_arg} " "&> {log}" ) From 26d0834ca263054311927ab1ef4c3d6daefc5c2a Mon Sep 17 00:00:00 2001 From: Ryan Dale <115406+daler@users.noreply.github.com> Date: Wed, 2 Apr 2025 01:58:59 +0000 Subject: [PATCH 083/196] support only star 1-pass mode --- workflows/references/Snakefile | 29 --- workflows/rnaseq/Snakefile | 276 +++++++--------------------- workflows/rnaseq/config/config.yaml | 2 - 3 files changed, 62 insertions(+), 245 deletions(-) diff --git a/workflows/references/Snakefile b/workflows/references/Snakefile index 682f1bfe..6ee892f8 100644 --- a/workflows/references/Snakefile +++ b/workflows/references/Snakefile @@ -140,35 +140,6 @@ rule star_index: shell("ln -s {input.fasta} {genomedir}") -rule hisat2_index: - 
input: - f"{REFERENCES}/genome.fa", - output: - multiext( - f"{REFERENCES}/hisat2/genome", - ".1.ht2", - ".2.ht2", - ".3.ht2", - ".4.ht2", - ".5.ht2", - ".6.ht2", - ".7.ht2", - ".8.ht2", - ".fa", - ), - log: - f"{REFERENCES}/logs/hisat2.log", - resources: - mem="32g", - disk="50g", - runtime="8h", - threads: 8 - run: - index = os.path.commonprefix(output).rstrip(".") - shell("hisat2-build" " --threads {threads}" " {input}" " {index}" " &> {log}") - shell("ln -s {input} {output[-1]}") - - rule transcriptome_fasta: input: fasta=f"{REFERENCES}/genome.fa", diff --git a/workflows/rnaseq/Snakefile b/workflows/rnaseq/Snakefile index d8239552..dd736780 100644 --- a/workflows/rnaseq/Snakefile +++ b/workflows/rnaseq/Snakefile @@ -148,222 +148,70 @@ rule fastqc: shell("mv {out_html} {output.html}") -if config["aligner"] == "hisat2": - - rule hisat2: - input: - fastq=rules.cutadapt.output, - index=rules.hisat2_index.output, - output: - bam=temporary("data/rnaseq_samples/{sample}/{sample}.cutadapt.bam"), - log: - "data/rnaseq_samples/{sample}/{sample}.cutadapt.bam.log", - threads: 16 - resources: - mem="32g", - runtime="8h", - params: - extra="", - run: - prefix = os.path.commonprefix(input.index).rstrip(".") - sam = output.bam.replace(".bam", ".sam") - - if is_paired: - assert len(input.fastq) == 2 - fastqs = "-1 {0} -2 {1} ".format(*input.fastq) - else: - assert len(input.fastq) == 1 - fastqs = "-U {0} ".format(input.fastq) - shell( - "hisat2 " - "-x {prefix} " - "{fastqs} " - "--no-unal " - "--threads {threads} " - "-S {sam} " - "{params.extra} " - "> {log} 2>&1" - ) - - shell( - "samtools view -Sb {sam} " - "| samtools sort - -o {output.bam} -O BAM " - "&& rm {sam}" - ) - - - -if config["aligner"].startswith("star"): - if os.getenv("TMPDIR"): - tmpdir_arg = "--outTmpDir $TMPDIR/star " - else: - tmpdir_arg = "" - # STAR can be run in 1-pass or 2-pass modes. 
Since we may be running it - # more than once in almost the same way, we pull out the shell command here - # and use it below. - STAR_CMD = ( - "STAR " - "--runThreadN {threads} " - "--genomeDir {genomedir} " - "--readFilesIn {input.fastq} " - "--readFilesCommand zcat " - "--outFileNamePrefix {prefix} " - "{tmpdir_arg} " - "{params.extra} " - ) - STAR_PARAMS = ( - # NOTE: The STAR docs indicate that the following parameters are - # standard options for ENCODE long-RNA-seq pipeline. Comments are from - # the STAR docs. - "--outFilterType BySJout " # reduces number of spurious junctions - "--outFilterMultimapNmax 20 " # if more than this many multimappers, consider unmapped - "--alignSJoverhangMin 8 " # min overhang for unannotated junctions - "--alignSJDBoverhangMin 1 " # min overhang for annotated junctions - "--outFilterMismatchNmax 999 " # max mismatches per pair - "--outFilterMismatchNoverReadLmax 0.04 " # max mismatches per pair relative to read length - "--alignIntronMin 20 " # min intron length - "--alignIntronMax 1000000 " # max intron length - "--alignMatesGapMax 1000000 " # max distance between mates - "--outSAMunmapped None " # do not report aligned reads in output - ) - logfile_extensions = ["Log.progress.out", "Log.out", "Log.final.out", "Log.std.out"] - -if config["aligner"] == "star": - - rule star: - "Align with STAR (1-pass mode)" - input: - fastq=rules.cutadapt.output, - index=rules.star_index.output, - annotation=f"{REFERENCES}/annotation.gtf", - output: - bam=temporary("data/rnaseq_samples/{sample}/{sample}.cutadapt.bam"), - sjout=temporary( - "data/rnaseq_samples/{sample}/{sample}.cutadapt.star.SJ.out.tab" - ), - log: - "data/rnaseq_samples/{sample}/{sample}.cutadapt.bam.log", - threads: 16 - resources: - mem="64g", - runtime="8h", - disk="80g", - params: - extra=STAR_PARAMS, - run: - genomedir = os.path.dirname(input.index[0]) - outdir = os.path.dirname(output[0]) - prefix = output.bam.replace(".bam", ".star.") - shell( - STAR_CMD - + ( - 
"--outSAMtype BAM SortedByCoordinate " - "--outStd BAM_SortedByCoordinate > {output.bam} " - "2> {log} " - ) - ) - - # move various hard-coded log files to log directory - logfiles = expand(prefix + "{ext}", ext=logfile_extensions) - shell( - "mkdir -p {outdir}/star_logs " "&& mv {logfiles} {outdir}/star_logs" - ) - - -if config["aligner"] == "star-twopass": - - rule star_pass1: - "First pass of alignment with STAR to get the junctions" - input: - fastq=rules.cutadapt.output, - index=rules.star_index.output, - annotation=f"{REFERENCES}/annotation.gtf", - output: - sjout=temporary( - "data/rnaseq_samples/{sample}/{sample}.cutadapt.star.SJ.out.tab" - ), - log: - "data/rnaseq_samples/{sample}/{sample}.cutadapt.star-pass1.log", - threads: 16 - resources: - mem="64g", - runtime="8h", - disk="80g", - params: - extra=STAR_PARAMS, - run: - genomedir = os.path.dirname(input.index[0]) - outdir = os.path.dirname(output[0]) - prefix = output.sjout.replace("SJ.out.tab", "") - shell( - STAR_CMD - + ( - # In this first pass, we don't actually care about the - # alignment -- just the detected junctions. So we output - # the SAM to /dev/null. 
- "--outStd SAM > /dev/null " - "2> {log} " - ) - ) - - # move various hard-coded log files to log directory - logfiles = expand(prefix + "{ext}", ext=logfile_extensions) - shell( - "mkdir -p {outdir}/star-pass1_logs " - "&& mv {logfiles} {outdir}/star-pass1_logs" - ) - - rule star_pass2: - """ - Second pass of alignment with STAR using splice junctions across all - samples to get the final BAM - """ - input: - fastq=rules.cutadapt.output, - index=rules.star_index.output, - annotation=f"{REFERENCES}/annotation.gtf", - sjout=expand(rules.star_pass1.output, sample=SAMPLES), - output: - bam=temporary("data/rnaseq_samples/{sample}/{sample}.cutadapt.bam"), - sjout=temporary( - "data/rnaseq_samples/{sample}/{sample}.cutadapt.star-pass2.SJ.out.tab" - ), - log: - "data/rnaseq_samples/{sample}/{sample}.cutadapt.star-pass2.log", - threads: 16 - resources: - mem="64g", - runtime="8h", - disk="80g", - params: - extra=STAR_PARAMS, - run: - genomedir = os.path.dirname(input.index[0]) - outdir = os.path.dirname(output[0]) - prefix = output.bam.replace(".bam", ".star-pass2.") - shell( - STAR_CMD - + ( - # In contrast to pass 1, we will be keeping these BAMs -- - # so sort them - "--outSAMtype BAM SortedByCoordinate " - # Splice junction databases from all samples in the first - # pass. 
- "--sjdbFileChrStartEnd {input.sjout} " - "--outStd BAM_SortedByCoordinate > {output.bam} " - "2> {log} " - ) - ) - - # move various hard-coded log files to log directory - logfiles = expand(prefix + "{ext}", ext=logfile_extensions) - shell( - "mkdir -p {outdir}/star-pass2_logs " - "&& mv {logfiles} {outdir}/star-pass2_logs" - ) +rule star: + "Align with STAR (1-pass mode)" + input: + fastq=rules.cutadapt.output, + index=rules.star_index.output, + annotation=f"{REFERENCES}/annotation.gtf", + output: + bam=temporary("data/rnaseq_samples/{sample}/{sample}.cutadapt.bam"), + sjout=temporary( + "data/rnaseq_samples/{sample}/{sample}.cutadapt.star.SJ.out.tab" + ), + log: + "data/rnaseq_samples/{sample}/{sample}.cutadapt.bam.log", + threads: 16 + resources: + mem="64g", + runtime="8h", + disk="80g", + run: + genomedir = os.path.dirname(input.index[0]) + outdir = os.path.dirname(output[0]) + prefix = output.bam.replace(".bam", ".star.") + if os.getenv("TMPDIR"): + tmpdir_arg = "--outTmpDir $TMPDIR/star " + else: + tmpdir_arg = "" + shell( + "STAR " + "--runThreadN {threads} " + "--genomeDir {genomedir} " + "--readFilesIn {input.fastq} " + "--readFilesCommand zcat " + "--outFileNamePrefix {prefix} " + "{tmpdir_arg} " + "--outSAMtype BAM SortedByCoordinate " + "--outStd BAM_SortedByCoordinate > {output.bam} " + + # NOTE: The STAR docs indicate that the following parameters are + # standard options for ENCODE long-RNA-seq pipeline. Comments are from + # the STAR docs. 
+ "--outFilterType BySJout " # reduces number of spurious junctions + "--outFilterMultimapNmax 20 " # if more than this many multimappers, consider unmapped + "--alignSJoverhangMin 8 " # min overhang for unannotated junctions + "--alignSJDBoverhangMin 1 " # min overhang for annotated junctions + "--outFilterMismatchNmax 999 " # max mismatches per pair + "--outFilterMismatchNoverReadLmax 0.04 " # max mismatches per pair relative to read length + "--alignIntronMin 20 " # min intron length + "--alignIntronMax 1000000 " # max intron length + "--alignMatesGapMax 1000000 " # max distance between mates + "--outSAMunmapped None " # do not report aligned reads in output + "2> {log} " + ) - shell("rm -r {prefix}_STARgenome") + # move various hard-coded log files to log directory + logfile_extensions = + logfiles = expand( + prefix + "{ext}", + ext=["Log.progress.out", "Log.out", "Log.final.out", "Log.std.out"] + ) + shell( + "mkdir -p {outdir}/star_logs " + "&& mv {logfiles} {outdir}/star_logs" + ) rule rRNA: diff --git a/workflows/rnaseq/config/config.yaml b/workflows/rnaseq/config/config.yaml index 2cbd3d66..26f5aba9 100644 --- a/workflows/rnaseq/config/config.yaml +++ b/workflows/rnaseq/config/config.yaml @@ -23,5 +23,3 @@ patterns: 'config/rnaseq_patterns.yaml' stranded: 'fr-firststrand' # for dUTP libraries # 'fr-secondstrand' # for ligation libraries # 'unstranded' # for libraries without strand specificity - -aligner: 'star' From 76affb6447800547ce17dfa9ee85fcda3e93bd84 Mon Sep 17 00:00:00 2001 From: Ryan Dale <115406+daler@users.noreply.github.com> Date: Wed, 2 Apr 2025 02:02:41 +0000 Subject: [PATCH 084/196] updates to env.yml --- env.yml | 230 ++++++++++++++++++++++++++++++++------------------------ 1 file changed, 133 insertions(+), 97 deletions(-) diff --git a/env.yml b/env.yml index 9bbc8a71..a4341cb0 100644 --- a/env.yml +++ b/env.yml @@ -9,17 +9,20 @@ dependencies: - alsa-lib=1.2.13 - amply=0.1.6 - annotated-types=0.7.0 + - anyio=4.9.0 - appdirs=1.4.4 - - 
argcomplete=3.5.2 + - argcomplete=3.6.1 - argh=0.31.3 - argparse-dataclass=2.0.0 - asttokens=3.0.0 - - attrs=24.3.0 - - babel=2.16.0 - - beautifulsoup4=4.12.3 + - attrs=25.3.0 + - babel=2.17.0 + - backports=1.0 + - backports.tarfile=1.2.0 + - beautifulsoup4=4.13.3 - bedtools=2.31.1 - binutils_impl_linux-64=2.43 - - biopython=1.84 + - biopython=1.85 - boost-cpp=1.85.0 - bowtie=1.3.1 - bowtie2=2.5.4 @@ -30,9 +33,9 @@ dependencies: - bx-python=0.13.0 - bzip2=1.0.8 - c-ares=1.34.4 - - ca-certificates=2024.12.14 - - cairo=1.18.2 - - certifi=2024.12.14 + - ca-certificates=2025.1.31 + - cairo=1.18.4 + - certifi=2025.1.31 - cffi=1.17.1 - charset-normalizer=3.4.1 - click=8.1.8 @@ -49,16 +52,19 @@ dependencies: - configargparse=1.7 - connection_pool=0.0.3 - contourpy=1.3.1 - - curl=8.11.1 + - cryptography=44.0.2 + - curl=8.12.1 - cutadapt=5.0 - cycler=0.12.1 - - datrie=0.8.2 - - decorator=5.1.1 - - deeptools=3.5.5 + - dbus=1.13.6 + - decorator=5.2.1 + - deeptools=3.5.6 - deeptoolsintervals=0.1.9 + - distlib=0.3.9 - dnaio=1.2.2 - docutils=0.21.2 - dpath=2.2.0 + - editables=0.5 - eido=0.2.4 - epic2=0.0.52 - et_xmlfile=2.0.0 @@ -68,6 +74,7 @@ dependencies: - expat=2.6.4 - fastq-screen=0.16.0 - fastqc=0.12.1 + - filelock=3.18.0 - font-ttf-dejavu-sans-mono=2.37 - font-ttf-inconsolata=3.000 - font-ttf-source-code-pro=2.038 @@ -75,8 +82,8 @@ dependencies: - fontconfig=2.15.0 - fonts-conda-ecosystem=1 - fonts-conda-forge=1 - - fonttools=4.55.3 - - freetype=2.12.1 + - fonttools=4.56.0 + - freetype=2.13.3 - fribidi=1.0.10 - gcc_impl_linux-64=14.2.0 - gffread=0.12.7 @@ -88,38 +95,50 @@ dependencies: - graphite2=1.3.13 - gsl=1.16 - gxx_impl_linux-64=14.2.0 - - h2=4.1.0 - - harfbuzz=10.1.0 + - h11=0.14.0 + - h2=4.2.0 + - harfbuzz=11.0.0 + - hatch=1.14.0 + - hatchling=1.27.0 - hdf5=1.14.3 - hisat2=2.2.1 - - hpack=4.0.0 + - hpack=4.1.0 - html5lib=1.1 - htslib=1.21 + - httpcore=1.0.7 + - httpx=0.28.1 - humanfriendly=10.0 - - humanize=4.11.0 - - hyperframe=6.0.1 + - humanize=4.12.2 + - 
hyperframe=6.1.0 + - hyperlink=21.0.0 - icu=75.1 - idna=3.10 - imagesize=1.4.1 - immutables=0.21 - - importlib-metadata=8.5.0 + - importlib-metadata=8.6.1 - importlib_resources=6.5.2 - iniconfig=2.0.0 - intervalstats=1.01 - - ipython=8.31.0 - - isa-l=2.31.0 + - ipython=9.0.2 + - ipython_pygments_lexers=1.1.1 + - isa-l=2.31.1 + - jaraco.classes=3.4.0 + - jaraco.context=6.0.1 + - jaraco.functools=4.1.0 - jedi=0.19.2 - - jinja2=3.1.5 + - jeepney=0.9.0 + - jinja2=3.1.6 - jsonschema=4.23.0 - jsonschema-specifications=2024.10.1 - jupyter_core=5.7.2 - kaleido-core=0.2.1 - kallisto=0.51.1 - kernel-headers_linux-64=3.10.0 + - keyring=25.6.0 - keyutils=1.6.1 - kiwisolver=1.4.7 - krb5=1.21.3 - - lcms2=2.16 + - lcms2=2.17 - ld_impl_linux-64=2.43 - lerc=4.0.0 - libaec=1.1.3 @@ -132,36 +151,36 @@ dependencies: - libbrotlienc=1.1.0 - libcblas=3.9.0 - libcups=2.3.3 - - libcurl=8.11.1 - - libdeflate=1.23 - - libedit=3.1.20240808 + - libcurl=8.12.1 + - libdeflate=1.22 + - libedit=3.1.20250104 - libev=4.33 - libexpat=2.6.4 - - libffi=3.4.2 + - libffi=3.4.6 - libgcc=14.2.0 - libgcc-devel_linux-64=14.2.0 - libgcc-ng=14.2.0 - libgd=2.3.3 + - libgff=2.0.0 - libgfortran=14.2.0 - - libgfortran-ng=14.2.0 - libgfortran5=14.2.0 - - libglib=2.82.2 + - libglib=2.84.0 - libgomp=14.2.0 - libhwloc=2.11.2 - - libiconv=1.17 + - libiconv=1.18 - libjemalloc=5.3.0 - libjpeg-turbo=3.0.0 - liblapack=3.9.0 - liblapacke=3.9.0 - - liblzma=5.6.3 - - liblzma-devel=5.6.3 + - liblzma=5.6.4 + - liblzma-devel=5.6.4 - libnghttp2=1.64.0 - libnsl=2.0.1 - - libopenblas=0.3.28 - - libopenssl-static=3.4.0 - - libpng=1.6.45 + - libopenblas=0.3.29 + - libopenssl-static=3.4.1 + - libpng=1.6.47 - libsanitizer=14.2.0 - - libsqlite=3.47.2 + - libsqlite=3.49.1 - libssh2=1.11.1 - libstdcxx=14.2.0 - libstdcxx-devel_linux-64=14.2.0 @@ -171,41 +190,44 @@ dependencies: - libwebp-base=1.5.0 - libxcb=1.17.0 - libxcrypt=4.4.36 - - libxml2=2.13.5 + - libxml2=2.13.7 - libzlib=1.3.1 - logmuse=0.2.8 - - logomaker=0.8 + - logomaker=0.8.6 
- macs2=2.2.9.1 - make=4.4.1 - markdown=3.6 - markdown-it-py=3.0.0 - markupsafe=3.0.2 - mathjax=2.7.7 - - matplotlib-base=3.10.0 + - matplotlib-base=3.10.1 - matplotlib-inline=0.1.7 - mdurl=0.1.2 - - multiqc=1.26 + - more-itertools=10.6.0 + - multiqc=1.28 - munkres=1.1.4 - mysql-connector-c=6.1.11 + - narwhals=1.32.0 - natsort=8.4.0 - nbformat=5.10.4 - - ncbi-vdb=3.1.1 + - ncbi-vdb=3.2.1 - ncurses=6.5 - networkx=3.4.2 - nspr=4.36 - - nss=3.107 - - numpy=2.2.1 + - nss=3.110 + - numpy=2.2.4 - numpydoc=1.8.0 - - openjdk=23.0.1 + - openjdk=23.0.2 - openjpeg=2.5.3 - openpyxl=3.1.5 - - openssl=3.4.0 + - openssl=3.4.1 - ossuuid=1.6.2 - packaging=24.2 - pandas=2.2.3 - - pandoc=3.6.1 - - pango=1.54.0 + - pandoc=3.6.4 + - pango=1.56.3 - parso=0.8.4 + - pathspec=0.12.1 - patsy=1.0.1 - pbzip2=1.1.13 - pcre2=10.44 @@ -226,7 +248,7 @@ dependencies: - perl-file-path=2.18 - perl-file-temp=0.2304 - perl-file-which=1.24 - - perl-gd=2.56 + - perl-gd=2.83 - perl-gdgraph=1.54 - perl-gdtextutil=0.86 - perl-importer=0.026 @@ -237,6 +259,7 @@ dependencies: - perl-sub-info=0.002 - perl-term-table=0.024 - perl-test-fatal=0.016 + - perl-test-nowarnings=1.06 - perl-test-warnings=0.031 - perl-test2-suite=0.000163 - perl-try-tiny=0.31 @@ -250,40 +273,41 @@ dependencies: - pickleshare=0.7.5 - pigz=2.8 - pillow=11.1.0 - - pip=24.3.1 + - pip=25.0.1 - pixman=0.44.2 - pkgutil-resolve-name=1.3.10 - plac=1.4.3 - - platformdirs=4.3.6 - - plotly=5.24.1 + - platformdirs=4.3.7 + - plotly=6.0.1 - pluggy=1.5.0 - preseq=2.0.2 - - prompt-toolkit=3.0.48 - - psutil=6.1.1 + - prompt-toolkit=3.0.50 + - psutil=7.0.0 - pthread-stubs=0.4 - ptyprocess=0.7.0 - pulp=2.8.0 - pure_eval=0.2.3 - - py2bit=0.3.0 - - pyaml-env=1.2.1 - - pybedtools=0.11.0 - - pybigwig=0.3.23 + - py2bit=0.3.3 + - pyaml-env=1.2.2 + - pybedtools=0.12.0 + - pybigwig=0.3.24 - pycparser=2.22 - - pydantic=2.10.4 + - pydantic=2.10.6 - pydantic-core=2.27.2 - pyfaidx=0.8.1.3 - pygments=2.19.1 - - pyparsing=3.2.1 + - pyparsing=3.2.3 - pysam=0.22.1 - 
pysocks=1.7.1 - - pytest=8.3.4 + - pytest=8.3.5 - pytest-xdist=3.6.1 - python=3.11.11 - python-dateutil=2.9.0.post0 + - python-dotenv=1.1.0 - python-fastjsonschema=2.21.1 - - python-isal=1.7.1 + - python-isal=1.7.2 - python-kaleido=0.2.1 - - python-tzdata=2024.2 + - python-tzdata=2025.2 - python-zlib-ng=0.5.1 - python_abi=3.11 - pytz=2024.1 @@ -292,67 +316,76 @@ dependencies: - qhull=2020.2 - r-base=4.2.3 - readline=8.2 - - referencing=0.35.1 + - referencing=0.36.2 + - regex=2024.11.6 - requests=2.32.3 - reretry=0.11.8 - rich=13.9.4 - - rich-click=1.8.5 - - rpds-py=0.22.3 + - rich-click=1.8.8 + - roman-numerals-py=3.1.0 + - rpds-py=0.24.0 - rseqc=5.0.4 - salmon=1.10.3 - samtools=1.21 - - scipy=1.15.0 + - scipy=1.15.2 - seaborn=0.13.2 - seaborn-base=0.13.2 + - secretstorage=3.3.3 - sed=4.8 - - setuptools=75.6.0 + - setuptools=75.8.2 - shellingham=1.5.4 - - simplejson=3.19.3 + - simplejson=3.20.1 - six=1.17.0 - - slack-sdk=3.34.0 - - slack_sdk=3.34.0 + - slack-sdk=3.35.0 + - slack_sdk=3.35.0 - smart_open=7.1.0 - - smmap=5.0.0 - - snakemake=8.27.0 + - smmap=5.0.2 + - snakemake=9.1.3 - snakemake-interface-common=1.17.4 - - snakemake-interface-executor-plugins=9.3.3 + - snakemake-interface-executor-plugins=9.3.5 + - snakemake-interface-logger-plugins=1.2.3 - snakemake-interface-report-plugins=1.1.0 - - snakemake-interface-storage-plugins=3.3.0 - - snakemake-minimal=8.27.0 + - snakemake-interface-storage-plugins=4.2.1 + - snakemake-minimal=9.1.3 + - sniffio=1.3.1 - snowballstemmer=2.2.0 - soupsieve=2.5 - spectra=0.0.11 - - sphinx=8.1.3 + - sphinx=8.2.3 - sphinxcontrib-applehelp=2.0.0 - sphinxcontrib-devhelp=2.0.0 - sphinxcontrib-htmlhelp=2.1.0 - sphinxcontrib-jsmath=1.0.1 - sphinxcontrib-qthelp=2.0.0 - sphinxcontrib-serializinghtml=1.1.10 - - sqlite=3.47.2 - - sra-tools=3.1.1 + - sqlite=3.49.1 + - sra-tools=3.2.0 - stack_data=0.6.3 + - staden_io_lib=1.15.0 - star=2.7.11b - statsmodels=0.14.4 - subread=2.0.8 - sysroot_linux-64=2.17 - tabulate=0.9.0 - tbb=2022.0.0 - - 
tenacity=9.0.0 - throttler=1.2.2 + - tiktoken=0.9.0 - tk=8.6.13 - tktable=2.10 - tomli=2.2.1 + - tomli-w=1.2.0 + - tomlkit=0.13.2 - tqdm=4.67.1 - trackhub=1.0 - traitlets=5.14.3 - - typeguard=4.4.1 - - typer=0.15.1 - - typer-slim=0.15.1 - - typer-slim-standard=0.15.1 - - typing-extensions=4.12.2 - - typing_extensions=4.12.2 - - tzdata=2024b + - trove-classifiers=2025.3.19.19 + - typeguard=4.4.2 + - typer=0.15.2 + - typer-slim=0.15.2 + - typer-slim-standard=0.15.2 + - typing-extensions=4.13.0 + - typing_extensions=4.13.0 + - tzdata=2025b - ubiquerg=0.8.0 - ucsc-bedgraphtobigwig=472 - ucsc-bedsort=469 @@ -366,17 +399,20 @@ dependencies: - ucsc-stringify=472 - ucsc-twobittofa=472 - ucsc-wigtobigwig=472 - - unicodedata2=15.1.0 + - unicodedata2=16.0.0 - urllib3=2.3.0 + - userpath=1.9.2 + - uv=0.6.10 - veracitools=0.1.3 + - virtualenv=20.29.3 - wcwidth=0.2.13 - webencodings=0.5.1 - wheel=0.45.1 - - wrapt=1.17.0 + - wrapt=1.17.2 - xopen=2.0.2 - xorg-libice=1.1.2 - - xorg-libsm=1.2.5 - - xorg-libx11=1.8.10 + - xorg-libsm=1.2.6 + - xorg-libx11=1.8.12 - xorg-libxau=1.0.12 - xorg-libxdmcp=1.1.5 - xorg-libxext=1.3.6 @@ -386,13 +422,13 @@ dependencies: - xorg-libxrender=0.9.12 - xorg-libxt=1.3.1 - xorg-libxtst=1.2.5 - - xz=5.6.3 - - xz-gpl-tools=5.6.3 - - xz-tools=5.6.3 + - xz=5.6.4 + - xz-gpl-tools=5.6.4 + - xz-tools=5.6.4 - yaml=0.2.5 - - yte=1.5.5 + - yte=1.7.0 - zipp=3.21.0 - zlib=1.3.1 - - zlib-ng=2.2.3 + - zlib-ng=2.2.4 - zstandard=0.23.0 - - zstd=1.5.6 + - zstd=1.5.7 From 0937a0a56bb5136ebd96f6ebecd90c16007b2156 Mon Sep 17 00:00:00 2001 From: Ryan Dale <115406+daler@users.noreply.github.com> Date: Thu, 2 Oct 2025 17:01:41 +0000 Subject: [PATCH 085/196] add draft of decisions.rst --- docs/decisions.rst | 85 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 85 insertions(+) create mode 100644 docs/decisions.rst diff --git a/docs/decisions.rst b/docs/decisions.rst new file mode 100644 index 00000000..7abbc78d --- /dev/null +++ b/docs/decisions.rst @@ -0,0 +1,85 
@@
+Decision log
+============
+
+This document keeps track of the reasoning behind various architecture decisions.
+
+References
+----------
+Here are use-cases we have that are common enough to warrant supporting:
+
+- References should support multiple workflows (ChIP-seq, RNA-seq, etc)
+ - This implies that the references dir should be in the
+ ``workflows`` directory or above.
+ - For example, this may mean a STAR index for RNA-seq, a bowtie2 index for
+ rRNA contamination, and another bowtie2 index for ChIP-seq.
+
+- References should support different organisms in different workflows. There
+ should be only one organism per workflow though.
+
+- References should be re-created for each project.
+ - What we've found is that if we have a central location for the references
+ (shared by multiple deployments of lcdb-wf over the years) then we get
+ conflicts where one deployment's aligner version is more recent, causing
+ errors when using the index for an older version.
+ - To keep using this, we'd need to version indexes based on aligner version.
+ - However, when writing up methods for a paper we need to be able to trace
+ back what commands were run to generate the reference, including additional
+ patching that may have taken place (as is supported by the references
+ workflow).
+ - Re-using indexes is space- and time-efficient in the short term, but has
+ shown to be inefficient in time and reproducibility in the long term.
+ - Keeping everything in the same deployment directory also helps with the
+ archiving process.
+
+Naming:
+
+- Top level should be organism. Doesn't really matter in the case of
+ a single-organism workflow.
+- Next should be what has historically been called "tag". This could be the
+ assembly name for genomic indexes, or some combination of assembly +
+ annotation for transcriptome.
+- If we're assuming "deployment-local" references, these no longer have to be
If we have a mouse reference with a transgene, we can just + call it "mouse/mm39" but have the transgene patched into it, and not worry + about conflicting (or worse, overwriting!) a central reference with the same + name that didn't have the transgene. +- Fasta files are included next to their respective index. + +This example uses the ``dmel`` organism and ``test`` tag which is configured by +default for tests. + +This uses ``$ORG/$TAG//$TOOL`` as the path +template. This lets us keep the fastq file used for building the various +indexes alongside the indexes. + +:: + + references_data/ + ├── dmel + ├── rRNA + │ └── genome + │ ├── bowtie2 + │ │ └── dmel_rRNA.* + │ └── dmel_rRNA.fasta + └── test + ├── annotation + │ ├── dmel_test.bed12 + │ ├── dmel_test.gtf + │ └── dmel_test.refflat + ├── genome + │ ├── bowtie2 + │ │ └── dmel_test.* + │ ├── star + │ │ └── dmel_test + │ │ └── + │ ├── dmel_test.chromsizes + │ ├── dmel_test.fasta + │ ├── dmel_test.fasta.fai + └── transcriptome + ├── kallisto + │ └── dmel_test + │ └── transcripts.idx + ├── salmon + │ └── dmel_test + │ └── + └── dmel_test.fasta From 4a570e09051e1060930939a13a457e9b768f835b Mon Sep 17 00:00:00 2001 From: Ryan Dale <115406+daler@users.noreply.github.com> Date: Thu, 10 Jul 2025 09:28:19 -0400 Subject: [PATCH 086/196] default to conda rather than mamba as front-end --- deploy.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deploy.py b/deploy.py index 7ad7e1ac..38df1687 100755 --- a/deploy.py +++ b/deploy.py @@ -367,7 +367,7 @@ def build_envs(dest, conda_frontend="mamba"): ap.add_argument( "--conda-frontend", help="Set program (conda or mamba) to use when creating environments. 
Default is %(default)s.", - default="mamba", + default="conda", ) ap.add_argument( "--rsync-args", From e03f8816dca74aaed151a0a385a8d786c9e877b9 Mon Sep 17 00:00:00 2001 From: Ryan Dale <115406+daler@users.noreply.github.com> Date: Thu, 10 Jul 2025 09:28:42 -0400 Subject: [PATCH 087/196] support additional packages during deployment i.e., snakemake-executor-plugin-cluster-generic --- deploy.py | 57 ++++++++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 52 insertions(+), 5 deletions(-) diff --git a/deploy.py b/deploy.py index 38df1687..c5c7cb39 100755 --- a/deploy.py +++ b/deploy.py @@ -267,7 +267,7 @@ def deployment_json(source, dest): info("Wrote details of deployment to {log}".format(**locals())) -def build_envs(dest, conda_frontend="mamba"): +def build_envs(dest, additional_main=None, additional_r=None, conda_frontend="conda"): """ Build conda environments. @@ -279,15 +279,25 @@ def build_envs(dest, conda_frontend="mamba"): the command line with --dest) in which the env and env-r yaml files should already exist. Envs will be created in here. + additional_main : list + Other packages to install, e.g., a snakemake plugin needed for + a cluster profile, into the main environment. + + additional_r : list + Other packages to install into the R environment. 
+ conda_frontend : 'mamba' | 'conda' Which front-end to use (terminology borrowed from Snakemake) + """ mapping = [ - ("./env", "env.yml"), - ("./env-r", "env-r.yml"), + ("./env", "env.yml", additional_main), + ("./env-r", "env-r.yml", additional_r), ] - for env, yml in mapping: + for env, yml, additional in mapping: info("Building environment " + os.path.join(dest, env)) + if additional: + info(f"Adding {additional} to environment") try: # conda and mamba can be hard to kill, possibly because they're @@ -305,6 +315,8 @@ def build_envs(dest, conda_frontend="mamba"): "--file", yml, ] + if additional: + cmds += additional p = sp.Popen(cmds, universal_newlines=True, cwd=dest) p.wait() @@ -375,6 +387,20 @@ def build_envs(dest, conda_frontend="mamba"): default="-rlt" ) + ap.add_argument( + "--additional-main", + help="""Additional packages to install in main environment (only + relevant with --build-envs). For example, + 'snakemake-executor-plugin-cluster-generic' to support a cluster + profile.""", + nargs="+" + ) + ap.add_argument( + "--additional-r", + help="Additional packages to install in R environment (only relevant with --build-envs)", + nargs="+" + ) + ap.add_argument( "--mismatch-ok", action="store_true", @@ -398,7 +424,28 @@ def build_envs(dest, conda_frontend="mamba"): rsync(include, source, dest, args.rsync_args) deployment_json(source, dest) + if args.additional_main and additional_main_from_env_var: + print( + "ERROR: Unset LCDBWF_ADDITIONAL_MAIN env var if you want to use the --additional-main argument." + ) + sys.exit(1) + + if additional_main_from_env_var: + if args.additional_main: + print( + "ERROR: Unset LCDBWF_ADDITIONAL_MAIN env var if you want to use the --additional-main argument." 
+ ) + sys.exit(1) + additional_main = [additional_main_from_env_var] + else: + additional_main = args.additional_main + if args.build_envs: - build_envs(dest, conda_frontend=args.conda_frontend) + build_envs( + dest, + additional_main=additional_main, + additional_r=args.additional_r, + conda_frontend=args.conda_frontend, + ) warning("Deployment complete in {args.dest}".format(**locals())) From e86bbc216d50fdaab0d23b71ced279b5d0204701 Mon Sep 17 00:00:00 2001 From: Ryan Dale <115406+daler@users.noreply.github.com> Date: Thu, 10 Jul 2025 09:53:53 -0400 Subject: [PATCH 088/196] pep8 on deploy.py --- deploy.py | 121 ++++++++++++++++++++++++++++-------------------------- 1 file changed, 63 insertions(+), 58 deletions(-) diff --git a/deploy.py b/deploy.py index c5c7cb39..4396654f 100755 --- a/deploy.py +++ b/deploy.py @@ -8,14 +8,13 @@ import subprocess as sp import datetime import json -import fnmatch import logging import hashlib from pathlib import Path from distutils import filelist # Determine default staging area, used in help -default_staging = "/tmp/{0}-lcdb-wf-staging".format(os.getenv('USER')) +default_staging = "/tmp/{0}-lcdb-wf-staging".format(os.getenv("USER")) usage = f""" This script assists in the deployment of relevant code from the lcdb-wf @@ -74,52 +73,51 @@ def error(s): logging.error(RED + s + RESET) -def write_include_file(source, flavor='all'): +def write_include_file(source, flavor="all"): # Patterns follow that of MANIFEST.in # (https://packaging.python.org/en/latest/guides/using-manifest-in/), # and distutils.filelist is used below to parse them. 
PATTERN_DICT = { - 'rnaseq': [ - 'include workflows/rnaseq/Snakefile', - 'recursive-include workflows/rnaseq/config *', - 'include workflows/rnaseq/rnaseq_trackhub.py', - 'recursive-include workflows/rnaseq/downstream *.Rmd', - 'recursive-include workflows/rnaseq/downstream *.yaml', + "rnaseq": [ + "include workflows/rnaseq/Snakefile", + "recursive-include workflows/rnaseq/config *", + "include workflows/rnaseq/rnaseq_trackhub.py", + "recursive-include workflows/rnaseq/downstream *.Rmd", + "recursive-include workflows/rnaseq/downstream *.yaml", ], - 'chipseq': [ - 'include workflows/chipseq/Snakefile', - 'recursive-include workflows/chipseq/config *', - 'include workflows/chipseq/chipseq_trackhub.py', + "chipseq": [ + "include workflows/chipseq/Snakefile", + "recursive-include workflows/chipseq/config *", + "include workflows/chipseq/chipseq_trackhub.py", ], - 'all': [ - 'recursive-include wrappers *', - 'recursive-include include *', - 'recursive-include lib *', - 'include env.yml env-r.yml .gitignore', - 'include workflows/references/Snakefile', - 'recursive-include workflows/references/config *', - 'global-exclude __pycache__', + "all": [ + "recursive-include wrappers *", + "recursive-include include *", + "recursive-include lib *", + "include env.yml env-r.yml .gitignore", + "include workflows/references/Snakefile", + "recursive-include workflows/references/config *", + "global-exclude __pycache__", + ], + "full": [ + "include workflows/colocalization/Snakefile", + "recursive-include workflows/colocalization/config *", + "recursive-include workflows/colocalization/scripts *", + "recursive-include workflows/figures *", + "recursive-include workflows/external *", ], - 'full': [ - 'include workflows/colocalization/Snakefile', - 'recursive-include workflows/colocalization/config *', - 'recursive-include workflows/colocalization/scripts *', - 'recursive-include workflows/figures *', - 'recursive-include workflows/external *', - ] - } patterns = [] - if flavor in 
('full', 'rnaseq'): - patterns.extend(PATTERN_DICT['rnaseq']) - if flavor in ('full', 'chipseq'): - patterns.extend(PATTERN_DICT['chipseq']) - if flavor == 'full': - patterns.extend(PATTERN_DICT['full']) - patterns.extend(PATTERN_DICT['all']) + if flavor in ("full", "rnaseq"): + patterns.extend(PATTERN_DICT["rnaseq"]) + if flavor in ("full", "chipseq"): + patterns.extend(PATTERN_DICT["chipseq"]) + if flavor == "full": + patterns.extend(PATTERN_DICT["full"]) + patterns.extend(PATTERN_DICT["all"]) def fastwalk(path): """ @@ -128,13 +126,13 @@ def fastwalk(path): """ path = str(path) for root, dirs, files in os.walk(path, topdown=True): - if 'conda-meta' in dirs: + if "conda-meta" in dirs: dirs[:] = [] files[:] = [] for d in dirs: - yield os.path.join(root, d).replace(path + '/', '') + yield os.path.join(root, d).replace(path + "/", "") for f in files: - yield os.path.join(root, f).replace(path + '/', '') + yield os.path.join(root, f).replace(path + "/", "") f = filelist.FileList() f.allfiles = list(fastwalk(source)) @@ -153,9 +151,9 @@ def fastwalk(path): to_transfer = list(set(under_version_control).intersection(f.files)) include = tempfile.NamedTemporaryFile(delete=False).name - with open(include, 'w') as fout: - fout.write('\n\n') - fout.write('\n'.join(to_transfer)) + with open(include, "w") as fout: + fout.write("\n\n") + fout.write("\n".join(to_transfer)) return include @@ -188,8 +186,8 @@ def check_md5(f): full_here = Path(__file__).resolve() full_there = Path(dest) / "deploy.py" error( - "Files {full_here} and {full_there} do not match! ".format(**locals()) + - "The deploy script you are running appears to be out of date. " + f"Files {full_here} and {full_there} do not match! " + + "The deploy script you are running appears to be out of date. 
" "Please get an updated copy from https://github.com/lcdb/lcdb-wf, perhaps " "with 'wget https://raw.githubusercontent.com/lcdb/lcdb-wf/master/deploy.py'" ) @@ -322,16 +320,21 @@ def build_envs(dest, additional_main=None, additional_r=None, conda_frontend="co except KeyboardInterrupt: print("") - error("Killing running {conda_frontend} job, '".format(**locals()) + " ".join(cmds)) + error( + "Killing running {conda_frontend} job, '".format(**locals()) + + " ".join(cmds) + ) p.kill() sys.exit(1) if p.returncode: - error("Error running {conda_frontend}, '".format(**locals()) + " ".join(cmds)) + error( + "Error running {conda_frontend}, '".format(**locals()) + " ".join(cmds) + ) sys.exit(1) full_env = Path(dest) / env - info("Created env {full_env}".format(**locals())) + info(f"Created env {full_env}") if __name__ == "__main__": @@ -340,7 +343,9 @@ def build_envs(dest, additional_main=None, additional_r=None, conda_frontend="co ap.add_argument( "--flavor", default="full", - help="""Options are {0}. Default is full.""".format(['full', 'rnaseq', 'chipseq']), + help="""Options are {0}. Default is full.""".format( + ["full", "rnaseq", "chipseq"] + ), ) ap.add_argument( "--dest", help="""Destination directory in which to copy files""", required=True @@ -352,7 +357,7 @@ def build_envs(dest, additional_main=None, additional_r=None, conda_frontend="co help=f"""Make a new clone to a staging area (at the location specified by --staging which defaults to {default_staging}) and deploy from there. Useful if using this script as a standalone tool. You can also - use --branch to configure which branch to deploy from that clone.""" + use --branch to configure which branch to deploy from that clone.""", ) ap.add_argument( @@ -384,7 +389,7 @@ def build_envs(dest, additional_main=None, additional_r=None, conda_frontend="co ap.add_argument( "--rsync-args", help="Options for rsync when deploying to a new directory. 
Default is %(default)s.", - default="-rlt" + default="-rlt", ) ap.add_argument( @@ -393,25 +398,25 @@ def build_envs(dest, additional_main=None, additional_r=None, conda_frontend="co relevant with --build-envs). For example, 'snakemake-executor-plugin-cluster-generic' to support a cluster profile.""", - nargs="+" + nargs="+", ) ap.add_argument( "--additional-r", help="Additional packages to install in R environment (only relevant with --build-envs)", - nargs="+" + nargs="+", ) - ap.add_argument( - "--mismatch-ok", - action="store_true", - help="Used for testing") + ap.add_argument("--mismatch-ok", action="store_true", help="Used for testing") args = ap.parse_args() dest = args.dest flavor = args.flavor if args.staging and not args.clone: - print("ERROR: --staging was specified but --clone was not. Did you want to use --clone?", file=sys.stderr) - sys.exit(1) + print( + "ERROR: --staging was specified but --clone was not. Did you want to use --clone?", + file=sys.stderr, + ) + sys.exit(1) if args.clone: if args.staging is None: args.staging = default_staging From b1fc75e62bcc750ba6faafc8830ee6982d19dc04 Mon Sep 17 00:00:00 2001 From: Ryan Dale <115406+daler@users.noreply.github.com> Date: Mon, 14 Jul 2025 08:53:34 -0400 Subject: [PATCH 089/196] support for setting additional-main from env var --- deploy.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/deploy.py b/deploy.py index 4396654f..0c6b2e6e 100755 --- a/deploy.py +++ b/deploy.py @@ -339,6 +339,8 @@ def build_envs(dest, additional_main=None, additional_r=None, conda_frontend="co if __name__ == "__main__": + additional_main_from_env_var = os.getenv("LCDBWF_ADDITIONAL_MAIN", []) + ap = argparse.ArgumentParser(usage=usage) ap.add_argument( "--flavor", @@ -397,7 +399,8 @@ def build_envs(dest, additional_main=None, additional_r=None, conda_frontend="co help="""Additional packages to install in main environment (only relevant with --build-envs). 
For example, 'snakemake-executor-plugin-cluster-generic' to support a cluster - profile.""", + profile. You can use the env var LCDBWF_ADDITIONAL_MAIN to supply this + argument automatically instead.""", nargs="+", ) ap.add_argument( From 6eb46333f0254b66fcae1bed4a348239fc9993fb Mon Sep 17 00:00:00 2001 From: Ryan Dale <115406+daler@users.noreply.github.com> Date: Tue, 29 Jul 2025 21:55:33 -0400 Subject: [PATCH 090/196] deploy.py actually installs additional --- deploy.py | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/deploy.py b/deploy.py index 0c6b2e6e..1981804c 100755 --- a/deploy.py +++ b/deploy.py @@ -294,8 +294,6 @@ def build_envs(dest, additional_main=None, additional_r=None, conda_frontend="co ] for env, yml, additional in mapping: info("Building environment " + os.path.join(dest, env)) - if additional: - info(f"Adding {additional} to environment") try: # conda and mamba can be hard to kill, possibly because they're @@ -313,8 +311,12 @@ def build_envs(dest, additional_main=None, additional_r=None, conda_frontend="co "--file", yml, ] + p = sp.Popen(cmds, universal_newlines=True, cwd=dest) + p.wait() + if additional: - cmds += additional + info(f"Adding {additional} to environment") + cmds = [conda_frontend, "install", "-y", "-p", env] + additional p = sp.Popen(cmds, universal_newlines=True, cwd=dest) p.wait() @@ -432,12 +434,6 @@ def build_envs(dest, additional_main=None, additional_r=None, conda_frontend="co rsync(include, source, dest, args.rsync_args) deployment_json(source, dest) - if args.additional_main and additional_main_from_env_var: - print( - "ERROR: Unset LCDBWF_ADDITIONAL_MAIN env var if you want to use the --additional-main argument." 
- ) - sys.exit(1) - if additional_main_from_env_var: if args.additional_main: print( From 93b89a6ee06c62f92678acbcb7f8ffdb686f97e0 Mon Sep 17 00:00:00 2001 From: Ryan Dale <115406+daler@users.noreply.github.com> Date: Thu, 2 Oct 2025 14:07:30 -0400 Subject: [PATCH 091/196] try disabling pre-install --- .circleci/config.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 66de1446..b216a899 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -104,7 +104,7 @@ variables: # https://docs.conda.io/projects/conda-build/en/latest/resources/link-scripts.html, # post-link scripts should not depend on any installed or # to-be-installed conda packages...but they do. - conda install -n base r-base yq + # conda install -n base r-base yq time conda env create -n $LCDBWF_ENV --file env.yml time conda env create -n $LCDBWF_ENV_R --file env-r.yml From d2ebe753bebcc18215fdb08bfe72c89f5199508a Mon Sep 17 00:00:00 2001 From: Ryan Dale <115406+daler@users.noreply.github.com> Date: Fri, 3 Oct 2025 09:08:16 -0400 Subject: [PATCH 092/196] don't copy test runner for references --- .circleci/config.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index b216a899..b28f0491 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -139,7 +139,6 @@ variables: cp $ORIG/workflows/chipseq/run_test.sh $DEPLOY/workflows/chipseq/run_test.sh cp $ORIG/workflows/rnaseq/run_test.sh $DEPLOY/workflows/rnaseq/run_test.sh cp $ORIG/workflows/rnaseq/run_downstream_test.sh $DEPLOY/workflows/rnaseq/run_downstream_test.sh - cp $ORIG/workflows/references/run_test.sh $DEPLOY/workflows/references/run_test.sh mkdir $DEPLOY/ci mkdir $DEPLOY/test From 22d414fe0bc37d994c12cafbe70794bf80c53965 Mon Sep 17 00:00:00 2001 From: Ryan Dale <115406+daler@users.noreply.github.com> Date: Sat, 4 Oct 2025 00:56:22 +0000 Subject: [PATCH 093/196] fix typo --- workflows/rnaseq/Snakefile | 1 - 1 file 
changed, 1 deletion(-) diff --git a/workflows/rnaseq/Snakefile b/workflows/rnaseq/Snakefile index dd736780..111b61b5 100644 --- a/workflows/rnaseq/Snakefile +++ b/workflows/rnaseq/Snakefile @@ -203,7 +203,6 @@ rule star: ) # move various hard-coded log files to log directory - logfile_extensions = logfiles = expand( prefix + "{ext}", ext=["Log.progress.out", "Log.out", "Log.final.out", "Log.std.out"] From e91b14ead8f28e572681272c7055222e315a6984 Mon Sep 17 00:00:00 2001 From: Ryan Dale <115406+daler@users.noreply.github.com> Date: Sat, 4 Oct 2025 01:34:50 +0000 Subject: [PATCH 094/196] new syntax style for markduplicates --- workflows/chipseq/Snakefile | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/workflows/chipseq/Snakefile b/workflows/chipseq/Snakefile index 5fbdbe1c..9c2767d4 100644 --- a/workflows/chipseq/Snakefile +++ b/workflows/chipseq/Snakefile @@ -302,11 +302,11 @@ rule markduplicates: "picard " "{params.java_args} " "MarkDuplicates " - "INPUT={input.bam} " - "OUTPUT={output.bam} " - "REMOVE_DUPLICATES=true " - "METRICS_FILE={output.metrics} " - "VALIDATION_STRINGENCY=LENIENT " + "-INPUT {input.bam} " + "-OUTPUT {output.bam} " + "-REMOVE_DUPLICATES true " + "-METRICS_FILE {output.metrics} " + "-VALIDATION_STRINGENCY LENIENT " "&> {log}" From a7d973782b36da110d20f4bab601b2e07d14e483 Mon Sep 17 00:00:00 2001 From: Ryan Dale <115406+daler@users.noreply.github.com> Date: Sat, 4 Oct 2025 01:35:10 +0000 Subject: [PATCH 095/196] refactor chipseq config --- workflows/chipseq/config/config.yaml | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/workflows/chipseq/config/config.yaml b/workflows/chipseq/config/config.yaml index a8d10142..75466ad6 100644 --- a/workflows/chipseq/config/config.yaml +++ b/workflows/chipseq/config/config.yaml @@ -23,6 +23,9 @@ references_dir: 'references_data' peaks_dir: 'data/chipseq_peaks' +fasta: + url: 
"https://raw.githubusercontent.com/lcdb/lcdb-test-data/master/data/seq/dm6.small.fa" + postprocess: 'lib.utils.gzipped' chipseq: # The peak_calling section is a list of dicts, each one defining a single @@ -113,7 +116,3 @@ merged_bigwigs: aligner: index: 'bowtie2' tag: 'test' - -include_references: - - '../../include/reference_configs/Drosophila_melanogaster.yaml' - - '../../include/reference_configs/test.yaml' From 9ee9e06d301d2ceff4ea73a093fac541648c7f90 Mon Sep 17 00:00:00 2001 From: Ryan Dale <115406+daler@users.noreply.github.com> Date: Sun, 5 Oct 2025 17:48:44 +0000 Subject: [PATCH 096/196] invalidate cache --- .circleci/config.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index b28f0491..bedc4c18 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -26,7 +26,7 @@ variables: save_cache: &save_cache save_cache: - key: v5-{{ checksum "env.yml" }}-{{ checksum "env-r.yml" }} + key: v0-{{ checksum "env.yml" }}-{{ checksum "env-r.yml" }} paths: - /opt/miniforge @@ -38,7 +38,7 @@ variables: restore_cache: &restore_cache restore_cache: keys: - - v5-{{ checksum "env.yml" }}-{{ checksum "env-r.yml" }} + - v0-{{ checksum "env.yml" }}-{{ checksum "env-r.yml" }} # -------------------------------------------------------------------------- # The path needs to be set each time; in jobs below this will be called as From d567601288cb77c62fef5dc5de0273e9e9fb98b6 Mon Sep 17 00:00:00 2001 From: Ryan Dale <115406+daler@users.noreply.github.com> Date: Sun, 5 Oct 2025 19:03:32 +0000 Subject: [PATCH 097/196] macs2 -> macs3 and change to macs (with no version) throughout --- docs/chipseq.rst | 2 +- docs/config-yaml.rst | 24 +++++++++---------- docs/developers.rst | 4 ++-- docs/faqs.rst | 2 +- docs/workflows.rst | 2 +- env.yml | 2 +- include/requirements.txt | 2 +- lib/chipseq.py | 8 +++---- .../{macs2_callpeak.py => macs_callpeak.py} | 2 +- .../complex-dataset-chipseq-config.yaml | 16 ++++++------- 
.../test_configs/test_chipseq_regression.yaml | 2 +- workflows/chipseq/Snakefile | 14 +++++------ workflows/chipseq/chipseq_trackhub.py | 2 +- workflows/chipseq/config/config.yaml | 4 ++-- 14 files changed, 43 insertions(+), 43 deletions(-) rename scripts/{macs2_callpeak.py => macs_callpeak.py} (99%) diff --git a/docs/chipseq.rst b/docs/chipseq.rst index 202e0375..5302e973 100644 --- a/docs/chipseq.rst +++ b/docs/chipseq.rst @@ -20,7 +20,7 @@ Specifically, the workflow does the following: - optionally merges bigWigs to create one signal track for all replicates - runs deepTools plotFingerprint on grouped IP and input for QC and evaluation of enrichment - - calls peaks using macs2, spp, and/or sicer, with support for multiple + - calls peaks using macs, spp, and/or sicer, with support for multiple peak-calling runs using different parameters to assist with assessing performance and to help make decisions for downstream analysis - optionally runs a template diffBind RMarkdown file used for differential binding analysis diff --git a/docs/config-yaml.rst b/docs/config-yaml.rst index c8026325..7d86ceef 100644 --- a/docs/config-yaml.rst +++ b/docs/config-yaml.rst @@ -124,7 +124,7 @@ The major differences between ChIP-seq and RNA-seq configs are: peak_calling: - label: gaf-embryo-1 - algorithm: macs2 + algorithm: macs ip: - gaf-embryo-1 control: @@ -138,7 +138,7 @@ The major differences between ChIP-seq and RNA-seq configs are: - input-embryo-1 - label: gaf-wingdisc-pooled - algorithm: macs2 + algorithm: macs ip: - gaf-wingdisc-1 - gaf-wingdisc-2 @@ -529,7 +529,7 @@ ChIP-seq-only fields ``algorithm``. This way, we can use the same label (e.g., `gaf-embryo-1`) across multiple peak-callers to help organize the output. - The currently-supported peak-callers are ``macs2``, ``spp``, and ``sicer``. + The currently-supported peak-callers are ``macs``, ``spp``, and ``sicer``. They each have corresponding wrappers in the ``wrappers`` directory. 
To add other peak-callers, see :ref:`new-peak-caller`. @@ -537,7 +537,7 @@ ChIP-seq-only fields assessing the peak-calling performance. Here is a minimal example of a peak-calling config section. It defines - a single peak-calling run using the `macs2` algorithm. Note that the + a single peak-calling run using the `macs` algorithm. Note that the ``ip:`` and ``control:`` keys are lists of **labels** from the ChIP-seq sample table's ``label`` column, **not sample IDs** from the first column. @@ -547,18 +547,18 @@ ChIP-seq-only fields peak_calling: - label: gaf-embryo-1 - algorithm: macs2 + algorithm: macs ip: - gaf-embryo-1 control: - input-embryo-1 The above peak-calling config will result in a file - ``data/chipseq_peaks/macs2/gaf-embryo-1/peaks.bed`` (that pattern is + ``data/chipseq_peaks/macs/gaf-embryo-1/peaks.bed`` (that pattern is defined in ``chipseq_patterns.yaml`` if you need to change it). We can specify additional command-line arguments that are passed verbatim - to `macs2` with the ``extra:`` section, for example: + to `macs` with the ``extra:`` section, for example: .. code-block:: yaml @@ -566,7 +566,7 @@ ChIP-seq-only fields peak_calling: - label: gaf-embryo-1 - algorithm: macs2 + algorithm: macs ip: - gaf-embryo-1 control: @@ -574,8 +574,8 @@ ChIP-seq-only fields extra: '--nomodel --extsize 147' - `macs2` supports multiple IP and input files, which internally are merged - by `macs2`. We can supply multiple IP and input labels for biological + `macs` supports multiple IP and input files, which internally are merged + by `macs`. We can supply multiple IP and input labels for biological replicates to get a set of peaks called on pooled samples. Note that we give it a different label so it doesn't overwrite the other peak-calling run we already have configured. 
@@ -586,7 +586,7 @@ ChIP-seq-only fields peak_calling: - label: gaf-embryo-1 - algorithm: macs2 + algorithm: macs ip: - gaf-embryo-1 control: @@ -595,7 +595,7 @@ ChIP-seq-only fields - label: gaf-embryo-pooled - algorithm: macs2 + algorithm: macs ip: - gaf-embryo-1 - gaf-embryo-2 diff --git a/docs/developers.rst b/docs/developers.rst index fc45b00d..9e459a97 100644 --- a/docs/developers.rst +++ b/docs/developers.rst @@ -96,7 +96,7 @@ Testing Adding a new peak-caller ------------------------ -First, write a wrapper for the peak-caller. You can use the ``macs2``, ``spp``, +First, write a wrapper for the peak-caller. You can use the ``macs``, ``spp``, and ``sicer`` wrappers as a guide. A wrapper should expect one or more sorted and indexed BAM files as IP, one or more sorted and indexed BAM files as input. The wrapper should create at least a sorted BED file of peaks, and can @@ -105,7 +105,7 @@ optionally create other supplemental files as well. Next, add the peak-caller to the top of ``lib/patterns_targets.py`` in the ``PEAK_CALLERS`` list. -Then write a rule for the peak-caller, again using ``macs2``, ``spp``, or +Then write a rule for the peak-caller, again using ``macs``, ``spp``, or ``sicer`` rules as a guide. Last, add additional lines in diff --git a/docs/faqs.rst b/docs/faqs.rst index 86d31cb0..77ac5020 100644 --- a/docs/faqs.rst +++ b/docs/faqs.rst @@ -99,7 +99,7 @@ accordingly. A partial exception to this is that the peak-calling for ChIP-seq supports specifying custom parameters for each peak-calling run. For example, when - running macs2 you can specify "--nomodel" for a single peak-calling run, or + running macs you can specify "--nomodel" for a single peak-calling run, or any other parameter supported by the peak-caller. 
However, the BAM files used in peak-calling still need to have used uniform diff --git a/docs/workflows.rst b/docs/workflows.rst index 3ab1ec2d..99bb44cb 100644 --- a/docs/workflows.rst +++ b/docs/workflows.rst @@ -99,7 +99,7 @@ Situtations where we use wrappers: These wrappers call the aligner, followed by samtools sort and view. The end result is that FASTQs go in, and a sorted BAM comes out. - Tools with legacy dependencies like Python 2.7 that must be run in an - independent environment (macs2, sicer, rseqc) + independent environment (sicer, rseqc) - R analyses (particularly spp and dupradar, which build up an R script incrementally before calling it). - Tools that need complicated setup, or handling output files hard-coded by the diff --git a/env.yml b/env.yml index a4341cb0..fe54c60d 100644 --- a/env.yml +++ b/env.yml @@ -194,7 +194,7 @@ dependencies: - libzlib=1.3.1 - logmuse=0.2.8 - logomaker=0.8.6 - - macs2=2.2.9.1 + - macs3=3.0.3 - make=4.4.1 - markdown=3.6 - markdown-it-py=3.0.0 diff --git a/include/requirements.txt b/include/requirements.txt index a2b21ee3..ebd02582 100644 --- a/include/requirements.txt +++ b/include/requirements.txt @@ -14,7 +14,7 @@ hisat2 intervalstats ipython kallisto -macs2 +macs3 multiqc pandas pandoc diff --git a/lib/chipseq.py b/lib/chipseq.py index 62608ed8..7015f83b 100644 --- a/lib/chipseq.py +++ b/lib/chipseq.py @@ -10,14 +10,14 @@ # [ # { # 'label': 'rep1', -# 'algorithm': 'macs2', +# 'algorithm': 'macs', # 'input': ['input_1'], # 'ip': ['ip_1'], # 'extra': '--gs dm', # }, # { # 'label': 'rep2', -# 'algorithm': 'macs2', +# 'algorithm': 'macs', # 'input': ['input_2'], # 'ip': ['ip_2'], # 'extra': '--gs dm', @@ -30,8 +30,8 @@ # This needs to be expanded out to the following patterns: # # [ -# 'data/chipseq_peaks/macs2/rep1/peaks.bigbed', -# 'data/chipseq_peaks/macs2/rep2/peaks.bigbed', +# 'data/chipseq_peaks/macs/rep1/peaks.bigbed', +# 'data/chipseq_peaks/macs/rep2/peaks.bigbed', # ] # # Which in turn needs these bams: diff 
--git a/scripts/macs2_callpeak.py b/scripts/macs_callpeak.py similarity index 99% rename from scripts/macs2_callpeak.py rename to scripts/macs_callpeak.py index d90c17d6..1f1eb120 100644 --- a/scripts/macs2_callpeak.py +++ b/scripts/macs_callpeak.py @@ -18,7 +18,7 @@ genome_count_flag = ' -g ' + effective_genome_count + ' ' cmds = ( - 'macs2 ' + 'macs3 ' 'callpeak ' '-c {snakemake.input.control} ' '-t {snakemake.input.ip} ' diff --git a/test/test_configs/complex-dataset-chipseq-config.yaml b/test/test_configs/complex-dataset-chipseq-config.yaml index 61406e94..ff724701 100644 --- a/test/test_configs/complex-dataset-chipseq-config.yaml +++ b/test/test_configs/complex-dataset-chipseq-config.yaml @@ -44,49 +44,49 @@ merged_bigwigs: chipseq: peak_calling: - label: BRD4-dBET6-1 - algorithm: macs2 + algorithm: macs ip: - BRD4-dBET6-1 control: - input-dBET6-1 - label: BRD4-dBET6-2 - algorithm: macs2 + algorithm: macs ip: - BRD4-dBET6-2 control: - input-dBET6-2 - label: BRD4-DMSO-1 - algorithm: macs2 + algorithm: macs ip: - BRD4-DMSO-1 control: - input-DMSO-1 - label: BRD4-DMSO-2 - algorithm: macs2 + algorithm: macs ip: - BRD4-DMSO-2 control: - input-DMSO-2 - label: MTHFD1-dBET6-1 - algorithm: macs2 + algorithm: macs ip: - MTHFD1-dBET6-1 control: - input-dBET6-1 - label: MTHFD1-dBET6-2 - algorithm: macs2 + algorithm: macs ip: - MTHFD1-dBET6-2 control: - input-dBET6-2 - label: MTHFD1-DMSO-1 - algorithm: macs2 + algorithm: macs ip: - MTHFD1-DMSO-1 control: - input-DMSO-1 - label: MTHFD1-DMSO-2 - algorithm: macs2 + algorithm: macs ip: - MTHFD1-DMSO-2 control: diff --git a/test/test_configs/test_chipseq_regression.yaml b/test/test_configs/test_chipseq_regression.yaml index 8ca61ed0..c59ab9bf 100644 --- a/test/test_configs/test_chipseq_regression.yaml +++ b/test/test_configs/test_chipseq_regression.yaml @@ -7,7 +7,7 @@ chipseq: peak_calling: - label: gaf-wingdisc-1 - algorithm: macs2 + algorithm: macs ip: - gaf-wingdisc-1 control: diff --git a/workflows/chipseq/Snakefile 
b/workflows/chipseq/Snakefile index 9c2767d4..ce1243a9 100644 --- a/workflows/chipseq/Snakefile +++ b/workflows/chipseq/Snakefile @@ -443,28 +443,28 @@ rule fingerprint: -rule macs2: +rule macs: input: ip=lambda wc: expand( "data/chipseq_merged/{label}/{label}.cutadapt.unique.nodups.merged.bam", - label=chipseq.samples_for_run(config, wc.macs2_run, "macs2", "ip"), + label=chipseq.samples_for_run(config, wc.macs_run, "macs", "ip"), ), control=lambda wc: expand( "data/chipseq_merged/{label}/{label}.cutadapt.unique.nodups.merged.bam", - label=chipseq.samples_for_run(config, wc.macs2_run, "macs2", "control"), + label=chipseq.samples_for_run(config, wc.macs_run, "macs", "control"), ), chromsizes=rules.chromsizes.output, output: - bed="data/chipseq_peaks/macs2/{macs2_run}/peaks.bed", + bed="data/chipseq_peaks/macs/{macs_run}/peaks.bed", resources: mem="16g", runtime="2h", log: - "data/chipseq_peaks/macs2/{macs2_run}/peaks.bed.log", + "data/chipseq_peaks/macs/{macs_run}/peaks.bed.log", params: - block=lambda wc: chipseq.block_for_run(config, wc.macs2_run, "macs2"), + block=lambda wc: chipseq.block_for_run(config, wc.macs_run, "macs"), script: - "../../scripts/macs2_callpeak.py" + "../../scripts/macs_callpeak.py" rule epic2: diff --git a/workflows/chipseq/chipseq_trackhub.py b/workflows/chipseq/chipseq_trackhub.py index d069b015..5726fc02 100644 --- a/workflows/chipseq/chipseq_trackhub.py +++ b/workflows/chipseq/chipseq_trackhub.py @@ -78,7 +78,7 @@ subgroups.append( SubGroupDefinition( name='algorithm', label='algorithm', mapping={ - 'macs2': 'macs2', + 'macs': 'macs', 'epic2': 'epic2', 'NA': 'NA', })) diff --git a/workflows/chipseq/config/config.yaml b/workflows/chipseq/config/config.yaml index 75466ad6..d35898d2 100644 --- a/workflows/chipseq/config/config.yaml +++ b/workflows/chipseq/config/config.yaml @@ -48,7 +48,7 @@ chipseq: peak_calling: - label: gaf-embryo-1 - algorithm: macs2 + algorithm: macs ip: - gaf-embryo-1 control: @@ -61,7 +61,7 @@ chipseq: extra: 
'--nomodel --extsize 147' - label: gaf-wingdisc-pooled - algorithm: macs2 + algorithm: macs ip: - gaf-wingdisc-1 - gaf-wingdisc-2 From bee444e85b4c5c034d2c5425a84b53e9088d7599 Mon Sep 17 00:00:00 2001 From: Ryan Dale <115406+daler@users.noreply.github.com> Date: Sun, 5 Oct 2025 19:16:09 +0000 Subject: [PATCH 098/196] rm params.extra for cutadapt --- workflows/rnaseq/Snakefile | 2 -- 1 file changed, 2 deletions(-) diff --git a/workflows/rnaseq/Snakefile b/workflows/rnaseq/Snakefile index 111b61b5..5ec50596 100644 --- a/workflows/rnaseq/Snakefile +++ b/workflows/rnaseq/Snakefile @@ -95,7 +95,6 @@ rule cutadapt: "--minimum-length 25 " "-a AGATCGGAAGAGCACACGTCTGAACTCCAGTCA " "-A AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGT " - "{params.extra} " "{input.fastq[0]} " "{input.fastq[1]} " "&> {log}" @@ -109,7 +108,6 @@ rule cutadapt: "--overlap 6 " "--minimum-length 25 " "-a AGATCGGAAGAGCACACGTCTGAACTCCAGTCA " - "{params.extra} " "{input.fastq[0]} " "&> {log}" ) From f8f7143ef635041e04baefd8dd25e14f68a7480d Mon Sep 17 00:00:00 2001 From: Ryan Dale <115406+daler@users.noreply.github.com> Date: Sun, 5 Oct 2025 20:29:36 +0000 Subject: [PATCH 099/196] update env.yml --- env.yml | 407 ++++++++++++++++++++++++++++++-------------------------- 1 file changed, 219 insertions(+), 188 deletions(-) diff --git a/env.yml b/env.yml index fe54c60d..f7f89425 100644 --- a/env.yml +++ b/env.yml @@ -4,24 +4,27 @@ channels: dependencies: - _libgcc_mutex=0.1 - _openmp_mutex=4.5 + - _python_abi3_support=1.0 - _r-mutex=1.0.1 - alabaster=1.0.0 - - alsa-lib=1.2.13 + - alsa-lib=1.2.14 - amply=0.1.6 + - anndata=0.12.2 - annotated-types=0.7.0 - - anyio=4.9.0 + - anyio=4.11.0 - appdirs=1.4.4 - - argcomplete=3.6.1 + - argcomplete=3.6.2 - argh=0.31.3 - argparse-dataclass=2.0.0 + - array-api-compat=1.12.0 - asttokens=3.0.0 - attrs=25.3.0 - babel=2.17.0 - backports=1.0 - backports.tarfile=1.2.0 - - beautifulsoup4=4.13.3 + - beautifulsoup4=4.14.2 - bedtools=2.31.1 - - binutils_impl_linux-64=2.43 + - 
binutils_impl_linux-64=2.44 - biopython=1.85 - boost-cpp=1.85.0 - bowtie=1.3.1 @@ -30,51 +33,57 @@ dependencies: - brotli-bin=1.1.0 - brotli-python=1.1.0 - bwidget=1.10.1 - - bx-python=0.13.0 + - bx-python=0.14.0 - bzip2=1.0.8 - - c-ares=1.34.4 - - ca-certificates=2025.1.31 + - c-ares=1.34.5 + - ca-certificates=2025.10.5 + - cached-property=1.5.2 + - cached_property=1.5.2 - cairo=1.18.4 - - certifi=2025.1.31 - - cffi=1.17.1 - - charset-normalizer=3.4.1 - - click=8.1.8 + - certifi=2025.10.5 + - cffi=2.0.0 + - charset-normalizer=3.4.3 + - click=8.3.0 - coin-or-cbc=2.10.12 - coin-or-cgl=0.60.9 - coin-or-clp=1.17.10 - coin-or-osi=0.108.11 - coin-or-utils=2.11.12 - - coincbc=2.10.12 - colorama=0.4.6 - coloredlogs=15.0.1 - colormath=3.0.0 - conda-inject=1.3.2 - - configargparse=1.7 + - configargparse=1.7.1 - connection_pool=0.0.3 - - contourpy=1.3.1 - - cryptography=44.0.2 - - curl=8.12.1 - - cutadapt=5.0 + - contourpy=1.3.3 + - cpython=3.11.13 + - crc32c=2.7.1 + - cryptography=46.0.2 + - curl=8.14.1 + - cutadapt=5.1 - cycler=0.12.1 - - dbus=1.13.6 + - cykhash=2.0.1 + - dbus=1.16.2 - decorator=5.2.1 - deeptools=3.5.6 - deeptoolsintervals=0.1.9 - - distlib=0.3.9 + - deprecated=1.2.18 + - distlib=0.4.0 - dnaio=1.2.2 - docutils=0.21.2 + - donfig=0.8.1.post1 - dpath=2.2.0 - editables=0.5 - eido=0.2.4 - epic2=0.0.52 - et_xmlfile=2.0.0 - - exceptiongroup=1.2.2 + - exceptiongroup=1.3.0 - execnet=2.1.1 - - executing=2.1.0 - - expat=2.6.4 + - executing=2.2.1 + - expat=2.7.1 - fastq-screen=0.16.0 - fastqc=0.12.1 - - filelock=3.18.0 + - filelock=3.19.1 - font-ttf-dejavu-sans-mono=2.37 - font-ttf-inconsolata=3.000 - font-ttf-source-code-pro=2.038 @@ -82,66 +91,70 @@ dependencies: - fontconfig=2.15.0 - fonts-conda-ecosystem=1 - fonts-conda-forge=1 - - fonttools=4.56.0 - - freetype=2.13.3 - - fribidi=1.0.10 - - gcc_impl_linux-64=14.2.0 + - fonttools=4.60.1 + - freetype=2.14.1 + - fribidi=1.0.16 + - gcc_impl_linux-64=15.2.0 - gffread=0.12.7 - gffutils=0.13 - - 
gfortran_impl_linux-64=14.2.0 + - gfortran_impl_linux-64=15.2.0 - giflib=5.2.2 - gitdb=4.0.12 - - gitpython=3.1.44 - - graphite2=1.3.13 + - gitpython=3.1.45 + - graphite2=1.3.14 - gsl=1.16 - - gxx_impl_linux-64=14.2.0 - - h11=0.14.0 - - h2=4.2.0 - - harfbuzz=11.0.0 - - hatch=1.14.0 + - gxx_impl_linux-64=15.2.0 + - h11=0.16.0 + - h2=4.3.0 + - h5py=3.13.0 + - harfbuzz=11.4.5 + - hatch=1.14.1 - hatchling=1.27.0 - hdf5=1.14.3 - hisat2=2.2.1 + - hmmlearn=0.3.3 - hpack=4.1.0 - html5lib=1.1 - - htslib=1.21 - - httpcore=1.0.7 + - htslib=1.22.1 + - httpcore=1.0.9 - httpx=0.28.1 - humanfriendly=10.0 - - humanize=4.12.2 + - humanize=4.13.0 - hyperframe=6.1.0 - hyperlink=21.0.0 - icu=75.1 - idna=3.10 - imagesize=1.4.1 - immutables=0.21 - - importlib-metadata=8.6.1 + - importlib-metadata=8.7.0 - importlib_resources=6.5.2 - iniconfig=2.0.0 - intervalstats=1.01 - - ipython=9.0.2 + - ipython=9.6.0 - ipython_pygments_lexers=1.1.1 - isa-l=2.31.1 - jaraco.classes=3.4.0 - jaraco.context=6.0.1 - - jaraco.functools=4.1.0 + - jaraco.functools=4.3.0 - jedi=0.19.2 - jeepney=0.9.0 - jinja2=3.1.6 - - jsonschema=4.23.0 - - jsonschema-specifications=2024.10.1 - - jupyter_core=5.7.2 + - joblib=1.5.2 + - jsonschema=4.25.1 + - jsonschema-specifications=2025.9.1 + - jupyter_core=5.8.1 - kaleido-core=0.2.1 - kallisto=0.51.1 - - kernel-headers_linux-64=3.10.0 + - kernel-headers_linux-64=5.14.0 - keyring=25.6.0 - - keyutils=1.6.1 - - kiwisolver=1.4.7 + - keyutils=1.6.3 + - kiwisolver=1.4.9 - krb5=1.21.3 - lcms2=2.17 - - ld_impl_linux-64=2.43 + - ld_impl_linux-64=2.44 + - legacy-api-wrap=1.4.1 - lerc=4.0.0 - - libaec=1.1.3 + - libaec=1.1.4 - libblas=3.9.0 - libboost=1.85.0 - libboost-devel=1.85.0 @@ -151,91 +164,100 @@ dependencies: - libbrotlienc=1.1.0 - libcblas=3.9.0 - libcups=2.3.3 - - libcurl=8.12.1 + - libcurl=8.14.1 - libdeflate=1.22 - libedit=3.1.20250104 - libev=4.33 - - libexpat=2.6.4 + - libexpat=2.7.1 - libffi=3.4.6 - - libgcc=14.2.0 - - libgcc-devel_linux-64=14.2.0 - - libgcc-ng=14.2.0 + 
- libfreetype=2.14.1 + - libfreetype6=2.14.1 + - libgcc=15.2.0 + - libgcc-devel_linux-64=15.2.0 + - libgcc-ng=15.2.0 - libgd=2.3.3 - libgff=2.0.0 - - libgfortran=14.2.0 - - libgfortran5=14.2.0 - - libglib=2.84.0 - - libgomp=14.2.0 - - libhwloc=2.11.2 + - libgfortran=15.2.0 + - libgfortran5=15.2.0 + - libglib=2.84.3 + - libgomp=15.2.0 + - libhwloc=2.12.1 - libiconv=1.18 - libjemalloc=5.3.0 - - libjpeg-turbo=3.0.0 + - libjpeg-turbo=3.1.0 - liblapack=3.9.0 - liblapacke=3.9.0 - - liblzma=5.6.4 - - liblzma-devel=5.6.4 - - libnghttp2=1.64.0 + - liblzma=5.8.1 + - liblzma-devel=5.8.1 + - libnghttp2=1.67.0 - libnsl=2.0.1 - - libopenblas=0.3.29 - - libopenssl-static=3.4.1 - - libpng=1.6.47 - - libsanitizer=14.2.0 - - libsqlite=3.49.1 + - libopenblas=0.3.30 + - libopenssl-static=3.5.4 + - libpng=1.6.50 + - libsanitizer=15.2.0 + - libsqlite=3.50.4 - libssh2=1.11.1 - - libstdcxx=14.2.0 - - libstdcxx-devel_linux-64=14.2.0 - - libstdcxx-ng=14.2.0 + - libstdcxx=15.2.0 + - libstdcxx-devel_linux-64=15.2.0 + - libstdcxx-ng=15.2.0 - libtiff=4.7.0 - - libuuid=2.38.1 - - libwebp-base=1.5.0 + - libuuid=2.41.2 + - libwebp-base=1.6.0 - libxcb=1.17.0 - libxcrypt=4.4.36 - - libxml2=2.13.7 + - libxml2=2.14.6 + - libxml2-16=2.14.6 - libzlib=1.3.1 + - llvmlite=0.45.1 - logmuse=0.2.8 - logomaker=0.8.6 - macs3=3.0.3 - make=4.4.1 - - markdown=3.6 - - markdown-it-py=3.0.0 - - markupsafe=3.0.2 + - mariadb-connector-c=3.4.7 + - markdown=3.9 + - markdown-it-py=4.0.0 + - markupsafe=3.0.3 - mathjax=2.7.7 - - matplotlib-base=3.10.1 + - matplotlib-base=3.10.6 - matplotlib-inline=0.1.7 - mdurl=0.1.2 - - more-itertools=10.6.0 - - multiqc=1.28 + - more-itertools=10.8.0 + - msgpack-python=1.1.1 + - multiqc=1.31 - munkres=1.1.4 - mysql-connector-c=6.1.11 - - narwhals=1.32.0 + - narwhals=2.6.0 - natsort=8.4.0 - nbformat=5.10.4 - ncbi-vdb=3.2.1 - ncurses=6.5 - - networkx=3.4.2 - - nspr=4.36 - - nss=3.110 - - numpy=2.2.4 - - numpydoc=1.8.0 - - openjdk=23.0.2 + - networkx=3.5 + - nspr=4.37 + - nss=3.117 + - 
numba=0.62.1 + - numcodecs=0.16.1 + - numpy=2.3.3 + - numpydoc=1.9.0 + - openjdk=24.0.2 - openjpeg=2.5.3 - openpyxl=3.1.5 - - openssl=3.4.1 + - openssl=3.5.4 - ossuuid=1.6.2 - - packaging=24.2 - - pandas=2.2.3 - - pandoc=3.6.4 - - pango=1.56.3 - - parso=0.8.4 + - packaging=25.0 + - pandas=2.3.3 + - pandoc=3.8.1 + - pango=1.56.4 + - parso=0.8.5 - pathspec=0.12.1 - patsy=1.0.1 - pbzip2=1.1.13 - - pcre2=10.44 + - pcre2=10.45 - pephubclient=0.4.4 - peppy=0.40.7 - perl=5.32.1 - perl-alien-build=2.84 - - perl-alien-libxml2=0.17 + - perl-alien-build-plugin-download-gitlab=0.01 + - perl-alien-libxml2=0.20 - perl-business-isbn=3.007 - perl-business-isbn-data=20210112.006 - perl-capture-tiny=0.48 @@ -249,7 +271,7 @@ dependencies: - perl-file-temp=0.2304 - perl-file-which=1.24 - perl-gd=2.83 - - perl-gdgraph=1.54 + - perl-gdgraph=1.56 - perl-gdtextutil=0.86 - perl-importer=0.026 - perl-parent=0.243 @@ -257,13 +279,13 @@ dependencies: - perl-pathtools=3.75 - perl-scope-guard=0.21 - perl-sub-info=0.002 - - perl-term-table=0.024 + - perl-term-table=0.025 - perl-test-fatal=0.016 - perl-test-nowarnings=1.06 - perl-test-warnings=0.031 - perl-test2-suite=0.000163 - perl-try-tiny=0.31 - - perl-uri=5.17 + - perl-uri=5.34 - perl-xml-libxml=2.0210 - perl-xml-namespacesupport=1.12 - perl-xml-sax=1.02 @@ -272,17 +294,17 @@ dependencies: - picard=2.27.5 - pickleshare=0.7.5 - pigz=2.8 - - pillow=11.1.0 - - pip=25.0.1 - - pixman=0.44.2 - - pkgutil-resolve-name=1.3.10 - - plac=1.4.3 - - platformdirs=4.3.7 - - plotly=6.0.1 - - pluggy=1.5.0 + - pillow=11.3.0 + - pip=25.2 + - pixman=0.46.4 + - plac=1.4.5 + - platformdirs=4.4.0 + - plotly=6.3.1 + - pluggy=1.6.0 + - polars-lts-cpu=1.33.1 - preseq=2.0.2 - - prompt-toolkit=3.0.50 - - psutil=7.0.0 + - prompt-toolkit=3.0.52 + - psutil=7.1.0 - pthread-stubs=0.4 - ptyprocess=0.7.0 - pulp=2.8.0 @@ -292,64 +314,70 @@ dependencies: - pybedtools=0.12.0 - pybigwig=0.3.24 - pycparser=2.22 - - pydantic=2.10.6 - - pydantic-core=2.27.2 - - pyfaidx=0.8.1.3 - - 
pygments=2.19.1 - - pyparsing=3.2.3 + - pydantic=2.11.10 + - pydantic-core=2.33.2 + - pyfaidx=0.9.0.3 + - pygments=2.19.2 + - pynndescent=0.5.13 + - pyparsing=3.2.5 - pysam=0.22.1 - pysocks=1.7.1 - - pytest=8.3.5 - - pytest-xdist=3.6.1 - - python=3.11.11 + - pytest=8.4.2 + - pytest-xdist=3.8.0 + - python=3.11.13 - python-dateutil=2.9.0.post0 - - python-dotenv=1.1.0 - - python-fastjsonschema=2.21.1 - - python-isal=1.7.2 + - python-dotenv=1.1.1 + - python-fastjsonschema=2.21.2 + - python-gil=3.11.13 + - python-isal=1.8.0 - python-kaleido=0.2.1 - python-tzdata=2025.2 - - python-zlib-ng=0.5.1 + - python-zlib-ng=1.0.0 - python_abi=3.11 - - pytz=2024.1 - - pyvcf3=1.0.3 - - pyyaml=6.0.2 + - pytz=2025.2 + - pyvcf3=1.0.4 + - pyyaml=6.0.3 - qhull=2020.2 - r-base=4.2.3 - readline=8.2 - referencing=0.36.2 - - regex=2024.11.6 - - requests=2.32.3 + - regex=2025.9.18 + - requests=2.32.5 - reretry=0.11.8 - - rich=13.9.4 - - rich-click=1.8.8 + - rich=14.1.0 + - rich-click=1.9.2 - roman-numerals-py=3.1.0 - - rpds-py=0.24.0 + - rpds-py=0.27.1 - rseqc=5.0.4 - salmon=1.10.3 - - samtools=1.21 - - scipy=1.15.2 + - samtools=1.22.1 + - scanpy=1.11.4 + - scikit-learn=1.7.2 + - scipy=1.16.2 - seaborn=0.13.2 - seaborn-base=0.13.2 - - secretstorage=3.3.3 - - sed=4.8 - - setuptools=75.8.2 + - secretstorage=3.4.0 + - sed=4.9 + - session-info2=0.2.2 + - setuptools=80.9.0 - shellingham=1.5.4 - - simplejson=3.20.1 + - simplejson=3.20.2 - six=1.17.0 - - slack-sdk=3.35.0 - - slack_sdk=3.35.0 - - smart_open=7.1.0 + - slack-sdk=3.36.0 + - slack_sdk=3.36.0 + - smart_open=7.3.1 - smmap=5.0.2 - - snakemake=9.1.3 - - snakemake-interface-common=1.17.4 - - snakemake-interface-executor-plugins=9.3.5 - - snakemake-interface-logger-plugins=1.2.3 - - snakemake-interface-report-plugins=1.1.0 - - snakemake-interface-storage-plugins=4.2.1 - - snakemake-minimal=9.1.3 + - snakemake=9.12.0 + - snakemake-interface-common=1.22.0 + - snakemake-interface-executor-plugins=9.3.9 + - snakemake-interface-logger-plugins=1.2.4 
+ - snakemake-interface-report-plugins=1.2.0 + - snakemake-interface-scheduler-plugins=2.0.1 + - snakemake-interface-storage-plugins=4.2.3 + - snakemake-minimal=9.12.0 - sniffio=1.3.1 - - snowballstemmer=2.2.0 - - soupsieve=2.5 + - snowballstemmer=3.0.1 + - soupsieve=2.8 - spectra=0.0.11 - sphinx=8.2.3 - sphinxcontrib-applehelp=2.0.0 @@ -358,57 +386,59 @@ dependencies: - sphinxcontrib-jsmath=1.0.1 - sphinxcontrib-qthelp=2.0.0 - sphinxcontrib-serializinghtml=1.1.10 - - sqlite=3.49.1 - - sra-tools=3.2.0 + - sqlite=3.50.4 + - sra-tools=3.2.1 - stack_data=0.6.3 - - staden_io_lib=1.15.0 + - staden_io_lib=1.15.1 - star=2.7.11b - - statsmodels=0.14.4 - - subread=2.0.8 - - sysroot_linux-64=2.17 + - statsmodels=0.14.5 + - subread=2.1.1 + - sysroot_linux-64=2.34 - tabulate=0.9.0 - - tbb=2022.0.0 + - tbb=2022.2.0 + - threadpoolctl=3.6.0 - throttler=1.2.2 - - tiktoken=0.9.0 + - tiktoken=0.11.0 - tk=8.6.13 - tktable=2.10 - tomli=2.2.1 - tomli-w=1.2.0 - - tomlkit=0.13.2 + - tomlkit=0.13.3 - tqdm=4.67.1 - trackhub=1.0 - traitlets=5.14.3 - - trove-classifiers=2025.3.19.19 - - typeguard=4.4.2 - - typer=0.15.2 - - typer-slim=0.15.2 - - typer-slim-standard=0.15.2 - - typing-extensions=4.13.0 - - typing_extensions=4.13.0 + - trove-classifiers=2025.9.11.17 + - typeguard=4.4.4 + - typer=0.19.2 + - typer-slim=0.19.2 + - typer-slim-standard=0.19.2 + - typing-extensions=4.15.0 + - typing-inspection=0.4.2 + - typing_extensions=4.15.0 - tzdata=2025b - ubiquerg=0.8.0 - - ucsc-bedgraphtobigwig=472 - - ucsc-bedsort=469 - - ucsc-bedtobigbed=473 - - ucsc-bigwigmerge=469 - - ucsc-fetchchromsizes=469 - - ucsc-genepredtobed=469 - - ucsc-gtftogenepred=469 - - ucsc-liftover=469 - - ucsc-oligomatch=469 - - ucsc-stringify=472 - - ucsc-twobittofa=472 - - ucsc-wigtobigwig=472 + - ucsc-bedgraphtobigwig=482 + - ucsc-bedsort=482 + - ucsc-bedtobigbed=482 + - ucsc-bigwigmerge=482 + - ucsc-fetchchromsizes=482 + - ucsc-genepredtobed=482 + - ucsc-gtftogenepred=482 + - ucsc-liftover=482 + - ucsc-oligomatch=482 + - 
ucsc-twobittofa=482 + - ucsc-wigtobigwig=482 + - umap-learn=0.5.9.post2 - unicodedata2=16.0.0 - - urllib3=2.3.0 + - urllib3=2.5.0 - userpath=1.9.2 - - uv=0.6.10 + - uv=0.8.22 - veracitools=0.1.3 - - virtualenv=20.29.3 - - wcwidth=0.2.13 + - virtualenv=20.34.0 + - wcwidth=0.2.14 - webencodings=0.5.1 - wheel=0.45.1 - - wrapt=1.17.2 + - wrapt=1.17.3 - xopen=2.0.2 - xorg-libice=1.1.2 - xorg-libsm=1.2.6 @@ -416,19 +446,20 @@ dependencies: - xorg-libxau=1.0.12 - xorg-libxdmcp=1.1.5 - xorg-libxext=1.3.6 - - xorg-libxfixes=6.0.1 + - xorg-libxfixes=6.0.2 - xorg-libxi=1.8.2 - xorg-libxrandr=1.5.4 - xorg-libxrender=0.9.12 - xorg-libxt=1.3.1 - xorg-libxtst=1.2.5 - - xz=5.6.4 - - xz-gpl-tools=5.6.4 - - xz-tools=5.6.4 + - xz=5.8.1 + - xz-gpl-tools=5.8.1 + - xz-tools=5.8.1 - yaml=0.2.5 - - yte=1.7.0 - - zipp=3.21.0 + - yte=1.8.1 + - zarr=3.1.3 + - zipp=3.23.0 - zlib=1.3.1 - - zlib-ng=2.2.4 - - zstandard=0.23.0 + - zlib-ng=2.2.5 + - zstandard=0.25.0 - zstd=1.5.7 From 22e3f5d6fd36b0a65ec59fb0e1154b36c02e9a67 Mon Sep 17 00:00:00 2001 From: Ryan Dale <115406+daler@users.noreply.github.com> Date: Sun, 5 Oct 2025 20:30:58 +0000 Subject: [PATCH 100/196] move relevant references rules to rnaseq --- workflows/rnaseq/Snakefile | 257 ++++++++++++++++++++++++++++++++++++- 1 file changed, 255 insertions(+), 2 deletions(-) diff --git a/workflows/rnaseq/Snakefile b/workflows/rnaseq/Snakefile index 5ec50596..35c12757 100644 --- a/workflows/rnaseq/Snakefile +++ b/workflows/rnaseq/Snakefile @@ -33,8 +33,6 @@ rule all: "data/rnaseq_aggregation/multiqc.html", -include: "../references/Snakefile" - # Optionally run `snakemake strand_check` to do a preliminary run on # automatically-subset data to evaluate strandedness. 
include: "strand_check.smk" @@ -62,6 +60,261 @@ rule symlinks: utils.make_relative_symlink(src, linkname) +rule fasta: + output: + temporary(f"{REFERENCES}/genome.fa.gz"), + log: + f"{REFERENCES}/logs/genome.fa.gz.log", + resources: + mem_mb="4g", + runtime="2h", + run: + utils.download_and_postprocess( + urls=config["fasta"]["url"], + postprocess=config["fasta"].get("postprocess", None), + outfile=output[0], + log=log, + ) + + +rule gtf: + output: + temporary(f"{REFERENCES}/annotation.gtf.gz"), + log: + f"{REFERENCES}/logs/annotation.gtf.gz.log", + resources: + mem="4g", + runtime="2h", + run: + utils.download_and_postprocess( + urls=config["gtf"]["url"], + postprocess=config["gtf"].get("postprocess", None), + outfile=output[0], + log=log, + ) + + +rule rrna_fasta: + output: + f"{REFERENCES}/rrna.fa.gz", + log: + f"{REFERENCES}/logs/rrna.fa.log", + resources: + mem="4g", + runtime="2h", + run: + utils.download_and_postprocess( + urls=config["rrna"]["url"], + postprocess=config["rrna"].get("postprocess", None), + outfile=output[0], + log=log, + ) + + +rule unzip: + input: + f"{REFERENCES}/{{prefix}}.gz", + output: + f"{REFERENCES}/{{prefix}}", + resources: + mem="4g", + runtime="2h", + shell: + "gunzip -c {input} > {output}" + + +rule rrna_index: + input: + f"{REFERENCES}/rrna.fa", + output: + f"{REFERENCES}/bowtie2/rrna.1.bt2", + f"{REFERENCES}/bowtie2/rrna.fa", + log: + f"{REFERENCES}/logs/bowtie2_rrna.log", + resources: + mem="32g", + disk="50g", + runtime="8h", + threads: 8 + run: + index = f"{REFERENCES}/bowtie2/rrna" + shell("bowtie2-build" " --threads {threads}" " {input}" " {index}" " &> {log}") + utils.make_relative_symlink(input[0], output[-1]) + + +rule star_index: + input: + fasta=f"{REFERENCES}/genome.fa", + gtf=f"{REFERENCES}/annotation.gtf", + output: + f"{REFERENCES}/star/Genome", + log: + f"{REFERENCES}/logs/star.log", + threads: 8 + resources: + mem="64g", + runtime="8h", + run: + genomedir = os.path.dirname(output[0]) + shell("rm -r {genomedir}") 
+ shell("mkdir -p {genomedir}") + shell( + "STAR " + "--runMode genomeGenerate " + "--runThreadN {threads} " + "--genomeDir {genomedir} " + "--genomeFastaFiles {input.fasta} " + # NOTE: GTF is optional + "--sjdbGTFfile {input.gtf} " + # NOTE: STAR docs say that 100 should work well. + "--sjdbOverhang 100 " + # NOTE: for small genomes, may need to scale this down to + # min(14, log2(GenomeLength) / 2 - 1) + # --genomeSAindexNbases 14 + "&> {log}" + ) + # STAR writes a hard-coded Log.out file to the current working + # directory. So put that on the end of the log file for the rule and + # then clean up. + shell("cat {genomedir}/Log.out >> {log} && rm {genomedir}/Log.out") + shell("ln -s {input.fasta} {genomedir}") + + +rule transcriptome_fasta: + input: + fasta=f"{REFERENCES}/genome.fa", + gtf=f"{REFERENCES}/annotation.gtf", + output: + f"{REFERENCES}/transcriptome.fa", + resources: + mem="4g", + runtime="2h", + shell: + "gffread {input.gtf} -w {output} -g {input.fasta}" + + +rule salmon_index: + input: + f"{REFERENCES}/transcriptome.fa", + output: + f"{REFERENCES}/salmon/versionInfo.json", + log: + f"{REFERENCES}/logs/salmon.log", + params: + outdir=f"{REFERENCES}/salmon", + resources: + mem="32g", + runtime="2h", + run: + outdir = os.path.dirname(output[0]) + shell("salmon index " "--transcripts {input} " "--index {outdir} " "&> {log}") + + +rule conversion_refflat: + input: + f"{REFERENCES}/annotation.gtf.gz", + output: + f"{REFERENCES}/annotation.refflat", + log: + f"{REFERENCES}/logs/annotation.refflat.log", + resources: + mem="2g", + runtime="2h", + shell: + "gtfToGenePred -ignoreGroupsWithoutExons {input} {output}.tmp " + """&& awk '{{print $1"\t"$0}}' {output}.tmp > {output} """ + "&& rm {output}.tmp " + + +rule conversion_bed12: + input: + f"{REFERENCES}/annotation.gtf.gz", + output: + f"{REFERENCES}/annotation.bed12", + resources: + mem="2g", + runtime="2h", + shell: + "gtfToGenePred -ignoreGroupsWithoutExons {input} {output}.tmp " + "&& genePredToBed 
{output}.tmp {output} " + "&& rm {output}.tmp" + + +rule chromsizes: + input: + f"{REFERENCES}/genome.fa.gz", + output: + f"{REFERENCES}/genome.chromsizes", + log: + f"{REFERENCES}/logs/genome.chromsizes.log", + params: + java_args="-Xmx20g", + # java_args='-Xmx2g' # [TEST SETTINGS -1] + resources: + mem="24g", + runtime="2h", + shell: + "export LC_COLLATE=C; " + "rm -f {output}.tmp " + "&& picard " + "{params.java_args} " + "CreateSequenceDictionary R={input} O={output}.tmp &> {log} " + '&& grep "^@SQ" {output}.tmp ' + """| awk '{{print $2, $3}}' """ + '| sed "s/SN://g;s/ LN:/\\t/g" ' + "| sort -k1,1 > {output} " + "&& rm -f {output}.tmp " + + +rule mappings: + """ + Creates gzipped TSV mapping between attributes in the GTF. + """ + input: + gtf=f"{REFERENCES}/annotation.gtf.gz", + output: + f"{REFERENCES}/annotation.mapping.tsv.gz", + params: + include_featuretypes=lambda wildcards, output: conversion_kwargs[ + output[0] + ].get("include_featuretypes", []), + resources: + mem="2g", + runtime="2h", + run: + import gffutils + + # Will want to change the setting back to what it was originally when + # we're done + orig_setting = gffutils.constants.always_return_list + gffutils.constants.always_return_list = False + + include_featuretypes = params.include_featuretypes + + res = [] + for f in gffutils.DataIterator(input[0]): + + ft = f.featuretype + + if include_featuretypes and (ft not in include_featuretypes): + continue + + d = dict(f.attributes) + d["__featuretype__"] = ft + res.append(d) + + df = pandas.DataFrame(res) + + # Depending on how many attributes there were and the + # include_featuretypes settings, this may take a while. 
+ df = df.drop_duplicates() + + df.to_csv(output[0], sep="\t", index=False, compression="gzip") + + # Restore original setting + gffutils.constants.always_return_list = orig_setting + + rule symlink_targets: input: expand( From 0793fa31ffdf86c86981ec070ab3042f630db622 Mon Sep 17 00:00:00 2001 From: Ryan Dale <115406+daler@users.noreply.github.com> Date: Sun, 5 Oct 2025 20:31:55 +0000 Subject: [PATCH 101/196] convert bowtie2 to shell block --- workflows/rnaseq/Snakefile | 31 ++++++++++++------------------- 1 file changed, 12 insertions(+), 19 deletions(-) diff --git a/workflows/rnaseq/Snakefile b/workflows/rnaseq/Snakefile index 35c12757..07bdedca 100644 --- a/workflows/rnaseq/Snakefile +++ b/workflows/rnaseq/Snakefile @@ -485,25 +485,18 @@ rule rRNA: resources: mem="2g", runtime="2h", - run: - prefix = os.path.commonprefix(input.index).rstrip(".") - sam = output.bam.replace(".bam", ".sam") - shell( - "bowtie2 " - "-x {prefix} " - "-U {input.fastq} " - "--threads {threads} " - "-k 1 " - "--no-unal " - "-S {sam} " - "> {log} 2>&1" - ) - - shell( - "samtools view -Sb {sam} " - "| samtools sort - -o {output.bam} -O BAM " - "&& rm {sam}" - ) + shell: + "bowtie2 " + f"-x {REFERENCES}/bowtie2/rrna " + "-U {input.fastq} " + "--threads {threads} " + "-k 1 " + "--no-unal " + "-S {output.bam}.sam " + "> {log} 2>&1 " + "&& samtools view -Sb {output.bam}.sam " + "| samtools sort - -o {output.bam} -O BAM " + "&& rm {output.bam}.sam" rule fastq_count: From 84bb39f42ea4a8c72bfcef837e11be5afd480367 Mon Sep 17 00:00:00 2001 From: Ryan Dale <115406+daler@users.noreply.github.com> Date: Sun, 5 Oct 2025 20:32:15 +0000 Subject: [PATCH 102/196] move strand check and sra to bottom --- workflows/rnaseq/Snakefile | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/workflows/rnaseq/Snakefile b/workflows/rnaseq/Snakefile index 07bdedca..9ebfac5d 100644 --- a/workflows/rnaseq/Snakefile +++ b/workflows/rnaseq/Snakefile @@ -33,14 +33,6 @@ rule all: 
"data/rnaseq_aggregation/multiqc.html", -# Optionally run `snakemake strand_check` to do a preliminary run on -# automatically-subset data to evaluate strandedness. -include: "strand_check.smk" - -# If the sampletable is from SRA, handle it here. -include: "sra.smk" - - rule symlinks: input: lambda wc: ( @@ -959,3 +951,10 @@ rule multiqc: "{analysis_directory} " "&> {log} " ) + +# Optionally run `snakemake strand_check` to do a preliminary run on +# automatically-subset data to evaluate strandedness. +include: "strand_check.smk" + +# If the sampletable is from SRA, handle it here. +include: "sra.smk" From 822afe2d41fc951db01f57dfbc1d682ef52b603f Mon Sep 17 00:00:00 2001 From: Ryan Dale <115406+daler@users.noreply.github.com> Date: Sun, 5 Oct 2025 20:33:18 +0000 Subject: [PATCH 103/196] simplify inputs --- workflows/rnaseq/Snakefile | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) diff --git a/workflows/rnaseq/Snakefile b/workflows/rnaseq/Snakefile index 9ebfac5d..25f50a38 100644 --- a/workflows/rnaseq/Snakefile +++ b/workflows/rnaseq/Snakefile @@ -314,7 +314,6 @@ rule symlink_targets: ), - rule cutadapt: input: fastq=expand("data/rnaseq_samples/{{sample}}/{{sample}}_R{n}.fastq.gz", n=n), @@ -397,7 +396,7 @@ rule star: input: fastq=rules.cutadapt.output, index=rules.star_index.output, - annotation=f"{REFERENCES}/annotation.gtf", + annotation=f"{REFERENCES}/annotation.gtf.gz", output: bam=temporary("data/rnaseq_samples/{sample}/{sample}.cutadapt.bam"), sjout=temporary( @@ -459,16 +458,7 @@ rule star: rule rRNA: input: fastq="data/rnaseq_samples/{sample}/{sample}_R1.cutadapt.fastq.gz", - index=multiext( - f"{REFERENCES}/bowtie2/rrna", - ".1.bt2", - ".2.bt2", - ".3.bt2", - ".4.bt2", - ".rev.1.bt2", - ".rev.2.bt2", - ".fa", - ), + index=f"{REFERENCES}/bowtie2/rrna.1.bt2", output: bam="data/rnaseq_samples/{sample}/rRNA/{sample}.cutadapt.rrna.bam", log: From 1c1f6f7e409fc8facbd7a42ba2c7558df26fd39c Mon Sep 17 00:00:00 2001 From: Ryan Dale 
<115406+daler@users.noreply.github.com> Date: Sun, 5 Oct 2025 20:33:32 +0000 Subject: [PATCH 104/196] reflect changes to rule name --- workflows/rnaseq/strand_check.smk | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/rnaseq/strand_check.smk b/workflows/rnaseq/strand_check.smk index 9c8a3467..bd7c45d4 100644 --- a/workflows/rnaseq/strand_check.smk +++ b/workflows/rnaseq/strand_check.smk @@ -1,7 +1,7 @@ rule sample_strand_check: input: fastq=expand("data/rnaseq_samples/{{sample}}/{{sample}}_R{n}.fastq.gz", n=n), - index=expand(rules.bowtie2_index.output, label="genome"), + index=expand(rules.rrna_index.output, label="genome"), bed12=rules.conversion_bed12.output, output: strandedness="strand_check/{sample}/{sample}.strandedness", From cff4e095831c5a0fe6e8e520d228a2c613c47af9 Mon Sep 17 00:00:00 2001 From: Ryan Dale <115406+daler@users.noreply.github.com> Date: Sun, 5 Oct 2025 20:34:22 +0000 Subject: [PATCH 105/196] minor formatting --- workflows/rnaseq/Snakefile | 1 - 1 file changed, 1 deletion(-) diff --git a/workflows/rnaseq/Snakefile b/workflows/rnaseq/Snakefile index 25f50a38..4f5e907e 100644 --- a/workflows/rnaseq/Snakefile +++ b/workflows/rnaseq/Snakefile @@ -390,7 +390,6 @@ rule fastqc: shell("mv {out_html} {output.html}") - rule star: "Align with STAR (1-pass mode)" input: From 4122012e76a70b5b53a0a80005fec1b0603068e2 Mon Sep 17 00:00:00 2001 From: Ryan Dale <115406+daler@users.noreply.github.com> Date: Tue, 7 Oct 2025 15:26:42 +0000 Subject: [PATCH 106/196] include additional .smk files when deploying --- deploy.py | 3 +++ workflows/chipseq/sra.smk | 40 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 43 insertions(+) create mode 100644 workflows/chipseq/sra.smk diff --git a/deploy.py b/deploy.py index 1981804c..8270348d 100755 --- a/deploy.py +++ b/deploy.py @@ -82,6 +82,8 @@ def write_include_file(source, flavor="all"): PATTERN_DICT = { "rnaseq": [ "include workflows/rnaseq/Snakefile", + "include 
workflows/rnaseq/strand_check.smk", + "include workflows/rnaseq/sra.smk", "recursive-include workflows/rnaseq/config *", "include workflows/rnaseq/rnaseq_trackhub.py", "recursive-include workflows/rnaseq/downstream *.Rmd", @@ -89,6 +91,7 @@ def write_include_file(source, flavor="all"): ], "chipseq": [ "include workflows/chipseq/Snakefile", + "include workflows/rnaseq/sra.smk", "recursive-include workflows/chipseq/config *", "include workflows/chipseq/chipseq_trackhub.py", ], diff --git a/workflows/chipseq/sra.smk b/workflows/chipseq/sra.smk new file mode 100644 index 00000000..5ee5f53b --- /dev/null +++ b/workflows/chipseq/sra.smk @@ -0,0 +1,40 @@ +if utils.detect_sra(sampletable): + sampletable["orig_filename"] = expand( + "original_data/sra_samples/{sample}/{sample}_R{n}.fastq.gz", sample=SAMPLES, n=1 + ) + + if is_paired: + sampletable["orig_filename_R2"] = expand( + "original_data/sra_samples/{sample}/{sample}_R{n}.fastq.gz", + sample=SAMPLES, + n=2, + ) + + rule fastq_dump: + output: + fastq=expand( + "original_data/sra_samples/{sample}/{sample}_R{n}.fastq.gz", + n=n, + allow_missing=True, + ), + log: + "original_data/sra_samples/{sample}/{sample}.fastq.gz.log", + params: + is_paired=is_paired, + # extra="-X 100000", # [enable for test] + resources: + mem="1g", + disk="1g", + runtime="2h", + run: + srr = sampletable.loc[wildcards.sample, "Run"] + extra = params.get("extra", "") + if is_paired: + shell("fastq-dump {srr} --gzip --split-files {extra} &> {log}") + shell("mv {srr}_1.fastq.gz {output[0]}") + shell("mv {srr}_2.fastq.gz {output[1]}") + else: + shell( + "fastq-dump {srr} -Z {extra} 2> {log} | gzip -c > {output[0]}.tmp" + ) + shell("mv {output[0]}.tmp {output[0]}") From d71877f1d7e3409459aebd10a25bfbd0ace00056 Mon Sep 17 00:00:00 2001 From: Ryan Dale <115406+daler@users.noreply.github.com> Date: Tue, 7 Oct 2025 15:27:18 +0000 Subject: [PATCH 107/196] update decision log --- docs/decisions.rst | 321 +++++++++++++++++++++++++++++++++++++++++++++ 1 file 
changed, 321 insertions(+) diff --git a/docs/decisions.rst b/docs/decisions.rst index 7abbc78d..ac808e63 100644 --- a/docs/decisions.rst +++ b/docs/decisions.rst @@ -83,3 +83,324 @@ indexes alongside the indexes. │ └── dmel_test │ └── └── dmel_test.fasta + +Params +------ +The ``params:`` directive allows `non-file parameters for rules +`__. +Much (perhaps all?) of what can be done in a ``params:`` directive can also be +done in the body of ``run:`` block. On one hand, it can be nice to have a plain +string ``shell:`` block, and put the complexity in the params. But on the other +hand, sometimes it is harder to follow what's happening in params than it would +be in Python in a ``run:`` block. + +This section talks about when and why we use params in lcdb-wf. + +One of the nice things sbout Snakemake is that the rules (in ``shell:`` blocks) +can be quite close to the equivalent command-line call. Since rules in these +Snakefiles are intended to be edited, it makes sense to keep them as close to +the command-line as is reasonable. + +Take the cutadapt rule, for example, where we typically would want to include +the adapters in the call, but it's not uncommon to add other arguments. Here +we're working with a simplified, single-end version of it: + +.. code-block:: python + + rule cutadapt: + input: + fastq='{sample}.fastq.gz" + output: + fastq='{sample}.cutadapt.fastq.gz' + threads: + 8 + shell: + "cutadapt " + "-o {output[0]} " + "-j {threads} " + "--nextseq-trim 20 " + "--overlap 6 " + "--minimum-length 25 " + "-a AGATCGGAAGAGCACACGTCTGAACTCCAGTCA " + "{input.fastq[0]} " + "&> {log}" + + +Here's an extreme way of adding params where we pull out each argument into +a separate params item. This isn't very flexible and has lots of repetition, so +we probably don't want this:: + +.. 
code-block:: python + + rule cutadapt: + input: + '{sample}.fastq.gz" + output: + '{sample}.cutadapt.fastq.gz' + threads: + 8 + params: + nextseq_trim="--nextseq-trim 20", + overlap="--overlap 6", + minimum_length=25, + a="AGATCGGAAGAGCACACGTCTGAACTCCAGTCA", + shell: + "cutadapt " + "-o {output} " + "-j {threads} " + "{params.nextseq_trim} " + "{params.overlap} " + "{params.minimum_length} " + "{params.a} " + "{input} " + "&> {log}" + +But we could add the arguments to be a single "extra" string and store that +in params, like this: + +.. code-block:: python + + rule cutadapt: + input: + '{sample}.fastq.gz" + output: + '{sample}.cutadapt.fastq.gz' + threads: + 8 + params: + extra=( + "--nextseq-trim 20 " + "--overlap 6 " + "--minimum-length 25 " + "-a AGATCGGAAGAGCACACGTCTGAACTCCAGTCA " + ) + shell: + "cutadapt " + "-o {output} " + "-j {threads} " + "{params.extra} " + "{input} " + "&> {log}" + +One thing that's nice about this is that the "changeable things" are visually in +a different location. When running Snakemake with `-p` then the params will be +filled in to make one long string, which we could use for debugging. + +But we want to support single- and paired-end reads, and the arguments to +cutadapt depend on that. Here's the actual rule: + +.. 
code-block:: python + + rule cutadapt: input: + fastq=expand("data/rnaseq_samples/{{sample}}/{{sample}}_R{n}.fastq.gz", n=n), + output: + fastq=expand( + "data/rnaseq_samples/{{sample}}/{{sample}}_R{n}.cutadapt.fastq.gz", n=n + ), + log: + "data/rnaseq_samples/{sample}/{sample}_cutadapt.fastq.gz.log", + threads: 6 + resources: + mem="2g", + runtime="2h", + run: + if is_paired: + shell( + "cutadapt " + "-o {output[0]} " + "-p {output[1]} " + "-j {threads} " + "--nextseq-trim 20 " + "--overlap 6 " + "--minimum-length 25 " + "-a AGATCGGAAGAGCACACGTCTGAACTCCAGTCA " + "-A AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGT " + "{input.fastq[0]} " + "{input.fastq[1]} " + "&> {log}" + ) + else: + shell( + "cutadapt " + "-o {output[0]} " + "-j {threads} " + "--nextseq-trim 20 " + "--overlap 6 " + "--minimum-length 25 " + "-a AGATCGGAAGAGCACACGTCTGAACTCCAGTCA " + "{input.fastq[0]} " + "&> {log}" + ) + +Notice that we have some shared arguments as well as a PE-specific adapter +argument. Converting this one to params would be something like the following: + +.. 
code-block:: python + + rule cutadapt: input: + fastq=expand("data/rnaseq_samples/{{sample}}/{{sample}}_R{n}.fastq.gz", n=n), + output: + fastq=expand( + "data/rnaseq_samples/{{sample}}/{{sample}}_R{n}.cutadapt.fastq.gz", n=n + ), + log: + "data/rnaseq_samples/{sample}/{sample}_cutadapt.fastq.gz.log", + threads: 6 + resources: + mem="2g", + runtime="2h", + params: + shared=( + "--nextseq-trim 20 " + "--overlap 6 " + "--minimum-length 25 " + "-a AGATCGGAAGAGCACACGTCTGAACTCCAGTCA " + ), + se_pe_specific=( + "-A AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGT " + ) if is_paired else "" + run: + if is_paired: + shell( + "cutadapt " + "-o {output[0]} " + "-p {output[1]} " + "-j {threads} " + "{params.shared} " + "{params.se_pe_specific} " + "{input.fastq[0]} " + "{input.fastq[1]} " + "&> {log}" + ) + else: + shell( + "cutadapt " + "-o {output[0]} " + "-j {threads} " + "{params.shared} " + "{params.se_pe_specific} " + "{input.fastq[0]} " + "&> {log}" + ) + +Note in this case we need to provide ``-o`` and ``-p`` arguments +separately for paired-end. So we still need to have the ``if is_paired`` clause +in the body of the rule. This one could be a little bit confusing with the +``se_pe_specific`` clause, but otherwise it supports both SE and PE. + +What if we split that out into params as well, so that everything SE or PE +specific is handled there? + +.. 
code-block:: python + + rule cutadapt: + input: + fastq=expand( + "data/chipseq_samples/{sample}/{sample}_R{n}.fastq.gz", + n=n, allow_missing=True), + output: + fastq=expand( + "data/chipseq_samples/{sample}/{sample}_R{n}.cutadapt.fastq.gz", + n=n, allow_missing=True), + log: + "data/chipseq_samples/{sample}/{sample}_cutadapt.fastq.gz.log", + threads: 6 + resources: + mem="2g", + runtime="2h", + params: + extra=( + "--nextseq-trim 20 " + "--overlap 6 " + "--minimum-length 25 " + "-a AGATCGGAAGAGCACACGTCTGAACTCCAGTCA " + ), + se_pe_specific=( + "-o {output[0]} " + "-p {output[1]} " + "-A AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGT " + "{input.fastq[0]} " + "{input.fastq[1]} " + if not is_paired else + "{input.fastq[0]} " + "-o {output[0]} " + ) + shell: + "cutadapt " + "-j {threads} " + "{params.se_pe_specific} " + "{params.extra} " + "&> {log}" + +Now it becomes a little harder to understand what's going on, and we may have +gone too far in pulling everything out into params. So maybe an absolute +principle of "everything in params" is not useful. + +Let's take another example, the featureCounts rule for RNA-seq: + +.. code-block:: python + + rule featurecounts: + input: + annotation=rules.gtf.output, + bam=rules.markduplicates.output.bam, + output: + "data/rnaseq_samples/{sample}/{sample}_featurecounts.txt", + log: + "data/rnaseq_samples/{sample}/{sample}_featurecounts.txt.log", + threads: 8 + resources: + mem="16g", + runtime="2h", + params: + strand_arg={ + "unstranded": "-s0", + "fr-firststrand": "-s2", + "fr-secondstrand": "-s1", + }[config["stranded"]], + se_pe_specific=( + "-p --countReadPairs" if is_paired + else "" + ), + extra="", + run: + shell( + "featureCounts " + "{params.strand_arg} " + "{params.se_pe_specific} " + "{params.extra} " + "-T {threads} " + "-a {input.annotation} " + "-o {output} " + "{input.bam} " + "&> {log}" + ) + +Here, it is important to have ``strand_arg`` be in the params. 
To understand +why, imagine if instead we determined that argument inside the ``run:`` block, +and then we changed the config file's stranded entry (``config["stranded"]``). +Then this rule would NOT re-run because the code didn't change -- Snakemake +does not *evaluate* the code in a ``run:`` block to determine if it changed. +However, it *does* evaluate the params. So in this case, it's necessary to keep +the strand argument detection in the params to take advantage of this behavior, +and correctly re-run the rule if the config's strand argument has changed. + +Next, we would want to decide whether *all* arguments should go in ``params:``. +In this case, since we're sort of forced to split out ``strand_arg``, we might +as well split everything out. + +In the end we have these observations: + +- strand-specific arguments *must* be in ``params:`` +- some tools have SE/PE-specific arguments. These need an ``if`` clause + *somewhere*, whether in a ``run:`` block or in ``params:`` +- understandability and configuration flexibility are important goals of lcdb-wf +- factoring out *everything* into params weakens understandibility + + +Guidelines: + +- Stranded arguments must be in params +- SE/PE arguments should be handled inside a ``run:`` block +- Any other arguments should be written in a ``shell:`` block or a ``shell()`` + call directly, to visually match the equivalent command-line call From d4925c46bc661f27f1191ed2a810a7d6eb535587 Mon Sep 17 00:00:00 2001 From: Ryan Dale <115406+daler@users.noreply.github.com> Date: Tue, 7 Oct 2025 15:27:38 +0000 Subject: [PATCH 108/196] overhaul of rnaseq and chipseq; rm references --- workflows/chipseq/Snakefile | 241 ++++++++++++++++----------- workflows/references/Snakefile | 294 --------------------------------- workflows/rnaseq/Snakefile | 119 +++++-------- 3 files changed, 184 insertions(+), 470 deletions(-) delete mode 100644 workflows/references/Snakefile diff --git a/workflows/chipseq/Snakefile 
b/workflows/chipseq/Snakefile index ce1243a9..1ce812db 100644 --- a/workflows/chipseq/Snakefile +++ b/workflows/chipseq/Snakefile @@ -11,9 +11,6 @@ from lib import chipseq configfile: "config/config.yaml" -include: "../references/Snakefile" - - REFERENCES = config.get("reference_dir", "../../references") sampletable = pd.read_table(config["sampletable"], sep="\t", comment="#") sampletable = sampletable.set_index(sampletable.columns[0], drop=False) @@ -27,6 +24,7 @@ peaks = chipseq.add_bams_to_peak_calling(config) wildcard_constraints: n="[1,2]", sample="|".join(SAMPLES), + ext=".fa|.gtf", localrules: @@ -34,43 +32,93 @@ localrules: symlink_targets, -rule targets: +rule all: input: "data/chipseq_aggregation/multiqc.html", expand("data/chipseq_merged/{label}/{label}.cutadapt.unique.nodups.bam.bigwig", label=LABELS), [v["bed"] for k, v in peaks.items()], -if utils.detect_sra(sampletable): - sampletable['orig_filename'] = expand( - 'original_data/sra_samples/{sample}/{sample}_R{n}.fastq.gz', sample=SAMPLES, n=1) +# If the sampletable is from SRA, handle it here. 
+include: "sra.smk" - if is_paired: - sampletable['orig_filename_R2'] = expand( - 'original_data/sra_samples/{sample}/{sample}_R{n}.fastq.gz', sample=SAMPLES, n=2) +rule fasta: + output: + temporary(f"{REFERENCES}/genome.fa.gz"), + log: + f"{REFERENCES}/logs/genome.fa.gz.log", + resources: + mem_mb="4g", + runtime="2h", + run: + utils.download_and_postprocess( + urls=config["fasta"]["url"], + postprocess=config["fasta"].get("postprocess", None), + outfile=output[0], + log=log, + ) - rule fastq_dump: - output: - fastq=expand('original_data/sra_samples/{sample}/{sample}_R{n}.fastq.gz', n=n, allow_missing=True) - log: - 'original_data/sra_samples/{sample}/{sample}.fastq.gz.log' - params: - is_paired=is_paired, - # extra="-X 100000", # [enable for test] - resources: - mem="1g", - disk="1g", - runtime="2h", - run: - srr = sampletable.loc[wildcards.sample, "Run"] - extra = params.get("extra", "") - if is_paired: - shell("fastq-dump {srr} --gzip --split-files {extra} &> {log}") - shell("mv {srr}_1.fastq.gz {output[0]}") - shell("mv {srr}_2.fastq.gz {output[1]}") - else: - shell("fastq-dump {srr} -Z {extra} 2> {log} | gzip -c > {output[0]}.tmp") - shell("mv {output[0]}.tmp {output[0]}") + +rule chromsizes: + input: + f"{REFERENCES}/genome.fa.gz", + output: + f"{REFERENCES}/genome.chromsizes", + log: + f"{REFERENCES}/logs/genome.chromsizes.log", + params: + java_args="-Xmx20g", # [disable for test] + # java_args='-Xmx2g' # [enable for test] + resources: + mem="24g", + runtime="2h", + shell: + "export LC_COLLATE=C; " + "rm -f {output}.tmp " + "&& picard " + "{params.java_args} " + "CreateSequenceDictionary R={input} O={output}.tmp &> {log} " + '&& grep "^@SQ" {output}.tmp ' + """| awk '{{print $2, $3}}' """ + '| sed "s/SN://g;s/ LN:/\\t/g" ' + "| sort -k1,1 > {output} " + "&& rm -f {output}.tmp " + + +rule unzip: + input: + f"{REFERENCES}/{{prefix}}{{ext}}.gz", + output: + f"{REFERENCES}/{{prefix}}{{ext}}", + resources: + mem="4g", + runtime="2h", + shell: + "gunzip -c 
{input} > {output}" + + +rule bowtie2_index: + input: + f"{REFERENCES}/genome.fa", + output: + f"{REFERENCES}/bowtie2/genome.1.bt2", + f"{REFERENCES}/bowtie2/genome.fa", + log: + f"{REFERENCES}/logs/bowtie2_genome.log", + resources: + mem="32g", + disk="50g", + runtime="8h", + threads: 8 + run: + prefix = subpath(output[0], strip_suffix=".1.bt2") + shell( + "bowtie2-build " + "--threads {threads} " + "{input} " + "{prefix} &> {log}" + ) + utils.make_relative_symlink(input[0], output[-1]) rule symlinks: @@ -119,9 +167,9 @@ rule cutadapt: if is_paired: shell( "cutadapt " + "-j {threads} " "-o {output[0]} " "-p {output[1]} " - "-j {threads} " "--nextseq-trim 20 " "--overlap 6 " "--minimum-length 25 " @@ -134,8 +182,8 @@ rule cutadapt: else: shell( "cutadapt " - "-o {output[0]} " "-j {threads} " + "-o {output[0]} " "--nextseq-trim 20 " "--overlap 6 " "--minimum-length 25 " @@ -145,20 +193,29 @@ rule cutadapt: ) + rule fastqc: input: - "{sample_dir}/{sample}/{sample}{suffix}", + "data/chipseq_samples/{sample}/{sample}{suffix}", threads: 1 output: - html="{sample_dir}/{sample}/fastqc/{sample}{suffix}_fastqc.html", - zip="{sample_dir}/{sample}/fastqc/{sample}{suffix}_fastqc.zip", + html="data/chipseq_samples/{sample}/fastqc/{sample}{suffix}_fastqc.html", + zip="data/chipseq_samples/{sample}/fastqc/{sample}{suffix}_fastqc.zip", resources: mem="8g", runtime="2h", log: - "{sample_dir}/{sample}/fastqc/{sample}{suffix}_fastqc.log", + "data/chipseq_samples/{sample}/fastqc/{sample}{suffix}_fastqc.log", run: + # Calculate the paths FastQC will create so we can move them to + # specified output files if needed. outdir = os.path.dirname(output.html) or "." 
+ outfile = os.path.basename(input[0]) + for s in [".fastq", ".fq", ".gz", ".bam"]: + outfile = outfile.replace(s, "") + out_zip = os.path.join(outdir, outfile + "_fastqc.zip") + out_html = os.path.join(outdir, outfile + "_fastqc.html") + shell( "fastqc " "--noextract " @@ -167,13 +224,9 @@ rule fastqc: "{input} " "&> {log} " ) - outfile = os.path.basename(input[0]) - for s in [".fastq", ".fq", ".gz", ".bam"]: - outfile = outfile.replace(s, "") - out_zip = os.path.join(outdir, outfile + "_fastqc.zip") + if not os.path.abspath(out_zip) == os.path.abspath(output.zip): shell("mv {out_zip} {output.zip}") - out_html = os.path.join(outdir, outfile + "_fastqc.html") if not os.path.abspath(out_html) == os.path.abspath(output.html): shell("mv {out_html} {output.html}") @@ -182,16 +235,7 @@ rule bowtie2: input: fastq=expand( "data/chipseq_samples/{sample}/{sample}_R{n}.cutadapt.fastq.gz", n=n, allow_missing=True), - index=multiext( - f"{REFERENCES}/bowtie2/genome", - ".1.bt2", - ".2.bt2", - ".3.bt2", - ".4.bt2", - ".rev.1.bt2", - ".rev.2.bt2", - ".fa", - ), + index=f"{REFERENCES}/bowtie2/genome.1.bt2", output: bam=temporary("data/chipseq_samples/{sample}/{sample}.cutadapt.bam"), log: @@ -201,28 +245,26 @@ rule bowtie2: mem="32g", runtime="2h", run: - prefix = os.path.commonprefix(input.index).rstrip(".") - sam = output.bam.replace(".bam", ".sam") - fastqs = ( - f"-1 {input.fastq[0]} -2 {input.fastq[1]}" - if is_paired - else f"-U {input.fastq}" - ) + prefix = subpath(input.index, strip_suffix=".1.bt2") + + if is_paired: + fastqs = f"-1 {input.fastq[0]} -2 {input.fastq[1]}" + else: + fastqs = f"-U {input.fastq}" + shell( "bowtie2 " - "-x {prefix} " + f"-x {prefix} " "{fastqs} " - "--no-unal " "--threads {threads} " - "-S {sam} " - "> {log} 2>&1" - ) - + "--no-unal " + "-S {output.bam}.sam " + "> {log} 2>&1 ") shell( - "samtools view -Sb {sam} " - "| samtools sort - -o {output.bam} -O BAM " - "&& rm {sam}" + "samtools view -Sb {output.bam}.sam " + "| samtools sort -O BAM - -o 
{output.bam}" ) + shell("rm {output.bam}.sam") rule unique: @@ -235,12 +277,15 @@ rule unique: mem="1g", runtime="2h", params: + shell: + "samtools view " + "-b " # NOTE: the quality score chosen here should reflect the scores output # by the aligner used. For example, STAR uses 255 as max mapping # quality. - extra="-q 20", - shell: - "samtools view -b {params.extra} {input} > {output}" + "-q 20 " + "{input} " + "> {output}" rule fastq_count: @@ -304,8 +349,8 @@ rule markduplicates: "MarkDuplicates " "-INPUT {input.bam} " "-OUTPUT {output.bam} " - "-REMOVE_DUPLICATES true " "-METRICS_FILE {output.metrics} " + "-REMOVE_DUPLICATES true " "-VALIDATION_STRINGENCY LENIENT " "&> {log}" @@ -379,20 +424,12 @@ rule bigwig: "-p {threads} " "--minMappingQuality 20 " "--ignoreDuplicates " - # Can't use the CPM normalization for testing due to <1000 reads total - # in example data - "--normalizeUsing CPM " # [disable for test] "--extendReads 300 " + "--normalizeUsing CPM " # [disable for test] "&> {log}" rule fingerprint: - """ - Runs deepTools plotFingerprint to assess how well the ChIP experiment - worked. - - Note: uses the merged techreps. - """ input: bams=lambda wc: expand("data/chipseq_merged/{label}/{label}.cutadapt.unique.nodups.merged.bam", label=wc.ip_label), control=lambda wc: expand( @@ -429,20 +466,19 @@ rule fingerprint: # The JSDsample argument is disabled for testing as it dramatically # increases the run time. 
"{jsdsample_arg} " # [disable for test] - "--smartLabels " - "--extendReads=300 " - "--skipZeros " "--outQualityMetrics {output.metrics} " "--outRawCounts {output.raw_counts} " "--plotFile {output.plot} " # Default is 500k; use fewer to speed up testing: # '--numberOfSamples 50 ' # [enable for test] + "--smartLabels " + "--extendReads=300 " + "--skipZeros " "&> {log} " '&& sed -i "s/NA/0.0/g" {output.metrics} ' ) - rule macs: input: ip=lambda wc: expand( @@ -560,11 +596,11 @@ rule plotcorrelation: shell: "plotCorrelation " "--corData {input} " + "--plotFile {output.heatmap} " + "--outFileCorMatrix {output.tab} " "--corMethod spearman " "--whatToPlot heatmap " - "--plotFile {output.heatmap} " "--colorMap Reds " - "--outFileCorMatrix {output.tab}" # NOTE: if you're expecting negative correlation, try a divergent # colormap and setting the min/max to ensure that the colomap is # centered on zero: @@ -573,7 +609,7 @@ rule plotcorrelation: # '--zMax 1 ' -rule idxstats: +rule samtools_idxstats: input: bam="data/chipseq_samples/{sample}/{sample}.cutadapt.unique.nodups.bam", bai="data/chipseq_samples/{sample}/{sample}.cutadapt.unique.nodups.bam.bai", @@ -588,7 +624,7 @@ rule idxstats: "samtools idxstats {input.bam} 2> {log} 1> {output.txt}" -rule flagstat: +rule samtools_flagstat: input: bam="data/chipseq_samples/{sample}/{sample}.cutadapt.unique.nodups.bam", bai="data/chipseq_samples/{sample}/{sample}.cutadapt.unique.nodups.bam.bai", @@ -620,15 +656,20 @@ rule samtools_stats: rule multiqc: input: - expand("data/chipseq_samples/{sample}/{sample}.cutadapt.bam", sample=SAMPLES), - expand("data/chipseq_samples/{sample}/fastqc/{sample}_R1.fastq.gz_fastqc.zip", sample=SAMPLES), - expand("data/chipseq_samples/{sample}/fastqc/{sample}_R1.cutadapt.fastq.gz_fastqc.zip", sample=SAMPLES), - expand("data/chipseq_samples/{sample}/fastqc/{sample}.cutadapt.unique.nodups.bam_fastqc.zip", sample=SAMPLES), - expand("data/chipseq_merged/{label}/{label}.cutadapt.unique.nodups.bam.bigwig", 
label=sampletable.label), - expand("data/chipseq_samples/{sample}/samtools_stats_{sample}.txt", sample=SAMPLES), - expand("data/chipseq_samples/{sample}/samtools_flagstat_{sample}.txt", sample=SAMPLES), - expand("data/chipseq_samples/{sample}/samtools_idxstats_{sample}.txt", sample=SAMPLES), - expand("data/chipseq_merged/{label}/{label}.cutadapt.unique.nodups.merged.bam", label=sampletable.label), + expand( + rules.fastqc.output.zip, + sample=SAMPLES, + suffix=["_R1.fastq.gz", "_R1.cutadapt.fastq.gz", ".cutadapt.bam"], + ), + expand(rules.cutadapt.output, sample=SAMPLES), + expand(rules.bowtie2.output, sample=SAMPLES), + expand(rules.markduplicates.output, sample=SAMPLES), + expand(rules.unique.output, sample=SAMPLES), + expand(rules.samtools_stats.output, sample=SAMPLES), + expand(rules.samtools_flagstat.output, sample=SAMPLES), + expand(rules.samtools_idxstats.output, sample=SAMPLES), + expand(rules.bigwig.output, label=sampletable.label), + expand(rules.merge_techreps.output, label=sampletable.label), expand( "data/chipseq_aggregation/fingerprints/{ip_label}/{ip_label}_fingerprint.metrics", ip_label=sampletable.loc[sampletable.antibody != "input", "label"], @@ -638,8 +679,6 @@ rule multiqc: sample=SAMPLES ) if is_paired else [], [v["bigbed"] for v in peaks.values()], - "data/chipseq_aggregation/deeptools/plotcorrelation.tab", - "data/chipseq_aggregation/deeptools/multibigwigsummary.tab", config="config/multiqc_config.yaml", output: "data/chipseq_aggregation/multiqc.html", diff --git a/workflows/references/Snakefile b/workflows/references/Snakefile deleted file mode 100644 index 6ee892f8..00000000 --- a/workflows/references/Snakefile +++ /dev/null @@ -1,294 +0,0 @@ -import os -import sys -import pandas - -sys.path.insert(0, os.path.dirname(workflow.snakefile) + "/../..") -from lib import utils - -REFERENCES = config.get("reference_dir", "../../references") - - -def default_postprocess(origfn, newfn): - shell("mv {origfn} {newfn}") - - -rule fasta: - output: - 
temporary(f"{REFERENCES}/genome.fa.gz"), - log: - f"{REFERENCES}/logs/genome.fa.gz.log", - resources: - mem_mb="4g", - runtime="2h", - run: - utils.download_and_postprocess( - urls=config["fasta"]["url"], - postprocess=config["fasta"].get("postprocess", None), - outfile=output[0], - log=log, - ) - - -rule gtf: - output: - temporary(f"{REFERENCES}/annotation.gtf.gz"), - log: - f"{REFERENCES}/logs/annotation.gtf.gz.log", - resources: - mem="4g", - runtime="2h", - run: - utils.download_and_postprocess( - urls=config["gtf"]["url"], - postprocess=config["gtf"].get("postprocess", None), - outfile=output[0], - log=log, - ) - - -rule rrna: - output: - temporary(f"{REFERENCES}/rrna.fa.gz"), - log: - f"{REFERENCES}/logs/rrna.fa.gz.log", - resources: - mem="4g", - runtime="2h", - run: - utils.download_and_postprocess( - urls=config["rrna"]["url"], - postprocess=config["rrna"].get("postprocess", None), - outfile=output[0], - log=log, - ) - - -rule unzip: - input: - f"{REFERENCES}/{{prefix}}.gz", - output: - f"{REFERENCES}/{{prefix}}", - resources: - mem="4g", - runtime="2h", - shell: - "gunzip -c {input} > {output}" - - -rule bowtie2_index: - input: - f"{REFERENCES}/{{label}}.fa", - output: - multiext( - f"{REFERENCES}/bowtie2/{{label}}", - ".1.bt2", - ".2.bt2", - ".3.bt2", - ".4.bt2", - ".rev.1.bt2", - ".rev.2.bt2", - ".fa", - ), - log: - f"{REFERENCES}/logs/bowtie2_{{label}}.log", - resources: - mem="32g", - disk="50g", - runtime="8h", - threads: 8 - run: - index = os.path.commonprefix(output).rstrip(".") - shell("bowtie2-build" " --threads {threads}" " {input}" " {index}" " &> {log}") - utils.make_relative_symlink(input[0], output[-1]) - - -rule star_index: - input: - fasta=f"{REFERENCES}/genome.fa", - gtf=f"{REFERENCES}/annotation.gtf", - output: - f"{REFERENCES}/star/Genome", - log: - f"{REFERENCES}/logs/star.log", - threads: 8 - resources: - mem="64g", - runtime="8h", - run: - genomedir = os.path.dirname(output[0]) - shell("rm -r {genomedir}") - shell("mkdir -p 
{genomedir}") - shell( - "STAR " - "--runMode genomeGenerate " - "--runThreadN {threads} " - "--genomeDir {genomedir} " - "--genomeFastaFiles {input.fasta} " - # NOTE: GTF is optional - "--sjdbGTFfile {input.gtf} " - # NOTE: STAR docs say that 100 should work well. - "--sjdbOverhang 100 " - # NOTE: for small genomes, may need to scale this down to - # min(14, log2(GenomeLength) / 2 - 1) - # --genomeSAindexNbases 14 - "&> {log}" - ) - # STAR writes a hard-coded Log.out file to the current working - # directory. So put that on the end of the log file for the rule and - # then clean up. - shell("cat {genomedir}/Log.out >> {log} && rm {genomedir}/Log.out") - shell("ln -s {input.fasta} {genomedir}") - - -rule transcriptome_fasta: - input: - fasta=f"{REFERENCES}/genome.fa", - gtf=f"{REFERENCES}/annotation.gtf", - output: - f"{REFERENCES}/transcriptome.fa", - resources: - mem="4g", - runtime="2h", - shell: - "gffread {input.gtf} -w {output} -g {input.fasta}" - - -rule salmon_index: - input: - f"{REFERENCES}/transcriptome.fa", - output: - f"{REFERENCES}/salmon/versionInfo.json", - log: - f"{REFERENCES}/logs/salmon.log", - params: - outdir=f"{REFERENCES}/salmon", - resources: - mem="32g", - runtime="2h", - run: - outdir = os.path.dirname(output[0]) - shell("salmon index " "--transcripts {input} " "--index {outdir} " "&> {log}") - - -rule kallisto_index: - output: - f"{REFERENCES}/kallisto/transcripts.idx", - input: - f"{REFERENCES}/genome.fa", - log: - f"{REFERENCES}/logs/kallisto.log", - resources: - mem="32g", - runtime="2h", - shell: - "kallisto index " - "--index {output} " - "{input} " - "&> {log}" - - -rule conversion_refflat: - input: - f"{REFERENCES}/annotation.gtf", - output: - f"{REFERENCES}/annotation.refflat", - log: - f"{REFERENCES}/logs/annotation.refflat.log", - resources: - mem="2g", - runtime="2h", - shell: - "gtfToGenePred -ignoreGroupsWithoutExons {input} {output}.tmp " - """&& awk '{{print $1"\t"$0}}' {output}.tmp > {output} """ - "&& rm {output}.tmp " - 
- -rule conversion_bed12: - input: - f"{REFERENCES}/annotation.gtf", - output: - f"{REFERENCES}/annotation.bed12", - resources: - mem="2g", - runtime="2h", - shell: - "gtfToGenePred -ignoreGroupsWithoutExons {input} {output}.tmp " - "&& genePredToBed {output}.tmp {output} " - "&& rm {output}.tmp" - - -rule chromsizes: - input: - f"{REFERENCES}/genome.fa", - output: - f"{REFERENCES}/genome.chromsizes", - log: - f"{REFERENCES}/logs/genome.chromsizes.log", - params: - # NOTE: Be careful with the memory here; make sure you have enough - # and/or it matches the resources you're requesting - java_args="-Xmx20g", - # java_args='-Xmx2g' # [TEST SETTINGS -1] - resources: - mem="24g", - runtime="2h", - shell: - "export LC_COLLATE=C; " - "rm -f {output}.tmp " - "&& picard " - "{params.java_args} " - "CreateSequenceDictionary R={input} O={output}.tmp &> {log} " - '&& grep "^@SQ" {output}.tmp ' - """| awk '{{print $2, $3}}' """ - '| sed "s/SN://g;s/ LN:/\\t/g" ' - "| sort -k1,1 > {output} " - "&& rm -f {output}.tmp " - - -rule mappings: - """ - Creates gzipped TSV mapping between attributes in the GTF. 
- """ - input: - gtf=f"{REFERENCES}/annotation.gtf", - output: - f"{REFERENCES}/annotation.mapping.tsv.gz", - params: - include_featuretypes=lambda wildcards, output: conversion_kwargs[ - output[0] - ].get("include_featuretypes", []), - resources: - mem="2g", - runtime="2h", - run: - import gffutils - - # Will want to change the setting back to what it was originally when - # we're done - orig_setting = gffutils.constants.always_return_list - gffutils.constants.always_return_list = False - - include_featuretypes = params.include_featuretypes - - res = [] - for f in gffutils.DataIterator(input[0]): - - ft = f.featuretype - - if include_featuretypes and (ft not in include_featuretypes): - continue - - d = dict(f.attributes) - d["__featuretype__"] = ft - res.append(d) - - df = pandas.DataFrame(res) - - # Depending on how many attributes there were and the - # include_featuretypes settings, this may take a while. - df = df.drop_duplicates() - - df.to_csv(output[0], sep="\t", index=False, compression="gzip") - - # Restore original setting - gffutils.constants.always_return_list = orig_setting diff --git a/workflows/rnaseq/Snakefile b/workflows/rnaseq/Snakefile index 4f5e907e..650d9cfd 100644 --- a/workflows/rnaseq/Snakefile +++ b/workflows/rnaseq/Snakefile @@ -16,6 +16,7 @@ sampletable = sampletable.set_index(sampletable.columns[0], drop=False) is_paired = utils.detect_layout(sampletable) == "PE" n = ["1", "2"] if is_paired else ["1"] SAMPLES = sampletable.index +sample_dir = "data/rnaseq_samples" wildcard_constraints: @@ -129,8 +130,13 @@ rule rrna_index: runtime="8h", threads: 8 run: - index = f"{REFERENCES}/bowtie2/rrna" - shell("bowtie2-build" " --threads {threads}" " {input}" " {index}" " &> {log}") + prefix = subpath(output[0], strip_suffix=".1.bt2") + shell( + "bowtie2-build " + "--threads {threads} " + "{input} " + "{prefix} &> {log}" + ) utils.make_relative_symlink(input[0], output[-1]) @@ -199,7 +205,12 @@ rule salmon_index: runtime="2h", run: outdir = 
os.path.dirname(output[0]) - shell("salmon index " "--transcripts {input} " "--index {outdir} " "&> {log}") + shell( + "salmon index " + "--transcripts {input} " + "--index {outdir} " + "&> {log}" + ) rule conversion_refflat: @@ -240,8 +251,8 @@ rule chromsizes: log: f"{REFERENCES}/logs/genome.chromsizes.log", params: - java_args="-Xmx20g", - # java_args='-Xmx2g' # [TEST SETTINGS -1] + java_args="-Xmx20g", # [disable for test] + # java_args='-Xmx2g' # [enable for test] resources: mem="24g", runtime="2h", @@ -275,10 +286,6 @@ rule mappings: runtime="2h", run: import gffutils - - # Will want to change the setting back to what it was originally when - # we're done - orig_setting = gffutils.constants.always_return_list gffutils.constants.always_return_list = False include_featuretypes = params.include_featuretypes @@ -303,9 +310,6 @@ rule mappings: df.to_csv(output[0], sep="\t", index=False, compression="gzip") - # Restore original setting - gffutils.constants.always_return_list = orig_setting - rule symlink_targets: input: @@ -370,22 +374,26 @@ rule fastqc: log: "data/rnaseq_samples/{sample}/fastqc/{sample}{suffix}_fastqc.log", run: + # Calculate the paths FastQC will create so we can move them to + # specified output files if needed. outdir = os.path.dirname(output.html) or "." 
+ outfile = os.path.basename(input[0]) + for s in [".fastq", ".fq", ".gz", ".bam"]: + outfile = outfile.replace(s, "") + out_zip = os.path.join(outdir, outfile + "_fastqc.zip") + out_html = os.path.join(outdir, outfile + "_fastqc.html") + shell( "fastqc " "--noextract " "--quiet " "--outdir {outdir} " "{input} " - "2> {log} " + "&> {log} " ) - outfile = os.path.basename(input[0]) - for s in [".fastq", ".fq", ".gz", ".bam"]: - outfile = outfile.replace(s, "") - out_zip = os.path.join(outdir, outfile + "_fastqc.zip") + if not os.path.abspath(out_zip) == os.path.abspath(output.zip): shell("mv {out_zip} {output.zip}") - out_html = os.path.join(outdir, outfile + "_fastqc.html") if not os.path.abspath(out_html) == os.path.abspath(output.html): shell("mv {out_html} {output.html}") @@ -466,19 +474,23 @@ rule rRNA: resources: mem="2g", runtime="2h", - shell: - "bowtie2 " - f"-x {REFERENCES}/bowtie2/rrna " - "-U {input.fastq} " - "--threads {threads} " - "-k 1 " - "--no-unal " - "-S {output.bam}.sam " - "> {log} 2>&1 " - "&& samtools view -Sb {output.bam}.sam " - "| samtools sort - -o {output.bam} -O BAM " - "&& rm {output.bam}.sam" - + params: + run: + prefix = subpath(input.index, strip_suffix=".1.bt2") + shell( + "bowtie2 " + f"-x {prefix} " + "-U {input.fastq} " + "--threads {threads} " + "--no-unal " + "-k 1 " + "-S {output.bam}.sam " + "> {log} 2>&1 ") + shell( + "samtools view -Sb {output.bam}.sam " + "| samtools sort -O BAM - -o {output.bam}" + ) + shell("rm {output.bam}.sam") rule fastq_count: input: @@ -564,11 +576,8 @@ rule featurecounts: "fr-firststrand": "-s2 ", "fr-secondstrand": "-s1 ", }[config["stranded"]], - extra="", run: - p_arg = "" - if is_paired: - p_arg = "-p --countReadPairs " + p_arg = "-p --countReadPairs " if is_paired else "" shell( "featureCounts " "{params.strand_arg} " @@ -715,45 +724,6 @@ rule salmon: ) -rule kallisto: - input: - fastq=rules.cutadapt.output, - index=REFERENCES + "/kallisto/transcripts.idx", - output: - 
"data/rnaseq_samples/{sample}/{sample}.kallisto/abundance.h5", - log: - "data/rnaseq_samples/{sample}/{sample}.kallisto/abundance.h5.log", - threads: 8 - resources: - mem="32g", - runtime="2h", - params: - strand_arg={ - "unstranded": "", - "fr-firststrand": "--rf-stranded", - "fr-secondstrand": "--fr-stranded", - }[config["stranded"]], - extra=( - "--bootstrap-samples 100" - if is_paired - else "--single --fragment-length 300 --sd 20 --bootstrap-samples 100" - ), - run: - outdir = os.path.dirname(output[0]) - shell( - "kallisto quant " - "--index {input.index} " - "--output-dir {outdir} " - "--threads {threads} " - "--bootstrap-samples 100 " - "--threads {threads} " - "{params.strand_arg} " - "{params.extra} " - "{input.fastq} " - "&> {log}" - ) - - rule rseqc_infer_experiment: input: bam=rules.markduplicates.output, @@ -903,7 +873,6 @@ rule multiqc: ), expand(rules.markduplicates.output, sample=SAMPLES), expand(rules.salmon.output, sample=SAMPLES), - expand(rules.kallisto.output, sample=SAMPLES), expand(rules.preseq.output, sample=SAMPLES), expand(rules.collectrnaseqmetrics.output, sample=SAMPLES), expand(rules.samtools_stats.output, sample=SAMPLES), From 3b5a3beec95886bc3059167a6944ef160c63235b Mon Sep 17 00:00:00 2001 From: Ryan Dale <115406+daler@users.noreply.github.com> Date: Wed, 8 Oct 2025 13:03:43 +0000 Subject: [PATCH 109/196] fix paths in deploy --- deploy.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/deploy.py b/deploy.py index 8270348d..11969246 100755 --- a/deploy.py +++ b/deploy.py @@ -91,7 +91,7 @@ def write_include_file(source, flavor="all"): ], "chipseq": [ "include workflows/chipseq/Snakefile", - "include workflows/rnaseq/sra.smk", + "include workflows/chipseq/sra.smk", "recursive-include workflows/chipseq/config *", "include workflows/chipseq/chipseq_trackhub.py", ], @@ -100,8 +100,7 @@ def write_include_file(source, flavor="all"): "recursive-include include *", "recursive-include lib *", "include env.yml env-r.yml 
.gitignore", - "include workflows/references/Snakefile", - "recursive-include workflows/references/config *", + "recursive-include scripts *", "global-exclude __pycache__", ], "full": [ From fc9af3afe814d43351febc7c760c6f258dff7b26 Mon Sep 17 00:00:00 2001 From: Ryan Dale <115406+daler@users.noreply.github.com> Date: Wed, 8 Oct 2025 14:02:54 +0000 Subject: [PATCH 110/196] references use params to properly trigger from config changes --- workflows/chipseq/Snakefile | 8 ++++++-- workflows/rnaseq/Snakefile | 21 +++++++++++++++------ 2 files changed, 21 insertions(+), 8 deletions(-) diff --git a/workflows/chipseq/Snakefile b/workflows/chipseq/Snakefile index 1ce812db..59292ef3 100644 --- a/workflows/chipseq/Snakefile +++ b/workflows/chipseq/Snakefile @@ -42,6 +42,7 @@ rule all: # If the sampletable is from SRA, handle it here. include: "sra.smk" + rule fasta: output: temporary(f"{REFERENCES}/genome.fa.gz"), @@ -50,10 +51,13 @@ rule fasta: resources: mem_mb="4g", runtime="2h", + params: + urls=config["fasta"]["url"], + postprocess=config["fasta"].get("postprocess", None) run: utils.download_and_postprocess( - urls=config["fasta"]["url"], - postprocess=config["fasta"].get("postprocess", None), + urls=params.urls, + postprocess=params.postprocess, outfile=output[0], log=log, ) diff --git a/workflows/rnaseq/Snakefile b/workflows/rnaseq/Snakefile index 650d9cfd..5e8fb069 100644 --- a/workflows/rnaseq/Snakefile +++ b/workflows/rnaseq/Snakefile @@ -61,10 +61,13 @@ rule fasta: resources: mem_mb="4g", runtime="2h", + params: + urls=config["fasta"]["url"], + postprocess=config["fasta"].get("postprocess", None) run: utils.download_and_postprocess( - urls=config["fasta"]["url"], - postprocess=config["fasta"].get("postprocess", None), + urls=params.urls, + postprocess=params.postprocess, outfile=output[0], log=log, ) @@ -78,10 +81,13 @@ rule gtf: resources: mem="4g", runtime="2h", + params: + urls=config["gtf"]["url"], + postprocess=config["gtf"].get("postprocess", None) run: 
utils.download_and_postprocess( - urls=config["gtf"]["url"], - postprocess=config["gtf"].get("postprocess", None), + urls=params.urls, + postprocess=params.postprocess, outfile=output[0], log=log, ) @@ -95,10 +101,13 @@ rule rrna_fasta: resources: mem="4g", runtime="2h", + params: + urls=config["rrna"]["url"], + postprocess=config["rrna"].get("postprocess", None) run: utils.download_and_postprocess( - urls=config["rrna"]["url"], - postprocess=config["rrna"].get("postprocess", None), + urls=params.urls, + postprocess=params.postprocess, outfile=output[0], log=log, ) From 82eb7c2851b968626085b4ee3e76ef9290acd73f Mon Sep 17 00:00:00 2001 From: Ryan Dale <115406+daler@users.noreply.github.com> Date: Wed, 8 Oct 2025 14:03:29 +0000 Subject: [PATCH 111/196] no longer mark references as temporary --- workflows/chipseq/Snakefile | 2 +- workflows/rnaseq/Snakefile | 14 +++++++------- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/workflows/chipseq/Snakefile b/workflows/chipseq/Snakefile index 59292ef3..06dcd8cf 100644 --- a/workflows/chipseq/Snakefile +++ b/workflows/chipseq/Snakefile @@ -45,7 +45,7 @@ include: "sra.smk" rule fasta: output: - temporary(f"{REFERENCES}/genome.fa.gz"), + f"{REFERENCES}/genome.fa.gz", log: f"{REFERENCES}/logs/genome.fa.gz.log", resources: diff --git a/workflows/rnaseq/Snakefile b/workflows/rnaseq/Snakefile index 5e8fb069..a70ea5a6 100644 --- a/workflows/rnaseq/Snakefile +++ b/workflows/rnaseq/Snakefile @@ -55,7 +55,7 @@ rule symlinks: rule fasta: output: - temporary(f"{REFERENCES}/genome.fa.gz"), + f"{REFERENCES}/genome.fa.gz", log: f"{REFERENCES}/logs/genome.fa.gz.log", resources: @@ -75,7 +75,7 @@ rule fasta: rule gtf: output: - temporary(f"{REFERENCES}/annotation.gtf.gz"), + f"{REFERENCES}/annotation.gtf.gz", log: f"{REFERENCES}/logs/annotation.gtf.gz.log", resources: @@ -224,7 +224,7 @@ rule salmon_index: rule conversion_refflat: input: - f"{REFERENCES}/annotation.gtf.gz", + f"{REFERENCES}/annotation.gtf", output: 
f"{REFERENCES}/annotation.refflat", log: @@ -240,7 +240,7 @@ rule conversion_refflat: rule conversion_bed12: input: - f"{REFERENCES}/annotation.gtf.gz", + f"{REFERENCES}/annotation.gtf", output: f"{REFERENCES}/annotation.bed12", resources: @@ -283,7 +283,7 @@ rule mappings: Creates gzipped TSV mapping between attributes in the GTF. """ input: - gtf=f"{REFERENCES}/annotation.gtf.gz", + gtf=f"{REFERENCES}/annotation.gtf", output: f"{REFERENCES}/annotation.mapping.tsv.gz", params: @@ -412,7 +412,7 @@ rule star: input: fastq=rules.cutadapt.output, index=rules.star_index.output, - annotation=f"{REFERENCES}/annotation.gtf.gz", + annotation=f"{REFERENCES}/annotation.gtf", output: bam=temporary("data/rnaseq_samples/{sample}/{sample}.cutadapt.bam"), sjout=temporary( @@ -569,7 +569,7 @@ rule markduplicates: rule featurecounts: input: - annotation=rules.gtf.output, + annotation=f"{REFERENCES}/annotation.gtf", bam=rules.markduplicates.output.bam, output: "data/rnaseq_samples/{sample}/{sample}_featurecounts.txt", From 99feb61924b86abb7d03b53673efdaf25fa18a40 Mon Sep 17 00:00:00 2001 From: Ryan Dale <115406+daler@users.noreply.github.com> Date: Wed, 8 Oct 2025 14:03:47 +0000 Subject: [PATCH 112/196] add featurecounts aggreation to multiqc input --- workflows/rnaseq/Snakefile | 1 + 1 file changed, 1 insertion(+) diff --git a/workflows/rnaseq/Snakefile b/workflows/rnaseq/Snakefile index a70ea5a6..80284e27 100644 --- a/workflows/rnaseq/Snakefile +++ b/workflows/rnaseq/Snakefile @@ -892,6 +892,7 @@ rule multiqc: expand(rules.bigwig_pos.output, sample=SAMPLES), expand(rules.bigwig_neg.output, sample=SAMPLES), rules.rrna_libsizes_table.output, + rules.aggregate_featurecounts.output, ), config="config/multiqc_config.yaml", output: From 523d3c34c72566e863cade9326c168e851d7be31 Mon Sep 17 00:00:00 2001 From: Ryan Dale <115406+daler@users.noreply.github.com> Date: Wed, 8 Oct 2025 14:04:20 +0000 Subject: [PATCH 113/196] rm kallisto from rnaseq --- workflows/rnaseq/downstream/config.yaml | 
4 ---- workflows/rnaseq/downstream/rnaseq.Rmd | 32 +++++++------------------ 2 files changed, 8 insertions(+), 28 deletions(-) diff --git a/workflows/rnaseq/downstream/config.yaml b/workflows/rnaseq/downstream/config.yaml index 2ec85070..a8a826b3 100644 --- a/workflows/rnaseq/downstream/config.yaml +++ b/workflows/rnaseq/downstream/config.yaml @@ -134,10 +134,6 @@ toggle: # `salmon=TRUE` argument to lcdbwf::make_dds. salmon: FALSE - # Import Kallisto results instead of featureCounts? See similar notes above - # for Salmon. - kallisto: FALSE - # Create diagnostic plots for all dds objects? dds_diagnostics: TRUE diff --git a/workflows/rnaseq/downstream/rnaseq.Rmd b/workflows/rnaseq/downstream/rnaseq.Rmd index d21d13f2..50404469 100644 --- a/workflows/rnaseq/downstream/rnaseq.Rmd +++ b/workflows/rnaseq/downstream/rnaseq.Rmd @@ -15,7 +15,6 @@ knitr::opts_chunk$set( message=FALSE, cache.extra_file_dep_1 = file.info('../config/sampletable.tsv')$mtime, cache.extra_file_dep_2 = file.info('../data/rnaseq_aggregation/featurecounts.txt')$mtime, - cache.extra_file_dep_3 = file.info('../data/rnaseq_samples/*/*.kallisto/abundance.h5')$mtime, cache.extra_file_dep_4 = file.info('../data/rnaseq_samples/*/*.salmon/quant.sf')$mtime ) ``` @@ -90,7 +89,7 @@ colData <- read.table(config$main$sampletable, sep='\t', header=TRUE, stringsAsF rownames(colData) <- colData[,1] ``` -```{r dds_initial, cache=TRUE, config=c(config$main, config$toggle$salmon, config$toggle$kallisto)} +```{r dds_initial, cache=TRUE, config=c(config$main, config$toggle$salmon)} # Convert featureCounts gene-level counts into DESeq2 object, and run # variance-stabiliizing transform. 
dds_initial <- lcdbwf:::make_dds( @@ -106,7 +105,7 @@ vsd <- varianceStabilizingTransformation(dds_initial, blind=TRUE) Here is the sample table with metadata used for this analysis: ```{r print_coldata} -exclude.for.printing <- c('featurecounts.path', 'salmon.path', 'kallisto.path', +exclude.for.printing <- c('featurecounts.path', 'salmon.path', 'orig_filename', 'orig_filename_R2', 'layout', 'sizeFactor') colData(dds_initial) %>% @@ -138,8 +137,8 @@ for(group in config$plotting$covariates_for_plots){ } ``` -```{r sizefactors, results='asis', eval=!(config$toggle$salmon | config$toggle$kallisto)} -# Note that when loading Salmon or Kallisto, DESeq2 does not calculate size +```{r sizefactors, results='asis', eval=!(config$toggle$salmon)} +# Note that when loading Salmon, DESeq2 does not calculate size # factors. lcdbwf:::mdcat(text$sizefactors) @@ -180,13 +179,8 @@ lst <- list( design=~group, salmon=TRUE), - # Example 4: use kallisto - kallisto=list( - sampletable=colData, - design=~group, - kallisto=TRUE), - # Example 5: use LRT + # Example 4: use LRT LRT=list( sampletable=colData, design=~group, @@ -265,20 +259,10 @@ contr_03a_salmon <- lcdbwf:::make_results( ) ``` + ```{r results_04, dependson='dds_list', cache=TRUE} # Example 4: -# - like example 3, but kallisto instead of salmon -contr_03_kallisto <- lcdbwf:::make_results( - dds_name="kallisto", - contrast=c('group', 'treatment', 'control'), - type='normal', - label='Using Kallisto' -) -``` - -```{r results_05, dependson='dds_list', cache=TRUE} -# Example 5: -# - Examples 1-4 use the default DESeq2 test, Wald. +# - Examples 1-3 use the default DESeq2 test, Wald. # - Here, we use the nBinomLRT (LRT) test. # NOTE: Use 'type=NULL' to skip LFC shrinkage as # make_results sets all LRT LFC values to 0. @@ -288,7 +272,7 @@ contr_03_kallisto <- lcdbwf:::make_results( # make_results detects the 'test' type from the # dds object specified with 'dds_name'. 
-contr_05_lrt <- lcdbwf:::make_results( +contr_04_lrt <- lcdbwf:::make_results( dds_name="LRT", type=NULL, label='Using LRT' From 6548c60ab67279c6c2824428192a43e95e9b92eb Mon Sep 17 00:00:00 2001 From: Ryan Dale <115406+daler@users.noreply.github.com> Date: Wed, 8 Oct 2025 14:04:27 +0000 Subject: [PATCH 114/196] disable results diagnostics by default --- workflows/rnaseq/downstream/config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/rnaseq/downstream/config.yaml b/workflows/rnaseq/downstream/config.yaml index a8a826b3..740984c4 100644 --- a/workflows/rnaseq/downstream/config.yaml +++ b/workflows/rnaseq/downstream/config.yaml @@ -139,7 +139,7 @@ toggle: # Create diagnostic plots for results objects? If TRUE, will check the # config$plotting$diagnostics_for_results list. - results_diagnostics: TRUE + results_diagnostics: FALSE # ANNOTATION ------------------------------------------------------------------- # Configuration specific to annotations and databases From 19be1d9ced7b0f6c1b2e55567e4d772c15d4e8d9 Mon Sep 17 00:00:00 2001 From: Ryan Dale <115406+daler@users.noreply.github.com> Date: Thu, 9 Oct 2025 08:38:25 -0400 Subject: [PATCH 115/196] more updates to decision log --- docs/decisions.rst | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/docs/decisions.rst b/docs/decisions.rst index ac808e63..74ddb01f 100644 --- a/docs/decisions.rst +++ b/docs/decisions.rst @@ -404,3 +404,31 @@ Guidelines: - SE/PE arguments should be handled inside a ``run:`` block - Any other arguments should be written in a ``shell:`` block or a ``shell()`` call directly, to visually match the equivalent command-line call + +Arguments for and against a separate references workflow +-------------------------------------------------------- + +RNA-seq, ChIP-seq, and the upcoming variant calling all need to do something +with references, including possibly patching them. So we have to deal with this +inherent complexity. 
It initially made sense to put such common rules in the +separate references workflow. + +However, only a subset of the rules in the references workflow are actually +shared across RNA-seq and ChIP-seq -- currently, only the bowtie2 index +(genome-wide ChIP-seq alignment; rRNA screening for RNA-seq), the fasta rule, +chromsizes, and the generic unzip rule. The others (gtf, mappings, +conversion_bed12, conversion_refflat, kallisto_index, salmon_index, +transcriptome_fasta, star_index, rrna) are all unique to RNA-seq. So the +current references workflow is actually mostly an RNA-seq-only references +workflow. + +Furthermore, much of the complexity is handled in the +lib.utils.download_and_postprocess function, rather than in the workflow rules. +We already are using the utils module separately in the ChIP-seq and RNA-seq +workflows, so there's no additional overhead to import it. + +Last, having a workflow split across two Snakefiles hampers the ability to +understand the complete workflow. + +Taken together, it made more sense to eliminate the references workflow +entirely, and port the rules to the respective workflows. 
From 72d77579ddb878072da92edbecff9b91f3ab8d22 Mon Sep 17 00:00:00 2001 From: Ryan Dale <115406+daler@users.noreply.github.com> Date: Thu, 9 Oct 2025 08:55:44 -0400 Subject: [PATCH 116/196] rm kallisto throughout --- docs/config-yaml.rst | 7 ++--- env.yml | 1 - include/reference_configs/Danio_rerio.yaml | 1 - .../Dictyostelium_discoideum.yaml | 1 - .../Drosophila_melanogaster.yaml | 2 -- include/reference_configs/Gallus_gallus.yaml | 1 - include/reference_configs/Homo_sapiens.yaml | 3 --- include/reference_configs/Macaca_mulatta.yaml | 1 - include/reference_configs/Mus_musculus.yaml | 2 -- .../Plodia_interpunctella.yaml | 1 - .../reference_configs/Rattus_norvegicus.yaml | 1 - .../Saccharomyces_cerevisiae.yaml | 1 - .../Schizosaccharomyces_pombe.yaml | 1 - include/reference_configs/test.yaml | 1 - include/requirements.txt | 1 - lib/lcdbwf/R/dds.R | 27 +++++-------------- .../complex-dataset-rnaseq-config.yaml | 3 --- test/test_configs/test_file_uri.yaml | 4 --- 18 files changed, 10 insertions(+), 49 deletions(-) diff --git a/docs/config-yaml.rst b/docs/config-yaml.rst index 7d86ceef..f492c70a 100644 --- a/docs/config-yaml.rst +++ b/docs/config-yaml.rst @@ -319,9 +319,10 @@ Required for RNA-seq ``stranded`` field `````````````````` This field specifies the strandedness of the library. This is used by - various rule to set the parameters correctly. For example, - ``featureCounts`` will use ``-s0``, ``-s1``, or ``-s2`` accordingly; - ``kallisto`` will use ``--fr-stranded`` if needed, and so on. + various rule to set the parameters correctly. For example, if this is set to ``fr-firststrand`` then + ``featureCounts`` will use ``-s2``; CollectRnaSeqMetrics will use + ``STRAND=SECOND_READ_TRANSCRIPTION_STRAND``, and deepTools bamCoverage will + use ``-filterRNAstrand reverse``. 
This field can take the following options: diff --git a/env.yml b/env.yml index f7f89425..41739e83 100644 --- a/env.yml +++ b/env.yml @@ -144,7 +144,6 @@ dependencies: - jsonschema-specifications=2025.9.1 - jupyter_core=5.8.1 - kaleido-core=0.2.1 - - kallisto=0.51.1 - kernel-headers_linux-64=5.14.0 - keyring=25.6.0 - keyutils=1.6.3 diff --git a/include/reference_configs/Danio_rerio.yaml b/include/reference_configs/Danio_rerio.yaml index 038ef0ff..64f653df 100644 --- a/include/reference_configs/Danio_rerio.yaml +++ b/include/reference_configs/Danio_rerio.yaml @@ -23,7 +23,6 @@ references: transcriptome: indexes: - 'salmon' - - 'kallisto' rRNA: genome: diff --git a/include/reference_configs/Dictyostelium_discoideum.yaml b/include/reference_configs/Dictyostelium_discoideum.yaml index 9037d0f6..f703343d 100644 --- a/include/reference_configs/Dictyostelium_discoideum.yaml +++ b/include/reference_configs/Dictyostelium_discoideum.yaml @@ -17,7 +17,6 @@ references: transcriptome: indexes: - "salmon" - - 'kallisto' rRNA: genome: diff --git a/include/reference_configs/Drosophila_melanogaster.yaml b/include/reference_configs/Drosophila_melanogaster.yaml index e228df7a..0e61fcad 100644 --- a/include/reference_configs/Drosophila_melanogaster.yaml +++ b/include/reference_configs/Drosophila_melanogaster.yaml @@ -40,7 +40,6 @@ references: transcriptome: indexes: - 'salmon' - - 'kallisto' # Note: the mappings from r6.23 still work well for r6.28. 
r6-28: @@ -71,4 +70,3 @@ references: transcriptome: indexes: - 'salmon' - - 'kallisto' diff --git a/include/reference_configs/Gallus_gallus.yaml b/include/reference_configs/Gallus_gallus.yaml index a618a5a9..13d6d49a 100644 --- a/include/reference_configs/Gallus_gallus.yaml +++ b/include/reference_configs/Gallus_gallus.yaml @@ -24,7 +24,6 @@ references: transcriptome: indexes: - 'salmon' - - 'kallisto' rRNA: genome: diff --git a/include/reference_configs/Homo_sapiens.yaml b/include/reference_configs/Homo_sapiens.yaml index 58d292ec..ff6720f1 100644 --- a/include/reference_configs/Homo_sapiens.yaml +++ b/include/reference_configs/Homo_sapiens.yaml @@ -29,7 +29,6 @@ references: transcriptome: indexes: - 'salmon' - - 'kallisto' gencode-v25: @@ -65,7 +64,6 @@ references: transcriptome: indexes: - 'salmon' - - 'kallisto' gencode-v19: @@ -90,7 +88,6 @@ references: transcriptome: indexes: - 'salmon' - - 'kallisto' rRNA: genome: diff --git a/include/reference_configs/Macaca_mulatta.yaml b/include/reference_configs/Macaca_mulatta.yaml index 111674c7..acefce08 100644 --- a/include/reference_configs/Macaca_mulatta.yaml +++ b/include/reference_configs/Macaca_mulatta.yaml @@ -24,7 +24,6 @@ references: transcriptome: indexes: - 'salmon' - - 'kallisto' rRNA: genome: diff --git a/include/reference_configs/Mus_musculus.yaml b/include/reference_configs/Mus_musculus.yaml index ef0eb30f..316bb389 100644 --- a/include/reference_configs/Mus_musculus.yaml +++ b/include/reference_configs/Mus_musculus.yaml @@ -28,7 +28,6 @@ references: transcriptome: indexes: - 'salmon' - - 'kallisto' gencode_m12: @@ -52,7 +51,6 @@ references: transcriptome: indexes: - 'salmon' - - 'kallisto' rRNA: genome: diff --git a/include/reference_configs/Plodia_interpunctella.yaml b/include/reference_configs/Plodia_interpunctella.yaml index 214e907f..ea3c59ca 100644 --- a/include/reference_configs/Plodia_interpunctella.yaml +++ b/include/reference_configs/Plodia_interpunctella.yaml @@ -25,7 +25,6 @@ references: 
transcriptome: indexes: - 'salmon' - - 'kallisto' rRNA: genome: diff --git a/include/reference_configs/Rattus_norvegicus.yaml b/include/reference_configs/Rattus_norvegicus.yaml index 3405d9f3..e12db2a4 100644 --- a/include/reference_configs/Rattus_norvegicus.yaml +++ b/include/reference_configs/Rattus_norvegicus.yaml @@ -23,7 +23,6 @@ references: transcriptome: indexes: - 'salmon' - - 'kallisto' rRNA: genome: diff --git a/include/reference_configs/Saccharomyces_cerevisiae.yaml b/include/reference_configs/Saccharomyces_cerevisiae.yaml index 1f536797..c965f7a5 100644 --- a/include/reference_configs/Saccharomyces_cerevisiae.yaml +++ b/include/reference_configs/Saccharomyces_cerevisiae.yaml @@ -29,7 +29,6 @@ references: transcriptome: indexes: - 'salmon' - - 'kallisto' rRNA: genome: diff --git a/include/reference_configs/Schizosaccharomyces_pombe.yaml b/include/reference_configs/Schizosaccharomyces_pombe.yaml index bbef64c3..74dcca1a 100644 --- a/include/reference_configs/Schizosaccharomyces_pombe.yaml +++ b/include/reference_configs/Schizosaccharomyces_pombe.yaml @@ -22,7 +22,6 @@ references: transcriptome: indexes: - 'salmon' - - 'kallisto' rRNA: genome: diff --git a/include/reference_configs/test.yaml b/include/reference_configs/test.yaml index a8f80b77..dc68f72d 100644 --- a/include/reference_configs/test.yaml +++ b/include/reference_configs/test.yaml @@ -38,7 +38,6 @@ references: transcriptome: indexes: - 'salmon' - - 'kallisto' metadata: reference_genome_build: 'dm6' diff --git a/include/requirements.txt b/include/requirements.txt index ebd02582..dfcb8601 100644 --- a/include/requirements.txt +++ b/include/requirements.txt @@ -13,7 +13,6 @@ gffutils hisat2 intervalstats ipython -kallisto macs3 multiqc pandas diff --git a/lib/lcdbwf/R/dds.R b/lib/lcdbwf/R/dds.R index ae28ef6f..4e6b46bf 100644 --- a/lib/lcdbwf/R/dds.R +++ b/lib/lcdbwf/R/dds.R @@ -4,7 +4,6 @@ salmon.path.func <- function (x) file.path('..', 'data', 'rnaseq_samples', x, paste0(x, '.salmon'), 
'quant.sf') -kallisto.path.func <- function (x) file.path('..', 'data', 'rnaseq_samples', x, paste0(x, '.salmon'), 'quant.sf') @@ -34,11 +33,10 @@ kallisto.path.func <- function (x) file.path('..', 'data', 'rnaseq_samples', x, #' @param strip_dotted_version If TRUE, then remove Ensembl-style dotted #' version numbers from gene IDs (ENSG000001.1 -> ENSG000001) #' -#' @param salmon_pattern, kallisto_pattern Specify the patterns to locations of -#' Salmon or Kallisto files. Use the special placeholder string +#' @param salmon_pattern Specify the pattern to locations of +#' Salmon files. Use the special placeholder string #' `__SAMPLENAME__` which will be replaced with the sample name. Only -#' relevant if one of config$toggle$salmon or config$toggle$kallisto are -#' TRUE. +#' relevant if config$toggle$salmon is TRUE #' #' @param ... Additional arguments will be passed on to the DESeq() call (e.g., #' parallel, fitType, etc) @@ -48,7 +46,6 @@ make_dds <- function(design_data, config=NULL, collapse_by=NULL, strip_dotted_version=NULL, featureCounts='../data/rnaseq_aggregation/featurecounts.txt', salmon_pattern="../data/rnaseq_samples/__SAMPLENAME__/__SAMPLENAME__.salmon/quant.sf", - kallisto_pattern="../data/rnaseq_samples/__SAMPLENAME__/__SAMPLENAME__.kallisto/abundance.h5", ...){ # Note we're using pluck() here for the convenience of setting defaults @@ -65,33 +62,27 @@ make_dds <- function(design_data, config=NULL, collapse_by=NULL, } location <- purrr::pluck(design_data, 'filename', .default=featureCounts) salmon <- purrr::pluck(design_data, 'salmon') - kallisto <- purrr::pluck(design_data, 'kallisto') subset_counts <- purrr::pluck(design_data, 'subset_counts') sample_func <- purrr::pluck(design_data, 'sample_func', .default=lcdbwf_samplename) # Allow overriding of config values. 
if (!is.null(config)){ if (is.null(salmon)) salmon <- config$toggle$salmon - if (is.null(kallisto)) kallisto <- config$toggle$kallisto if (is.null(collapse_by)) collapse_by <- config$main$collapse_by if (is.null(strip_dotted_version)) strip_dotted_version <- config$main$strip_dotted_version } - if (salmon & kallisto){ - stop("Both salmon and kallisto are set to TRUE, not sure how to handle this.") - } - - if (salmon | kallisto){ + if (salmon) { # If these arguments were provided, the corresponding loading functions # don't accept them so we need to remove. Issue a warning as well. if (!is.null(subset_counts) | !is.null(sample_func)){ - warning("Salmon or Kallisto was specified, but additional arguments ", + warning("Salmon was specified, but additional arguments ", "were provided to the loading function.") subset_counts <- NULL sample_func <- NULL } - # For Salmon and Kallisto, we need a tx2gene dataframe. We can get this + # For Salmon, we need a tx2gene dataframe. We can get this # from a TxDb, which in turn can be retrieved from AnnotationHub, which in # turn can be configured with the config object. Luckily, we have the # config object here! 
@@ -104,12 +95,6 @@ make_dds <- function(design_data, config=NULL, collapse_by=NULL, coldata$salmon.path <- sapply(coldata$samplename, function (x) gsub("__SAMPLENAME__", x, salmon_pattern)) txi <- tximport::tximport(coldata[, 'salmon.path'], type='salmon', tx2gene=tx2gene, ignoreTxVersion=strip_dotted_version) dds <- DESeq2::DESeqDataSetFromTximport(txi, colData=coldata, design=design) - - } else if (kallisto) { - coldata$kallisto.path <- sapply(coldata$samplename, function (x) gsub("__SAMPLENAME__", x, kallisto_pattern)) - txi <- tximport::tximport(coldata[, 'kallisto.path'], type='kallisto', tx2gene=tx2gene, ignoreTxVersion=strip_dotted_version) - dds <- DESeq2::DESeqDataSetFromTximport(txi, colData=coldata, design=design) - } else { dds <- lcdbwf:::DESeqDataSetFromCombinedFeatureCounts( location, diff --git a/test/test_configs/complex-dataset-rnaseq-config.yaml b/test/test_configs/complex-dataset-rnaseq-config.yaml index d4a3ed90..ee7264b8 100644 --- a/test/test_configs/complex-dataset-rnaseq-config.yaml +++ b/test/test_configs/complex-dataset-rnaseq-config.yaml @@ -23,9 +23,6 @@ gtf: salmon: tag: "gencode-v28" -kallisto: - tag: "gencode-v28" - fastq_screen: - label: rRNA organism: human diff --git a/test/test_configs/test_file_uri.yaml b/test/test_configs/test_file_uri.yaml index 571078c6..2315525a 100644 --- a/test/test_configs/test_file_uri.yaml +++ b/test/test_configs/test_file_uri.yaml @@ -24,10 +24,6 @@ gtf: salmon: tag: "test" -kallisto: - tag: "test" - - fastq_screen: - label: test organism: filebased From a999bab442bf5c2a365cd73be97402bf481e2316 Mon Sep 17 00:00:00 2001 From: Ryan Dale <115406+daler@users.noreply.github.com> Date: Thu, 9 Oct 2025 09:56:27 -0400 Subject: [PATCH 117/196] modify gene patterns test settings --- workflows/rnaseq/downstream/gene-patterns.Rmd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/rnaseq/downstream/gene-patterns.Rmd b/workflows/rnaseq/downstream/gene-patterns.Rmd index 4c425102..c46e790c 
100644 --- a/workflows/rnaseq/downstream/gene-patterns.Rmd +++ b/workflows/rnaseq/downstream/gene-patterns.Rmd @@ -100,8 +100,8 @@ col <- NULL # NOTE: This is set very low for test data. Default is 15.--------------------- # Minimum cluster size. -# minc <- 1 # [ TEST SETTINGS +1 ] minc <- 15 +# minc <- 1 # [ enable for test ] # NOTE: This is a very low value used for getting the degPatterns to run ----- low.minc <- 1 From f7996d597b64c17d3e7378c9a0d06210b76d6084 Mon Sep 17 00:00:00 2001 From: Ryan Dale <115406+daler@users.noreply.github.com> Date: Thu, 9 Oct 2025 10:01:19 -0400 Subject: [PATCH 118/196] sra & strand_check rules directly in respective snakefiles --- workflows/chipseq/Snakefile | 42 +++++++++++ workflows/chipseq/sra.smk | 40 ---------- workflows/rnaseq/Snakefile | 118 +++++++++++++++++++++++++++++- workflows/rnaseq/sra.smk | 40 ---------- workflows/rnaseq/strand_check.smk | 75 ------------------- 5 files changed, 157 insertions(+), 158 deletions(-) delete mode 100644 workflows/chipseq/sra.smk delete mode 100644 workflows/rnaseq/sra.smk delete mode 100644 workflows/rnaseq/strand_check.smk diff --git a/workflows/chipseq/Snakefile b/workflows/chipseq/Snakefile index 06dcd8cf..82a460f3 100644 --- a/workflows/chipseq/Snakefile +++ b/workflows/chipseq/Snakefile @@ -707,3 +707,45 @@ rule multiqc: "{analysis_directory} " "&> {log} " ) + + +if utils.detect_sra(sampletable): + sampletable["orig_filename"] = expand( + "original_data/sra_samples/{sample}/{sample}_R{n}.fastq.gz", sample=SAMPLES, n=1 + ) + + if is_paired: + sampletable["orig_filename_R2"] = expand( + "original_data/sra_samples/{sample}/{sample}_R{n}.fastq.gz", + sample=SAMPLES, + n=2, + ) + + rule fastq_dump: + output: + fastq=expand( + "original_data/sra_samples/{sample}/{sample}_R{n}.fastq.gz", + n=n, + allow_missing=True, + ), + log: + "original_data/sra_samples/{sample}/{sample}.fastq.gz.log", + params: + is_paired=is_paired, + # extra="-X 100000", # [enable for test] + resources: + mem="1g", 
+ disk="1g", + runtime="2h", + run: + srr = sampletable.loc[wildcards.sample, "Run"] + extra = params.get("extra", "") + if is_paired: + shell("fastq-dump {srr} --gzip --split-files {extra} &> {log}") + shell("mv {srr}_1.fastq.gz {output[0]}") + shell("mv {srr}_2.fastq.gz {output[1]}") + else: + shell( + "fastq-dump {srr} -Z {extra} 2> {log} | gzip -c > {output[0]}.tmp" + ) + shell("mv {output[0]}.tmp {output[0]}") diff --git a/workflows/chipseq/sra.smk b/workflows/chipseq/sra.smk deleted file mode 100644 index 5ee5f53b..00000000 --- a/workflows/chipseq/sra.smk +++ /dev/null @@ -1,40 +0,0 @@ -if utils.detect_sra(sampletable): - sampletable["orig_filename"] = expand( - "original_data/sra_samples/{sample}/{sample}_R{n}.fastq.gz", sample=SAMPLES, n=1 - ) - - if is_paired: - sampletable["orig_filename_R2"] = expand( - "original_data/sra_samples/{sample}/{sample}_R{n}.fastq.gz", - sample=SAMPLES, - n=2, - ) - - rule fastq_dump: - output: - fastq=expand( - "original_data/sra_samples/{sample}/{sample}_R{n}.fastq.gz", - n=n, - allow_missing=True, - ), - log: - "original_data/sra_samples/{sample}/{sample}.fastq.gz.log", - params: - is_paired=is_paired, - # extra="-X 100000", # [enable for test] - resources: - mem="1g", - disk="1g", - runtime="2h", - run: - srr = sampletable.loc[wildcards.sample, "Run"] - extra = params.get("extra", "") - if is_paired: - shell("fastq-dump {srr} --gzip --split-files {extra} &> {log}") - shell("mv {srr}_1.fastq.gz {output[0]}") - shell("mv {srr}_2.fastq.gz {output[1]}") - else: - shell( - "fastq-dump {srr} -Z {extra} 2> {log} | gzip -c > {output[0]}.tmp" - ) - shell("mv {output[0]}.tmp {output[0]}") diff --git a/workflows/rnaseq/Snakefile b/workflows/rnaseq/Snakefile index 80284e27..eb4eb884 100644 --- a/workflows/rnaseq/Snakefile +++ b/workflows/rnaseq/Snakefile @@ -922,7 +922,119 @@ rule multiqc: # Optionally run `snakemake strand_check` to do a preliminary run on # automatically-subset data to evaluate strandedness. 
-include: "strand_check.smk" +rule sample_strand_check: + input: + fastq=expand("data/rnaseq_samples/{{sample}}/{{sample}}_R{n}.fastq.gz", n=n), + index=expand(rules.rrna_index.output, label="genome"), + bed12=rules.conversion_bed12.output, + output: + strandedness="strand_check/{sample}/{sample}.strandedness", + bam=temporary("strand_check/{sample}/{sample}.strandedness.bam"), + bai=temporary("strand_check/{sample}/{sample}.strandedness.bam.bai"), + fastqs=temporary( + expand( + "strand_check/{sample}/{sample}_R{n}.strandedness.fastq", + n=n, + allow_missing=True, + ) + ), + log: + "strand_check/{sample}/{sample}.strandedness.log", + threads: 6 + resources: + mem="8g", + runtime="2h", + run: + prefix = os.path.commonprefix(input.index).rstrip(".") + nreads = int(1e5 * 4) + if is_paired: + shell( + "set +o pipefail; zcat {input.fastq[0]} | head -n {nreads} > {output.fastqs[0]}" + ) + shell( + "set +o pipefail; zcat {input.fastq[0]} | head -n {nreads} > {output.fastqs[1]}" + ) + fastqs = f"-1 {output.fastqs[0]} -2 {output.fastqs[1]} " + else: + shell( + "set +o pipefail; zcat {input.fastq[0]} | head -n {nreads} > {output.fastqs[0]}" + ) + fastqs = f"-U {output.fastqs[0]} " + shell( + "bowtie2 " + "-x {prefix} " + "{fastqs} " + "--no-unal " + "--threads {threads} 2> {log} " + "| samtools view -Sb - " + "| samtools sort - -o {output.bam} " + ) + shell("samtools index {output.bam}") + shell( + "infer_experiment.py -r {input.bed12} -i {output.bam} > {output} 2> {log}" + ) + + +rule strand_check: + input: + expand("strand_check/{sample}/{sample}.strandedness", sample=SAMPLES), + output: + html="strand_check/strandedness.html", + filelist=temporary("strand_check/filelist"), + log: + "strand_check/strandedness.log", + resources: + mem="1g", + runtime="2h", + run: + with open(output.filelist, "w") as fout: + for i in input: + fout.write(i + "\n") + shell( + "multiqc " + "--force " + "--module rseqc " + "--file-list {output.filelist} " + "--filename {output.html} &> {log}" + 
) -# If the sampletable is from SRA, handle it here. -include: "sra.smk" +if utils.detect_sra(sampletable): + sampletable["orig_filename"] = expand( + "original_data/sra_samples/{sample}/{sample}_R{n}.fastq.gz", sample=SAMPLES, n=1 + ) + + if is_paired: + sampletable["orig_filename_R2"] = expand( + "original_data/sra_samples/{sample}/{sample}_R{n}.fastq.gz", + sample=SAMPLES, + n=2, + ) + + rule fastq_dump: + output: + fastq=expand( + "original_data/sra_samples/{sample}/{sample}_R{n}.fastq.gz", + n=n, + allow_missing=True, + ), + log: + "original_data/sra_samples/{sample}/{sample}.fastq.gz.log", + params: + is_paired=is_paired, + # extra="-X 100000", # [enable for test] + resources: + mem="1g", + disk="1g", + runtime="2h", + run: + srr = sampletable.loc[wildcards.sample, "Run"] + extra = params.get("extra", "") + if is_paired: + shell("fastq-dump {srr} --gzip --split-files {extra} &> {log}") + shell("mv {srr}_1.fastq.gz {output[0]}") + shell("mv {srr}_2.fastq.gz {output[1]}") + else: + shell( + "fastq-dump {srr} -Z {extra} 2> {log} | gzip -c > {output[0]}.tmp" + ) + shell("mv {output[0]}.tmp {output[0]}") diff --git a/workflows/rnaseq/sra.smk b/workflows/rnaseq/sra.smk deleted file mode 100644 index 5ee5f53b..00000000 --- a/workflows/rnaseq/sra.smk +++ /dev/null @@ -1,40 +0,0 @@ -if utils.detect_sra(sampletable): - sampletable["orig_filename"] = expand( - "original_data/sra_samples/{sample}/{sample}_R{n}.fastq.gz", sample=SAMPLES, n=1 - ) - - if is_paired: - sampletable["orig_filename_R2"] = expand( - "original_data/sra_samples/{sample}/{sample}_R{n}.fastq.gz", - sample=SAMPLES, - n=2, - ) - - rule fastq_dump: - output: - fastq=expand( - "original_data/sra_samples/{sample}/{sample}_R{n}.fastq.gz", - n=n, - allow_missing=True, - ), - log: - "original_data/sra_samples/{sample}/{sample}.fastq.gz.log", - params: - is_paired=is_paired, - # extra="-X 100000", # [enable for test] - resources: - mem="1g", - disk="1g", - runtime="2h", - run: - srr = 
sampletable.loc[wildcards.sample, "Run"] - extra = params.get("extra", "") - if is_paired: - shell("fastq-dump {srr} --gzip --split-files {extra} &> {log}") - shell("mv {srr}_1.fastq.gz {output[0]}") - shell("mv {srr}_2.fastq.gz {output[1]}") - else: - shell( - "fastq-dump {srr} -Z {extra} 2> {log} | gzip -c > {output[0]}.tmp" - ) - shell("mv {output[0]}.tmp {output[0]}") diff --git a/workflows/rnaseq/strand_check.smk b/workflows/rnaseq/strand_check.smk deleted file mode 100644 index bd7c45d4..00000000 --- a/workflows/rnaseq/strand_check.smk +++ /dev/null @@ -1,75 +0,0 @@ -rule sample_strand_check: - input: - fastq=expand("data/rnaseq_samples/{{sample}}/{{sample}}_R{n}.fastq.gz", n=n), - index=expand(rules.rrna_index.output, label="genome"), - bed12=rules.conversion_bed12.output, - output: - strandedness="strand_check/{sample}/{sample}.strandedness", - bam=temporary("strand_check/{sample}/{sample}.strandedness.bam"), - bai=temporary("strand_check/{sample}/{sample}.strandedness.bam.bai"), - fastqs=temporary( - expand( - "strand_check/{sample}/{sample}_R{n}.strandedness.fastq", - n=n, - allow_missing=True, - ) - ), - log: - "strand_check/{sample}/{sample}.strandedness.log", - threads: 6 - resources: - mem="8g", - runtime="2h", - run: - prefix = os.path.commonprefix(input.index).rstrip(".") - nreads = int(1e5 * 4) - if is_paired: - shell( - "set +o pipefail; zcat {input.fastq[0]} | head -n {nreads} > {output.fastqs[0]}" - ) - shell( - "set +o pipefail; zcat {input.fastq[0]} | head -n {nreads} > {output.fastqs[1]}" - ) - fastqs = f"-1 {output.fastqs[0]} -2 {output.fastqs[1]} " - else: - shell( - "set +o pipefail; zcat {input.fastq[0]} | head -n {nreads} > {output.fastqs[0]}" - ) - fastqs = f"-U {output.fastqs[0]} " - shell( - "bowtie2 " - "-x {prefix} " - "{fastqs} " - "--no-unal " - "--threads {threads} 2> {log} " - "| samtools view -Sb - " - "| samtools sort - -o {output.bam} " - ) - shell("samtools index {output.bam}") - shell( - "infer_experiment.py -r 
{input.bed12} -i {output.bam} > {output} 2> {log}" - ) - - -rule strand_check: - input: - expand("strand_check/{sample}/{sample}.strandedness", sample=SAMPLES), - output: - html="strand_check/strandedness.html", - filelist=temporary("strand_check/filelist"), - log: - "strand_check/strandedness.log", - resources: - mem="1g", - runtime="2h", - run: - with open(output.filelist, "w") as fout: - for i in input: - fout.write(i + "\n") - shell( - "multiqc " - "--force " - "--module rseqc " - "--file-list {output.filelist} " - "--filename {output.html} &> {log}" - ) From ad559319f125e23e7b9ddf072495aebf46fd54b4 Mon Sep 17 00:00:00 2001 From: Ryan Dale <115406+daler@users.noreply.github.com> Date: Thu, 9 Oct 2025 10:33:34 -0400 Subject: [PATCH 119/196] rm include sra.smk --- workflows/chipseq/Snakefile | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/workflows/chipseq/Snakefile b/workflows/chipseq/Snakefile index 82a460f3..1393e034 100644 --- a/workflows/chipseq/Snakefile +++ b/workflows/chipseq/Snakefile @@ -39,10 +39,6 @@ rule all: [v["bed"] for k, v in peaks.items()], -# If the sampletable is from SRA, handle it here. -include: "sra.smk" - - rule fasta: output: f"{REFERENCES}/genome.fa.gz", @@ -709,6 +705,7 @@ rule multiqc: ) +# If the sampletable is from SRA, handle it here. 
if utils.detect_sra(sampletable): sampletable["orig_filename"] = expand( "original_data/sra_samples/{sample}/{sample}_R{n}.fastq.gz", sample=SAMPLES, n=1 From 7023a8d3313a4316cc0f7de04a44eab9bfd74932 Mon Sep 17 00:00:00 2001 From: Ryan Dale <115406+daler@users.noreply.github.com> Date: Thu, 9 Oct 2025 10:34:05 -0400 Subject: [PATCH 120/196] rm *.smk from deployment --- deploy.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/deploy.py b/deploy.py index 11969246..6e98596b 100755 --- a/deploy.py +++ b/deploy.py @@ -82,8 +82,6 @@ def write_include_file(source, flavor="all"): PATTERN_DICT = { "rnaseq": [ "include workflows/rnaseq/Snakefile", - "include workflows/rnaseq/strand_check.smk", - "include workflows/rnaseq/sra.smk", "recursive-include workflows/rnaseq/config *", "include workflows/rnaseq/rnaseq_trackhub.py", "recursive-include workflows/rnaseq/downstream *.Rmd", @@ -91,7 +89,6 @@ def write_include_file(source, flavor="all"): ], "chipseq": [ "include workflows/chipseq/Snakefile", - "include workflows/chipseq/sra.smk", "recursive-include workflows/chipseq/config *", "include workflows/chipseq/chipseq_trackhub.py", ], From 687af4f80f2a8a4305bbd8c7ea0f548a4bda6f99 Mon Sep 17 00:00:00 2001 From: Ryan Dale <115406+daler@users.noreply.github.com> Date: Thu, 9 Oct 2025 15:21:34 +0000 Subject: [PATCH 121/196] rm strand_check --- docs/config-yaml.rst | 21 ---------- lib/utils.py | 4 +- test/workflow_test_params.yaml | 14 ------- workflows/rnaseq/Snakefile | 77 ---------------------------------- 4 files changed, 1 insertion(+), 115 deletions(-) diff --git a/docs/config-yaml.rst b/docs/config-yaml.rst index f492c70a..ad1d3fb3 100644 --- a/docs/config-yaml.rst +++ b/docs/config-yaml.rst @@ -343,27 +343,6 @@ Required for RNA-seq Rules that require information about strand will check the config file at run time and raise an error if this field doesn't exist. 
- If you don't know the strandedness of the library, run the Snakefile in - such a way to only run the ``strand_check`` rule: - - .. code-block:: bash - - snakemake -j 2 strand_check - - Or, when using the Slurm wrapper on cluster, - - .. code-block:: bash - - sbatch ../../include/WRAPPER_SLURM strand_check - - When complete, there will be a MultiQC HTML file in the ``strand_check/`` - directory that you can inspect to make your choice. - - This will align the first 10,000 reads to the specified reference and run - RSeQC's ``infer_experiment.py`` on the results and then run MultiQC on just - those output files. - - .. versionadded:: 1.8 Optional fields ~~~~~~~~~~~~~~~ diff --git a/lib/utils.py b/lib/utils.py index 0e5cc9e2..c7e4bf7b 100644 --- a/lib/utils.py +++ b/lib/utils.py @@ -712,9 +712,7 @@ def strand_arg_lookup(config, lookup): raise ConfigurationError( "Starting in v1.8, 'stranded' is required in the config file. " "Values can be 'unstranded', 'fr-firststrand' (R1 aligns antisense to original transcript), " - "or 'fr-secondstrand' (R1 aligns sense to original transcript). If you are not sure, " - "run the workflow with only the 'strand_check' rule, like " - "'snakemake -j 5 strand_check'." + "or 'fr-secondstrand' (R1 aligns sense to original transcript)." ) if config.stranded not in lookup: keys = list(lookup.keys()) diff --git a/test/workflow_test_params.yaml b/test/workflow_test_params.yaml index 5d74fac9..2255cdda 100644 --- a/test/workflow_test_params.yaml +++ b/test/workflow_test_params.yaml @@ -27,20 +27,6 @@ rnaseq: --configfile __ORIG__/test/test_configs/test_rnaseq_config.yaml --config sampletable=__ORIG__/test/test_configs/test_sra_sampletable_SE_only.tsv - strandedness-pe: - desc: Tests running the strandedness pre-check using paired-end data. 
- args: | - --until strand_check - --configfile __ORIG__/test/test_configs/test_rnaseq_config.yaml - --config sampletable=__ORIG__/test/test_configs/test_pe_sampletable.tsv - - strandedness-se: - desc: Tests running the strandedness pre-check using single-ended data. - args: | - --until strand_check - --configfile __ORIG__/test/test_configs/test_rnaseq_config.yaml - --config sampletable=__ORIG__/test/test_configs/two_samples.tsv - star-2pass: desc: Tests running STAR in 2-pass mode. Only runs until the star_pass2 rule. args: | diff --git a/workflows/rnaseq/Snakefile b/workflows/rnaseq/Snakefile index eb4eb884..10be673d 100644 --- a/workflows/rnaseq/Snakefile +++ b/workflows/rnaseq/Snakefile @@ -920,83 +920,6 @@ rule multiqc: "&> {log} " ) -# Optionally run `snakemake strand_check` to do a preliminary run on -# automatically-subset data to evaluate strandedness. -rule sample_strand_check: - input: - fastq=expand("data/rnaseq_samples/{{sample}}/{{sample}}_R{n}.fastq.gz", n=n), - index=expand(rules.rrna_index.output, label="genome"), - bed12=rules.conversion_bed12.output, - output: - strandedness="strand_check/{sample}/{sample}.strandedness", - bam=temporary("strand_check/{sample}/{sample}.strandedness.bam"), - bai=temporary("strand_check/{sample}/{sample}.strandedness.bam.bai"), - fastqs=temporary( - expand( - "strand_check/{sample}/{sample}_R{n}.strandedness.fastq", - n=n, - allow_missing=True, - ) - ), - log: - "strand_check/{sample}/{sample}.strandedness.log", - threads: 6 - resources: - mem="8g", - runtime="2h", - run: - prefix = os.path.commonprefix(input.index).rstrip(".") - nreads = int(1e5 * 4) - if is_paired: - shell( - "set +o pipefail; zcat {input.fastq[0]} | head -n {nreads} > {output.fastqs[0]}" - ) - shell( - "set +o pipefail; zcat {input.fastq[0]} | head -n {nreads} > {output.fastqs[1]}" - ) - fastqs = f"-1 {output.fastqs[0]} -2 {output.fastqs[1]} " - else: - shell( - "set +o pipefail; zcat {input.fastq[0]} | head -n {nreads} > {output.fastqs[0]}" - ) 
- fastqs = f"-U {output.fastqs[0]} " - shell( - "bowtie2 " - "-x {prefix} " - "{fastqs} " - "--no-unal " - "--threads {threads} 2> {log} " - "| samtools view -Sb - " - "| samtools sort - -o {output.bam} " - ) - shell("samtools index {output.bam}") - shell( - "infer_experiment.py -r {input.bed12} -i {output.bam} > {output} 2> {log}" - ) - - -rule strand_check: - input: - expand("strand_check/{sample}/{sample}.strandedness", sample=SAMPLES), - output: - html="strand_check/strandedness.html", - filelist=temporary("strand_check/filelist"), - log: - "strand_check/strandedness.log", - resources: - mem="1g", - runtime="2h", - run: - with open(output.filelist, "w") as fout: - for i in input: - fout.write(i + "\n") - shell( - "multiqc " - "--force " - "--module rseqc " - "--file-list {output.filelist} " - "--filename {output.html} &> {log}" - ) if utils.detect_sra(sampletable): sampletable["orig_filename"] = expand( From 1e56148e1d63976a848fe3982a8871b5da0edee9 Mon Sep 17 00:00:00 2001 From: Ryan Dale <115406+daler@users.noreply.github.com> Date: Thu, 9 Oct 2025 12:59:38 -0400 Subject: [PATCH 122/196] rm more strandedness --- .circleci/config.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index bedc4c18..5959188e 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -262,7 +262,6 @@ variables: # configs from the original clone rather than the deployed directory. 
$DEPLOY/test/lcdb-wf-test rnaseq --run-workflow --sra-pe -k -p -j2 --use-conda --orig $ORIG $DEPLOY/test/lcdb-wf-test rnaseq --run-workflow --sra-se -k -p -j2 --use-conda --orig $ORIG - $DEPLOY/test/lcdb-wf-test rnaseq --run-workflow --strandedness-pe -k -p -j2 --use-conda --orig $ORIG $DEPLOY/test/lcdb-wf-test rnaseq --run-workflow --star-2pass -k -p -j2 --use-conda --orig $ORIG $DEPLOY/test/lcdb-wf-test rnaseq --run-workflow --star-1pass -k -p -j2 --use-conda --orig $ORIG $DEPLOY/test/lcdb-wf-test rnaseq --run-workflow --pe -k -p -j2 --use-conda --orig $ORIG From a9d467709712cf67d889a86ae1743d71d9eac949 Mon Sep 17 00:00:00 2001 From: Ryan Dale <115406+daler@users.noreply.github.com> Date: Thu, 9 Oct 2025 15:23:12 -0400 Subject: [PATCH 123/196] rm now-irrelvant tests --- test/workflow_test_params.yaml | 17 +---------------- 1 file changed, 1 insertion(+), 16 deletions(-) diff --git a/test/workflow_test_params.yaml b/test/workflow_test_params.yaml index 2255cdda..5c483e59 100644 --- a/test/workflow_test_params.yaml +++ b/test/workflow_test_params.yaml @@ -1,4 +1,5 @@ # This file configures arguments for running various workflows that are pulled +# # into the test/lcdb-wf-test runner script automatically. It is a way of # # NOTE: @@ -27,22 +28,6 @@ rnaseq: --configfile __ORIG__/test/test_configs/test_rnaseq_config.yaml --config sampletable=__ORIG__/test/test_configs/test_sra_sampletable_SE_only.tsv - star-2pass: - desc: Tests running STAR in 2-pass mode. Only runs until the star_pass2 rule. 
- args: | - --until star_pass2 - --configfile __ORIG__/test/test_configs/test_rnaseq_config.yaml - --config sampletable=__ORIG__/test/test_configs/star_2pass.tsv - --config aligner="star-twopass" - - hisat2: - desc: Tests running HISAT2 - args: | - --until hisat2 - --configfile __ORIG__/test/test_configs/test_rnaseq_config.yaml - --config sampletable=__ORIG__/test/test_configs/hisat2.tsv - --config aligner=hisat2 - pe: desc: Tests paired-end data args: | From 0b2b3ce946f99c9aa425faaaa2717c19d1e0f3e9 Mon Sep 17 00:00:00 2001 From: Ryan Dale <115406+daler@users.noreply.github.com> Date: Thu, 9 Oct 2025 16:04:25 -0400 Subject: [PATCH 124/196] rm more star 2pass tests --- .circleci/config.yml | 1 - test/lcdb-wf-test | 2 -- test/test_configs/star_2pass.tsv | 3 --- 3 files changed, 6 deletions(-) delete mode 100644 test/test_configs/star_2pass.tsv diff --git a/.circleci/config.yml b/.circleci/config.yml index 5959188e..8195050c 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -262,7 +262,6 @@ variables: # configs from the original clone rather than the deployed directory. 
$DEPLOY/test/lcdb-wf-test rnaseq --run-workflow --sra-pe -k -p -j2 --use-conda --orig $ORIG $DEPLOY/test/lcdb-wf-test rnaseq --run-workflow --sra-se -k -p -j2 --use-conda --orig $ORIG - $DEPLOY/test/lcdb-wf-test rnaseq --run-workflow --star-2pass -k -p -j2 --use-conda --orig $ORIG $DEPLOY/test/lcdb-wf-test rnaseq --run-workflow --star-1pass -k -p -j2 --use-conda --orig $ORIG $DEPLOY/test/lcdb-wf-test rnaseq --run-workflow --pe -k -p -j2 --use-conda --orig $ORIG diff --git a/test/lcdb-wf-test b/test/lcdb-wf-test index 21f6978c..8e8525fb 100755 --- a/test/lcdb-wf-test +++ b/test/lcdb-wf-test @@ -141,8 +141,6 @@ class Runner(object): %(prog)s rnaseq --run-workflow --sra-se %(prog)s rnaseq --run-workflow --strandedness-pe %(prog)s rnaseq --run-workflow --strandedness-se - %(prog)s rnaseq --run-workflow --star-2pass - %(prog)s rnaseq --run-workflow --hisat2 %(prog)s rnaseq --run-workflow --pe # Since there are a lot of parameters here, see diff --git a/test/test_configs/star_2pass.tsv b/test/test_configs/star_2pass.tsv deleted file mode 100644 index 8cf98eb0..00000000 --- a/test/test_configs/star_2pass.tsv +++ /dev/null @@ -1,3 +0,0 @@ -samplename group layout orig_filename -sample1-star-2pass control SE data/example_data/rnaseq_sample1PE_1.fq.gz -sample2-star-2pass control SE data/example_data/rnaseq_sample2.fq.gz From 9902f642a113e0f68345a00777d59bf8a915cc46 Mon Sep 17 00:00:00 2001 From: Ryan Dale <115406+daler@users.noreply.github.com> Date: Thu, 9 Oct 2025 19:50:25 -0400 Subject: [PATCH 125/196] rm another test --- .circleci/config.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 8195050c..bee8727d 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -262,7 +262,6 @@ variables: # configs from the original clone rather than the deployed directory. 
$DEPLOY/test/lcdb-wf-test rnaseq --run-workflow --sra-pe -k -p -j2 --use-conda --orig $ORIG $DEPLOY/test/lcdb-wf-test rnaseq --run-workflow --sra-se -k -p -j2 --use-conda --orig $ORIG - $DEPLOY/test/lcdb-wf-test rnaseq --run-workflow --star-1pass -k -p -j2 --use-conda --orig $ORIG $DEPLOY/test/lcdb-wf-test rnaseq --run-workflow --pe -k -p -j2 --use-conda --orig $ORIG From 8ffc2bab2778f5988761a469d0beb19576237495 Mon Sep 17 00:00:00 2001 From: Ryan Dale <115406+daler@users.noreply.github.com> Date: Fri, 10 Oct 2025 20:12:58 -0400 Subject: [PATCH 126/196] decision log on references --- docs/decisions.rst | 162 +++++++++++++++++++++++++-------------------- 1 file changed, 89 insertions(+), 73 deletions(-) diff --git a/docs/decisions.rst b/docs/decisions.rst index 74ddb01f..5dcb3460 100644 --- a/docs/decisions.rst +++ b/docs/decisions.rst @@ -7,82 +7,98 @@ References ---------- Here are use-cases we have that are common enough to warrant supporting: -- References should support multiple workflows (ChIP-seq, RNA-seq, etc) - - This implies that the means the references dir should be in the - ``workflows`` directory or above. - - For example, this may mean a STAR index for RNA-seq, a bowtie2 index for - rRNA contamination, and another bowtie2 index for ChIP-seq. - -- References should support different organisms in different workflows. There - should beo only one organism per workflow though. - -- References should be re-created for each project. - - What we've found is that if we have a central location for the references - (shared by multiple deployments of lcdb-wf over the years) then we get - conflicts where one deployment's aligner version is more recent, causing - errors when using the index for an older version. - - To keep using this, we'd need to version indexes based on aligner version. 
- - However, when writing up methods for a paper we need to be able to trace - back what commands were run to generate the reference, including additional - patching that may have taken place (as is supported by the references - workflow). - - Re-using indexes is space- and time-efficient in the short term, but has - shown to be inefficient in time and reproducibility in the long term. - - Keeping everything in the same deployment director also helps with the - archiving process. - -Naming: - -- Top level should be organsim. Doesn't really matter in the case of - a single-organism workflow. -- Next should be what has historically been called "tag". This could be the - assembly name for genomic indexes, or some combination of assembly - + annotation for transcriptome. -- If we're assuming "deployment-local" references, these no longer have to be - globally unique. If we have a mouse reference with a transgene, we can just - call it "mouse/mm39" but have the transgene patched into it, and not worry - about conflicting (or worse, overwriting!) a central reference with the same - name that didn't have the transgene. -- Fasta files are included next to their respective index. - -This example uses the ``dmel`` organism and ``test`` tag which is configured by -default for tests. - -This uses ``$ORG/$TAG//$TOOL`` as the path -template. This lets us keep the fastq file used for building the various -indexes alongside the indexes. +**References should support multiple workflows (ChIP-seq, RNA-seq, etc)** + +- This implies that the means the references dir should be in the ``workflows`` + directory or above. +- For example, this may mean a STAR index for RNA-seq, a bowtie2 index for rRNA + contamination, and another bowtie2 index for ChIP-seq. + +**References should support different organisms in different workflows. 
There +should be only one organism per workflow though.** + +- For example, ``workflows/mouse-rnaseq`` and ``workflows/human-rnaseq`` should + be supported in the same project. + + +**References should be re-created for each project.** + +- Historically we had a central location for the references (shared by multiple + deployments of lcdb-wf over the years) but we got conflicts where one + deployment's aligner version was more recent, causing errors when using the + index for an older version. +- To keep using this, we'd need to version indexes based on aligner version. +- However, when writing up methods for a paper we need to be able to trace + back what commands were run to generate the reference, including additional + patching that may have taken place (as is supported by the references + workflow). +- Re-using indexes is space- and time-efficient in the short term, but has + shown to be inefficient in time and reproducibility in the long term. +- Keeping everything in the same deployment directory also helps with the + archiving process. +- We were hesitant to update the references in the central location due to + being unsure of what was depending on them. +- Overall, making the decision that the time and space cost to re-make + references for each project is worth the gain in simplicity and isolation. + +Reference nomenclature and directory structure +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Options considered: + +1. ``references`` (top-level of project, shared by all workflows) +2. ``workflows//references`` (workflow-specific) + +The location ``workflows/references`` is functionally similar to top-level +``references`` (in a parent directory of individual workflows) but references +is no longer a workflow so it doesn't make sense to have it right in the +``workflows`` directory. + +Recall that in lcdb-wf <2.0, we have organism and then tag. 
For example, we +might have configurations available for different human genome assemblies +(hg19, hg38) and in the central location we needed to differentiate between +them (e.g. ``references/human/hg19/``). + +If we assume a single organism per workflow, and that the references are +workflow-specific, then we don't need any of this. +``workflows//references/genome.fa`` for example should cover it. + +This becomes inefficient in the case where there are multiple workflows, all +for the same organism and all the same workflow type. However in such cases, +manually creating symlinks can get around this, and I think it's an acceptable +workaround for the benefit of simplified references more generally. :: - references_data/ - ├── dmel - ├── rRNA - │ └── genome - │ ├── bowtie2 - │ │ └── dmel_rRNA.* - │ └── dmel_rRNA.fasta - └── test - ├── annotation - │ ├── dmel_test.bed12 - │ ├── dmel_test.gtf - │ └── dmel_test.refflat - ├── genome - │ ├── bowtie2 - │ │ └── dmel_test.* - │ ├── star - │ │ └── dmel_test - │ │ └── - │ ├── dmel_test.chromsizes - │ ├── dmel_test.fasta - │ ├── dmel_test.fasta.fai - └── transcriptome - ├── kallisto - │ └── dmel_test - │ └── transcripts.idx - ├── salmon - │ └── dmel_test - │ └── - └── dmel_test.fasta + workflows/rnaseq/references + genome.fasta + genome.chromsizes + rrna.fasta + annotation.gtf + annotation.bed12 + annotation.refflat + transcriptome.fasta + star/ + genome.fasta + + bowtie2/ + rrna.fasta + + salmon/ + transcriptome.fasta + + +For ChIP-seq: + +:: + + workflows/chipseq/references + genome.fasta + genome.chromsizes + bowtie2/ + genome.fasta + + Params ------ From 82f91a9517d5bddf0b6924562e13460540b56a63 Mon Sep 17 00:00:00 2001 From: Ryan Dale <115406+daler@users.noreply.github.com> Date: Fri, 10 Oct 2025 20:15:05 -0400 Subject: [PATCH 127/196] config file cleanup --- workflows/chipseq/config/config.yaml | 46 ---------------------------- workflows/rnaseq/config/config.yaml | 17 +++++----- 2 files changed, 7 insertions(+), 56 
deletions(-) diff --git a/workflows/chipseq/config/config.yaml b/workflows/chipseq/config/config.yaml index d35898d2..268dcf59 100644 --- a/workflows/chipseq/config/config.yaml +++ b/workflows/chipseq/config/config.yaml @@ -1,28 +1,5 @@ -# NOTE: all paths are relative to the calling Snakefile. -# -# sampletable: TSV file defining sample metadata. -# First column must have header name "samplename". sampletable: 'config/sampletable.tsv' -# Which key in the `references` dict below to use -organism: 'dmel' - -# What reference genome used -# Check the assembly in https://www.ncbi.nlm.nih.gov/datasets/genome/ -# options: -# - 'mm10' for mouse -# - 'hg38' or 'hg19' for human -# - 'dm6' for drosophila -# - 'danRer11' for zebrafish -# - 'sacCer3' for yeast -# - 'rn6' for rat -# genome: 'dm6' - -# If not specified here, use the environment variable REFERENCES_DIR. -references_dir: 'references_data' - -peaks_dir: 'data/chipseq_peaks' - fasta: url: "https://raw.githubusercontent.com/lcdb/lcdb-test-data/master/data/seq/dm6.small.fa" postprocess: 'lib.utils.gzipped' @@ -70,7 +47,6 @@ chipseq: - input-wingdisc-2 extra: '--nomodel --extsize 147' - - label: gaf-wingdisc-pooled-1 algorithm: epic2 ip: @@ -94,25 +70,3 @@ chipseq: - gaf-wingdisc-2 control: [] extra: '' - -fastq_screen: - - label: rRNA - organism: dmel - tag: test - - label: Fly - organism: dmel - tag: test - -merged_bigwigs: - input-wingdisc: - - input-wingdisc-1 - - input-wingdisc-2 - gaf-wingdisc: - - gaf-wingdisc-1 - - gaf-wingdisc-2 - gaf-embryo: - - gaf-embryo-1 - -aligner: - index: 'bowtie2' - tag: 'test' diff --git a/workflows/rnaseq/config/config.yaml b/workflows/rnaseq/config/config.yaml index 26f5aba9..657c92a4 100644 --- a/workflows/rnaseq/config/config.yaml +++ b/workflows/rnaseq/config/config.yaml @@ -1,3 +1,10 @@ +sampletable: 'config/sampletable.tsv' + +# See https://rnabio.org/module-09-appendix/0009/12/01/StrandSettings for more info. 
+stranded: 'fr-firststrand' # for dUTP libraries +# 'fr-secondstrand' # for ligation libraries +# 'unstranded' # for libraries without strand specificity + fasta: url: "https://raw.githubusercontent.com/lcdb/lcdb-test-data/master/data/seq/dm6.small.fa" postprocess: 'lib.utils.gzipped' @@ -13,13 +20,3 @@ rrna: postprocess: function: 'lib.utils.filter_fastas' args: 'Drosophila melanogaster' - - -sampletable: 'config/sampletable.tsv' - -patterns: 'config/rnaseq_patterns.yaml' - -# See https://rnabio.org/module-09-appendix/0009/12/01/StrandSettings for more info. -stranded: 'fr-firststrand' # for dUTP libraries -# 'fr-secondstrand' # for ligation libraries -# 'unstranded' # for libraries without strand specificity From c89b1eb4a1c39df8fababa054f6c45eb02f99c5c Mon Sep 17 00:00:00 2001 From: Ryan Dale <115406+daler@users.noreply.github.com> Date: Sat, 11 Oct 2025 00:38:52 +0000 Subject: [PATCH 128/196] simplify references for rnaseq --- workflows/rnaseq/Snakefile | 75 +++++++++++++++++++------------------- 1 file changed, 37 insertions(+), 38 deletions(-) diff --git a/workflows/rnaseq/Snakefile b/workflows/rnaseq/Snakefile index 10be673d..2ac4a01b 100644 --- a/workflows/rnaseq/Snakefile +++ b/workflows/rnaseq/Snakefile @@ -10,7 +10,6 @@ from lib import utils configfile: "config/config.yaml" -REFERENCES = config.get("reference_dir", "../../references") sampletable = pd.read_table(config["sampletable"], sep="\t", comment="#") sampletable = sampletable.set_index(sampletable.columns[0], drop=False) is_paired = utils.detect_layout(sampletable) == "PE" @@ -55,9 +54,9 @@ rule symlinks: rule fasta: output: - f"{REFERENCES}/genome.fa.gz", + "references/genome.fa.gz", log: - f"{REFERENCES}/logs/genome.fa.gz.log", + "references/logs/genome.fa.gz.log", resources: mem_mb="4g", runtime="2h", @@ -75,9 +74,9 @@ rule fasta: rule gtf: output: - f"{REFERENCES}/annotation.gtf.gz", + "references/annotation.gtf.gz", log: - f"{REFERENCES}/logs/annotation.gtf.gz.log", + 
"references/logs/annotation.gtf.gz.log", resources: mem="4g", runtime="2h", @@ -95,9 +94,9 @@ rule gtf: rule rrna_fasta: output: - f"{REFERENCES}/rrna.fa.gz", + "references/rrna.fa.gz", log: - f"{REFERENCES}/logs/rrna.fa.log", + "references/logs/rrna.fa.log", resources: mem="4g", runtime="2h", @@ -115,9 +114,9 @@ rule rrna_fasta: rule unzip: input: - f"{REFERENCES}/{{prefix}}.gz", + "references/{prefix}.gz", output: - f"{REFERENCES}/{{prefix}}", + "references/{prefix}", resources: mem="4g", runtime="2h", @@ -127,12 +126,12 @@ rule unzip: rule rrna_index: input: - f"{REFERENCES}/rrna.fa", + "references/rrna.fa", output: - f"{REFERENCES}/bowtie2/rrna.1.bt2", - f"{REFERENCES}/bowtie2/rrna.fa", + "references/bowtie2/rrna.1.bt2", + "references/bowtie2/rrna.fa", log: - f"{REFERENCES}/logs/bowtie2_rrna.log", + "references/logs/bowtie2_rrna.log", resources: mem="32g", disk="50g", @@ -151,12 +150,12 @@ rule rrna_index: rule star_index: input: - fasta=f"{REFERENCES}/genome.fa", - gtf=f"{REFERENCES}/annotation.gtf", + fasta="references/genome.fa", + gtf="references/annotation.gtf", output: - f"{REFERENCES}/star/Genome", + "references/star/Genome", log: - f"{REFERENCES}/logs/star.log", + "references/logs/star.log", threads: 8 resources: mem="64g", @@ -189,10 +188,10 @@ rule star_index: rule transcriptome_fasta: input: - fasta=f"{REFERENCES}/genome.fa", - gtf=f"{REFERENCES}/annotation.gtf", + fasta="references/genome.fa", + gtf="references/annotation.gtf", output: - f"{REFERENCES}/transcriptome.fa", + "references/transcriptome.fa", resources: mem="4g", runtime="2h", @@ -202,13 +201,13 @@ rule transcriptome_fasta: rule salmon_index: input: - f"{REFERENCES}/transcriptome.fa", + "references/transcriptome.fa", output: - f"{REFERENCES}/salmon/versionInfo.json", + "references/salmon/versionInfo.json", log: - f"{REFERENCES}/logs/salmon.log", + "references/logs/salmon.log", params: - outdir=f"{REFERENCES}/salmon", + outdir="references/salmon", resources: mem="32g", runtime="2h", @@ 
-224,11 +223,11 @@ rule salmon_index: rule conversion_refflat: input: - f"{REFERENCES}/annotation.gtf", + "references/annotation.gtf", output: - f"{REFERENCES}/annotation.refflat", + "references/annotation.refflat", log: - f"{REFERENCES}/logs/annotation.refflat.log", + "references/logs/annotation.refflat.log", resources: mem="2g", runtime="2h", @@ -240,9 +239,9 @@ rule conversion_refflat: rule conversion_bed12: input: - f"{REFERENCES}/annotation.gtf", + "references/annotation.gtf", output: - f"{REFERENCES}/annotation.bed12", + "references/annotation.bed12", resources: mem="2g", runtime="2h", @@ -254,11 +253,11 @@ rule conversion_bed12: rule chromsizes: input: - f"{REFERENCES}/genome.fa.gz", + "references/genome.fa.gz", output: - f"{REFERENCES}/genome.chromsizes", + "references/genome.chromsizes", log: - f"{REFERENCES}/logs/genome.chromsizes.log", + "references/logs/genome.chromsizes.log", params: java_args="-Xmx20g", # [disable for test] # java_args='-Xmx2g' # [enable for test] @@ -283,9 +282,9 @@ rule mappings: Creates gzipped TSV mapping between attributes in the GTF. 
""" input: - gtf=f"{REFERENCES}/annotation.gtf", + gtf="references/annotation.gtf", output: - f"{REFERENCES}/annotation.mapping.tsv.gz", + "references/annotation.mapping.tsv.gz", params: include_featuretypes=lambda wildcards, output: conversion_kwargs[ output[0] @@ -412,7 +411,7 @@ rule star: input: fastq=rules.cutadapt.output, index=rules.star_index.output, - annotation=f"{REFERENCES}/annotation.gtf", + annotation="references/annotation.gtf", output: bam=temporary("data/rnaseq_samples/{sample}/{sample}.cutadapt.bam"), sjout=temporary( @@ -474,7 +473,7 @@ rule star: rule rRNA: input: fastq="data/rnaseq_samples/{sample}/{sample}_R1.cutadapt.fastq.gz", - index=f"{REFERENCES}/bowtie2/rrna.1.bt2", + index="references/bowtie2/rrna.1.bt2", output: bam="data/rnaseq_samples/{sample}/rRNA/{sample}.cutadapt.rrna.bam", log: @@ -569,7 +568,7 @@ rule markduplicates: rule featurecounts: input: - annotation=f"{REFERENCES}/annotation.gtf", + annotation="references/annotation.gtf", bam=rules.markduplicates.output.bam, output: "data/rnaseq_samples/{sample}/{sample}_featurecounts.txt", @@ -703,7 +702,7 @@ rule preseq: rule salmon: input: fastq=rules.cutadapt.output, - index=REFERENCES + "/salmon/versionInfo.json", + index="references/salmon/versionInfo.json", output: "data/rnaseq_samples/{sample}/{sample}.salmon/quant.sf", log: From 7fa4b69dda381b8dbfe3870535804847f281c2bc Mon Sep 17 00:00:00 2001 From: Ryan Dale <115406+daler@users.noreply.github.com> Date: Sat, 11 Oct 2025 00:43:37 +0000 Subject: [PATCH 129/196] simplify references for chipseq --- workflows/chipseq/Snakefile | 25 ++++++++++++------------- 1 file changed, 12 insertions(+), 13 deletions(-) diff --git a/workflows/chipseq/Snakefile b/workflows/chipseq/Snakefile index 1393e034..08aa75b9 100644 --- a/workflows/chipseq/Snakefile +++ b/workflows/chipseq/Snakefile @@ -11,7 +11,6 @@ from lib import chipseq configfile: "config/config.yaml" -REFERENCES = config.get("reference_dir", "../../references") sampletable = 
pd.read_table(config["sampletable"], sep="\t", comment="#") sampletable = sampletable.set_index(sampletable.columns[0], drop=False) is_paired = utils.detect_layout(sampletable) == "PE" @@ -41,9 +40,9 @@ rule all: rule fasta: output: - f"{REFERENCES}/genome.fa.gz", + "references/genome.fa.gz", log: - f"{REFERENCES}/logs/genome.fa.gz.log", + "references/logs/genome.fa.gz.log", resources: mem_mb="4g", runtime="2h", @@ -61,11 +60,11 @@ rule fasta: rule chromsizes: input: - f"{REFERENCES}/genome.fa.gz", + "references/genome.fa.gz", output: - f"{REFERENCES}/genome.chromsizes", + "references/genome.chromsizes", log: - f"{REFERENCES}/logs/genome.chromsizes.log", + "references/logs/genome.chromsizes.log", params: java_args="-Xmx20g", # [disable for test] # java_args='-Xmx2g' # [enable for test] @@ -87,9 +86,9 @@ rule chromsizes: rule unzip: input: - f"{REFERENCES}/{{prefix}}{{ext}}.gz", + "references/{prefix}{ext}.gz", output: - f"{REFERENCES}/{{prefix}}{{ext}}", + "references/{prefix}{ext}", resources: mem="4g", runtime="2h", @@ -99,12 +98,12 @@ rule unzip: rule bowtie2_index: input: - f"{REFERENCES}/genome.fa", + "references/genome.fa", output: - f"{REFERENCES}/bowtie2/genome.1.bt2", - f"{REFERENCES}/bowtie2/genome.fa", + "references/bowtie2/genome.1.bt2", + "references/bowtie2/genome.fa", log: - f"{REFERENCES}/logs/bowtie2_genome.log", + "references/logs/bowtie2_genome.log", resources: mem="32g", disk="50g", @@ -235,7 +234,7 @@ rule bowtie2: input: fastq=expand( "data/chipseq_samples/{sample}/{sample}_R{n}.cutadapt.fastq.gz", n=n, allow_missing=True), - index=f"{REFERENCES}/bowtie2/genome.1.bt2", + index="references/bowtie2/genome.1.bt2", output: bam=temporary("data/chipseq_samples/{sample}/{sample}.cutadapt.bam"), log: From 2989fcd609288e4d781911ac7ac3f60e60713fe2 Mon Sep 17 00:00:00 2001 From: Ryan Dale <115406+daler@users.noreply.github.com> Date: Sat, 11 Oct 2025 00:51:29 +0000 Subject: [PATCH 130/196] snakefmt on rnaseq --- workflows/rnaseq/Snakefile | 46 
+++++++++++++------------------------- 1 file changed, 16 insertions(+), 30 deletions(-) diff --git a/workflows/rnaseq/Snakefile b/workflows/rnaseq/Snakefile index 2ac4a01b..6f5d3153 100644 --- a/workflows/rnaseq/Snakefile +++ b/workflows/rnaseq/Snakefile @@ -62,7 +62,7 @@ rule fasta: runtime="2h", params: urls=config["fasta"]["url"], - postprocess=config["fasta"].get("postprocess", None) + postprocess=config["fasta"].get("postprocess", None), run: utils.download_and_postprocess( urls=params.urls, @@ -82,7 +82,7 @@ rule gtf: runtime="2h", params: urls=config["gtf"]["url"], - postprocess=config["gtf"].get("postprocess", None) + postprocess=config["gtf"].get("postprocess", None), run: utils.download_and_postprocess( urls=params.urls, @@ -102,7 +102,7 @@ rule rrna_fasta: runtime="2h", params: urls=config["rrna"]["url"], - postprocess=config["rrna"].get("postprocess", None) + postprocess=config["rrna"].get("postprocess", None), run: utils.download_and_postprocess( urls=params.urls, @@ -139,12 +139,7 @@ rule rrna_index: threads: 8 run: prefix = subpath(output[0], strip_suffix=".1.bt2") - shell( - "bowtie2-build " - "--threads {threads} " - "{input} " - "{prefix} &> {log}" - ) + shell("bowtie2-build --threads {threads} {input} {prefix} &> {log}") utils.make_relative_symlink(input[0], output[-1]) @@ -213,12 +208,7 @@ rule salmon_index: runtime="2h", run: outdir = os.path.dirname(output[0]) - shell( - "salmon index " - "--transcripts {input} " - "--index {outdir} " - "&> {log}" - ) + shell("salmon index --transcripts {input} --index {outdir} &> {log}") rule conversion_refflat: @@ -259,8 +249,8 @@ rule chromsizes: log: "references/logs/genome.chromsizes.log", params: - java_args="-Xmx20g", # [disable for test] # java_args='-Xmx2g' # [enable for test] + java_args="-Xmx20g", # [disable for test] resources: mem="24g", runtime="2h", @@ -294,6 +284,7 @@ rule mappings: runtime="2h", run: import gffutils + gffutils.constants.always_return_list = False include_featuretypes = 
params.include_featuretypes @@ -382,8 +373,6 @@ rule fastqc: log: "data/rnaseq_samples/{sample}/fastqc/{sample}{suffix}_fastqc.log", run: - # Calculate the paths FastQC will create so we can move them to - # specified output files if needed. outdir = os.path.dirname(output.html) or "." outfile = os.path.basename(input[0]) for s in [".fastq", ".fq", ".gz", ".bam"]: @@ -442,7 +431,6 @@ rule star: "{tmpdir_arg} " "--outSAMtype BAM SortedByCoordinate " "--outStd BAM_SortedByCoordinate > {output.bam} " - # NOTE: The STAR docs indicate that the following parameters are # standard options for ENCODE long-RNA-seq pipeline. Comments are from # the STAR docs. @@ -462,12 +450,9 @@ rule star: # move various hard-coded log files to log directory logfiles = expand( prefix + "{ext}", - ext=["Log.progress.out", "Log.out", "Log.final.out", "Log.std.out"] - ) - shell( - "mkdir -p {outdir}/star_logs " - "&& mv {logfiles} {outdir}/star_logs" + ext=["Log.progress.out", "Log.out", "Log.final.out", "Log.std.out"], ) + shell("mkdir -p {outdir}/star_logs && mv {logfiles} {outdir}/star_logs") rule rRNA: @@ -482,7 +467,6 @@ rule rRNA: resources: mem="2g", runtime="2h", - params: run: prefix = subpath(input.index, strip_suffix=".1.bt2") shell( @@ -493,13 +477,15 @@ rule rRNA: "--no-unal " "-k 1 " "-S {output.bam}.sam " - "> {log} 2>&1 ") + "> {log} 2>&1 " + ) shell( "samtools view -Sb {output.bam}.sam " "| samtools sort -O BAM - -o {output.bam}" ) shell("rm {output.bam}.sam") + rule fastq_count: input: fastq="{sample_dir}/{sample}/{sample}{suffix}.fastq.gz", @@ -553,8 +539,8 @@ rule markduplicates: runtime="2h", disk="100g", params: - java_args="-Xmx20g", # [disable for test] # java_args='-Xmx2g' # [enable for test] + java_args="-Xmx20g", # [disable for test] shell: "picard " "{params.java_args} " @@ -610,7 +596,7 @@ rule aggregate_featurecounts: threads: 1 resources: mem="8g", - runtime="1h" + runtime="1h", run: for i, file in enumerate(input): df = pd.read_csv(file, sep="\t", comment="#") 
@@ -659,8 +645,8 @@ rule collectrnaseqmetrics: mem="32g", runtime="2h", params: + # java_args='-Xmx2g', # [enable for test] java_args="-Xmx20g", # [disable for test] - # java_args='-Xmx2g', # [enable for test] strand_arg={ "unstranded": "STRAND=NONE ", "fr-firststrand": "STRAND=SECOND_READ_TRANSCRIPTION_STRAND ", @@ -942,8 +928,8 @@ if utils.detect_sra(sampletable): log: "original_data/sra_samples/{sample}/{sample}.fastq.gz.log", params: - is_paired=is_paired, # extra="-X 100000", # [enable for test] + is_paired=is_paired, resources: mem="1g", disk="1g", From 288c7077029c8376c71d42f42ebf6fdd7cea846a Mon Sep 17 00:00:00 2001 From: Ryan Dale <115406+daler@users.noreply.github.com> Date: Sat, 11 Oct 2025 00:56:07 +0000 Subject: [PATCH 131/196] snakefmt on chipseq --- workflows/chipseq/Snakefile | 98 ++++++++++++++++++++----------------- 1 file changed, 52 insertions(+), 46 deletions(-) diff --git a/workflows/chipseq/Snakefile b/workflows/chipseq/Snakefile index 08aa75b9..46a37a5e 100644 --- a/workflows/chipseq/Snakefile +++ b/workflows/chipseq/Snakefile @@ -34,7 +34,10 @@ localrules: rule all: input: "data/chipseq_aggregation/multiqc.html", - expand("data/chipseq_merged/{label}/{label}.cutadapt.unique.nodups.bam.bigwig", label=LABELS), + expand( + "data/chipseq_merged/{label}/{label}.cutadapt.unique.nodups.bam.bigwig", + label=LABELS, + ), [v["bed"] for k, v in peaks.items()], @@ -48,7 +51,7 @@ rule fasta: runtime="2h", params: urls=config["fasta"]["url"], - postprocess=config["fasta"].get("postprocess", None) + postprocess=config["fasta"].get("postprocess", None), run: utils.download_and_postprocess( urls=params.urls, @@ -66,8 +69,8 @@ rule chromsizes: log: "references/logs/genome.chromsizes.log", params: - java_args="-Xmx20g", # [disable for test] # java_args='-Xmx2g' # [enable for test] + java_args="-Xmx20g", # [disable for test] resources: mem="24g", runtime="2h", @@ -111,12 +114,7 @@ rule bowtie2_index: threads: 8 run: prefix = subpath(output[0], 
strip_suffix=".1.bt2") - shell( - "bowtie2-build " - "--threads {threads} " - "{input} " - "{prefix} &> {log}" - ) + shell("bowtie2-build --threads {threads} {input} {prefix} &> {log}") utils.make_relative_symlink(input[0], output[-1]) @@ -128,8 +126,11 @@ rule symlinks: else sampletable.loc[wc.sample, ["orig_filename"]] ), output: - expand("data/chipseq_samples/{sample}/{sample}_R{n}.fastq.gz", n=n, - allow_missing=True), + expand( + "data/chipseq_samples/{sample}/{sample}_R{n}.fastq.gz", + n=n, + allow_missing=True, + ), threads: 1 resources: mem="1g", @@ -151,11 +152,15 @@ rule cutadapt: input: fastq=expand( "data/chipseq_samples/{sample}/{sample}_R{n}.fastq.gz", - n=n, allow_missing=True), + n=n, + allow_missing=True, + ), output: fastq=expand( "data/chipseq_samples/{sample}/{sample}_R{n}.cutadapt.fastq.gz", - n=n, allow_missing=True), + n=n, + allow_missing=True, + ), log: "data/chipseq_samples/{sample}/{sample}_cutadapt.fastq.gz.log", threads: 6 @@ -192,7 +197,6 @@ rule cutadapt: ) - rule fastqc: input: "data/chipseq_samples/{sample}/{sample}{suffix}", @@ -206,8 +210,6 @@ rule fastqc: log: "data/chipseq_samples/{sample}/fastqc/{sample}{suffix}_fastqc.log", run: - # Calculate the paths FastQC will create so we can move them to - # specified output files if needed. outdir = os.path.dirname(output.html) or "." 
outfile = os.path.basename(input[0]) for s in [".fastq", ".fq", ".gz", ".bam"]: @@ -233,7 +235,10 @@ rule fastqc: rule bowtie2: input: fastq=expand( - "data/chipseq_samples/{sample}/{sample}_R{n}.cutadapt.fastq.gz", n=n, allow_missing=True), + "data/chipseq_samples/{sample}/{sample}_R{n}.cutadapt.fastq.gz", + n=n, + allow_missing=True, + ), index="references/bowtie2/genome.1.bt2", output: bam=temporary("data/chipseq_samples/{sample}/{sample}.cutadapt.bam"), @@ -258,7 +263,8 @@ rule bowtie2: "--threads {threads} " "--no-unal " "-S {output.bam}.sam " - "> {log} 2>&1 ") + "> {log} 2>&1 " + ) shell( "samtools view -Sb {output.bam}.sam " "| samtools sort -O BAM - -o {output.bam}" @@ -275,13 +281,9 @@ rule unique: resources: mem="1g", runtime="2h", - params: shell: "samtools view " "-b " - # NOTE: the quality score chosen here should reflect the scores output - # by the aligner used. For example, STAR uses 255 as max mapping - # quality. "-q 20 " "{input} " "> {output}" @@ -331,17 +333,17 @@ rule markduplicates: bam="data/chipseq_samples/{sample}/{sample}.cutadapt.unique.bam", output: bam="data/chipseq_samples/{sample}/{sample}.cutadapt.unique.nodups.bam", - metrics="data/chipseq_samples/{sample}/{sample}.cutadapt.unique.nodups.bam.metrics" + metrics="data/chipseq_samples/{sample}/{sample}.cutadapt.unique.nodups.bam.metrics", log: - "data/chipseq_samples/{sample}/{sample}.cutadapt.unique.nodups.bam.log" + "data/chipseq_samples/{sample}/{sample}.cutadapt.unique.nodups.bam.log", threads: 1 resources: mem="32g", disk="100g", runtime="2h", params: - java_args="-Xmx20g", # [disable for test] # java_args='-Xmx2g' # [enable for test] + java_args="-Xmx20g", # [disable for test] shell: "picard " "{params.java_args} " @@ -364,15 +366,15 @@ rule merge_techreps: bam="data/chipseq_merged/{label}/{label}.cutadapt.unique.nodups.merged.bam", metrics="data/chipseq_merged/{label}/{label}.cutadapt.unique.nodups.merged.bam.metrics", log: - 
"data/chipseq_merged/{label}/{label}.cutadapt.unique.nodups.merged.bam.log" + "data/chipseq_merged/{label}/{label}.cutadapt.unique.nodups.merged.bam.log", threads: 1 resources: mem="32g", disk="100g", runtime="2h", params: - java_args="-Xmx32g", # [disable for test] # java_args='-Xmx2g' # [enable for test] + java_args="-Xmx32g", # [disable for test] script: "../../scripts/merge_and_dedup.py" @@ -386,14 +388,14 @@ if is_paired: pdf="data/chipseq_samples/{sample}/{sample}.cutadapt.unique.nodups.bam.collectinsertsizemetrics.pdf", metrics="data/chipseq_samples/{sample}/{sample}.cutadapt.unique.nodups.bam.collectinsertsizemetrics.metrics", log: - "data/chipseq_samples/{sample}/{sample}.cutadapt.unique.nodups.bam.collectinsertsizemetrics.metrics.log" + "data/chipseq_samples/{sample}/{sample}.cutadapt.unique.nodups.bam.collectinsertsizemetrics.metrics.log", threads: 1 resources: mem="32g", runtime="2h", params: - java_args="-Xmx20g", # [disable for test] # java_args='-Xmx2g' # [enable for test] + java_args="-Xmx20g", # [disable for test] shell: "picard " "{params.java_args} " @@ -424,20 +426,24 @@ rule bigwig: "--minMappingQuality 20 " "--ignoreDuplicates " "--extendReads 300 " - "--normalizeUsing CPM " # [disable for test] + "--normalizeUsing CPM " "&> {log}" rule fingerprint: input: - bams=lambda wc: expand("data/chipseq_merged/{label}/{label}.cutadapt.unique.nodups.merged.bam", label=wc.ip_label), + bams=lambda wc: expand( + "data/chipseq_merged/{label}/{label}.cutadapt.unique.nodups.merged.bam", + label=wc.ip_label, + ), control=lambda wc: expand( "data/chipseq_merged/{label}/{label}.cutadapt.unique.nodups.merged.bam", label=chipseq.merged_input_for_ip(sampletable, wc.ip_label), ), bais=lambda wc: expand( "data/chipseq_merged/{label}/{label}.cutadapt.unique.nodups.merged.bam.bai", - label=wc.ip_label), + label=wc.ip_label, + ), control_bais=lambda wc: expand( "data/chipseq_merged/{label}/{label}.cutadapt.unique.nodups.merged.bam.bai", 
label=chipseq.merged_input_for_ip(sampletable, wc.ip_label), @@ -478,6 +484,7 @@ rule fingerprint: ) + rule macs: input: ip=lambda wc: expand( @@ -527,7 +534,7 @@ rule epic2: mem="16g", runtime="2h", log: - "data/chipseq_peaks/epic2/{epic2_run}/peaks.bed.log" + "data/chipseq_peaks/epic2/{epic2_run}/peaks.bed.log", params: block=lambda wc: chipseq.block_for_run(config, wc.epic2_run, "epic2"), is_paired=is_paired, @@ -558,7 +565,10 @@ rule multibigwigsummary: Summarize the bigWigs across genomic bins """ input: - expand("data/chipseq_merged/{label}/{label}.cutadapt.unique.nodups.bam.bigwig", label=sampletable.label), + expand( + "data/chipseq_merged/{label}/{label}.cutadapt.unique.nodups.bam.bigwig", + label=sampletable.label, + ), output: npz="data/chipseq_aggregation/deeptools/multibigwigsummary_matrix.npz", tab="data/chipseq_aggregation/deeptools/multibigwigsummary.tab", @@ -600,12 +610,6 @@ rule plotcorrelation: "--corMethod spearman " "--whatToPlot heatmap " "--colorMap Reds " - # NOTE: if you're expecting negative correlation, try a divergent - # colormap and setting the min/max to ensure that the colomap is - # centered on zero: - # '--colorMap RdBu_r ' - # '--zMin -1 ' - # '--zMax 1 ' rule samtools_idxstats: @@ -618,7 +622,7 @@ rule samtools_idxstats: mem="16g", runtime="2h", log: - "data/chipseq_samples/{sample}/samtools_idxstats_{sample}.txt.log" + "data/chipseq_samples/{sample}/samtools_idxstats_{sample}.txt.log", shell: "samtools idxstats {input.bam} 2> {log} 1> {output.txt}" @@ -633,7 +637,7 @@ rule samtools_flagstat: mem="8g", runtime="2h", log: - "data/chipseq_samples/{sample}/samtools_flagstat_{sample}.txt.log" + "data/chipseq_samples/{sample}/samtools_flagstat_{sample}.txt.log", shell: "samtools flagstat {input.bam} > {output}" @@ -648,7 +652,7 @@ rule samtools_stats: mem="8g", runtime="2h", log: - "data/chipseq_samples/{sample}/samtools_stats_{sample}.txt.log" + "data/chipseq_samples/{sample}/samtools_stats_{sample}.txt.log", shell: "samtools stats 
{input.bam} > {output}" @@ -675,8 +679,10 @@ rule multiqc: ), expand( "data/chipseq_samples/{sample}/{sample}.cutadapt.unique.nodups.bam.collectinsertsizemetrics.metrics", - sample=SAMPLES - ) if is_paired else [], + sample=SAMPLES, + ) + if is_paired + else [], [v["bigbed"] for v in peaks.values()], config="config/multiqc_config.yaml", output: @@ -727,8 +733,8 @@ if utils.detect_sra(sampletable): log: "original_data/sra_samples/{sample}/{sample}.fastq.gz.log", params: - is_paired=is_paired, # extra="-X 100000", # [enable for test] + is_paired=is_paired, resources: mem="1g", disk="1g", From 7f281bc1748101b8cbd3ca4ae38ab27242f04989 Mon Sep 17 00:00:00 2001 From: Ryan Dale <115406+daler@users.noreply.github.com> Date: Sat, 11 Oct 2025 01:09:17 +0000 Subject: [PATCH 132/196] update .gitignore --- .gitignore | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.gitignore b/.gitignore index ab3fd51e..b1f7c8ca 100644 --- a/.gitignore +++ b/.gitignore @@ -66,3 +66,6 @@ workflows/rnaseq/downstream/rnaseq.html ._* Rplots.pdf /lib/include/* + +workflows/*/references + From 917c90f8ec3f4d7eaf5ee57b986e3ebdcac69d0e Mon Sep 17 00:00:00 2001 From: Ryan Dale <115406+daler@users.noreply.github.com> Date: Sat, 11 Oct 2025 09:44:49 -0400 Subject: [PATCH 133/196] hard-code peaks dir in chipseq_trackhub.py --- workflows/chipseq/chipseq_trackhub.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/workflows/chipseq/chipseq_trackhub.py b/workflows/chipseq/chipseq_trackhub.py index 5726fc02..4e520be2 100644 --- a/workflows/chipseq/chipseq_trackhub.py +++ b/workflows/chipseq/chipseq_trackhub.py @@ -38,6 +38,8 @@ # details config = yaml.load(open(args.config), Loader=yaml.FullLoader) +peaks_dir = "data/chipseq_peaks" + if args.additional_configs: for cfg in args.additional_configs: update_config(config, yaml.load(open(cfg), Loader=yaml.FullLoader)) @@ -208,14 +210,14 @@ def decide_color(samplename): # ASSUMPTION: BED filename pattern bed_filename = os.path.join( 
- config['peaks_dir'], + peaks_dir, algorithm, label, 'peaks.bed') # ASSUMPTION: bigBed filename pattern bigbed_filename = os.path.join( - config['peaks_dir'], + peaks_dir, algorithm, label, 'peaks.bigbed') @@ -241,7 +243,7 @@ def decide_color(samplename): if algorithm == "sicer": subgroup['peaks'] = 'no' - prefilter_wig = glob.glob(os.path.join(config['peaks_dir'], + prefilter_wig = glob.glob(os.path.join(peaks_dir, algorithm, label, '*prefilter.bigWig')) @@ -249,7 +251,7 @@ def decide_color(samplename): prefilter_wig = prefilter_wig[0] else: raise ValueError('SICER output for {0} has no prefilter bigWig file'.format(label)) - postfilter_wig = glob.glob(os.path.join(config['peaks_dir'], + postfilter_wig = glob.glob(os.path.join(peaks_dir, algorithm, label, '*postfilter.bigWig')) From e298d290ba281cad9488b946ab57064e92a7a69b Mon Sep 17 00:00:00 2001 From: Ryan Dale <115406+daler@users.noreply.github.com> Date: Sat, 11 Oct 2025 15:41:54 +0000 Subject: [PATCH 134/196] use .gz for those rules that can --- workflows/chipseq/Snakefile | 14 +------------- workflows/rnaseq/Snakefile | 12 ++++++------ 2 files changed, 7 insertions(+), 19 deletions(-) diff --git a/workflows/chipseq/Snakefile b/workflows/chipseq/Snakefile index 46a37a5e..8b2b9004 100644 --- a/workflows/chipseq/Snakefile +++ b/workflows/chipseq/Snakefile @@ -87,21 +87,9 @@ rule chromsizes: "&& rm -f {output}.tmp " -rule unzip: - input: - "references/{prefix}{ext}.gz", - output: - "references/{prefix}{ext}", - resources: - mem="4g", - runtime="2h", - shell: - "gunzip -c {input} > {output}" - - rule bowtie2_index: input: - "references/genome.fa", + "references/genome.fa.gz", output: "references/bowtie2/genome.1.bt2", "references/bowtie2/genome.fa", diff --git a/workflows/rnaseq/Snakefile b/workflows/rnaseq/Snakefile index 6f5d3153..8ccb0ba4 100644 --- a/workflows/rnaseq/Snakefile +++ b/workflows/rnaseq/Snakefile @@ -116,7 +116,7 @@ rule unzip: input: "references/{prefix}.gz", output: - "references/{prefix}", + 
temporary("references/{prefix}"), resources: mem="4g", runtime="2h", @@ -126,10 +126,10 @@ rule unzip: rule rrna_index: input: - "references/rrna.fa", + "references/rrna.fa.gz", output: "references/bowtie2/rrna.1.bt2", - "references/bowtie2/rrna.fa", + "references/bowtie2/rrna.fa.gz", log: "references/logs/bowtie2_rrna.log", resources: @@ -213,7 +213,7 @@ rule salmon_index: rule conversion_refflat: input: - "references/annotation.gtf", + "references/annotation.gtf.gz", output: "references/annotation.refflat", log: @@ -229,7 +229,7 @@ rule conversion_refflat: rule conversion_bed12: input: - "references/annotation.gtf", + "references/annotation.gtf.gz", output: "references/annotation.bed12", resources: @@ -272,7 +272,7 @@ rule mappings: Creates gzipped TSV mapping between attributes in the GTF. """ input: - gtf="references/annotation.gtf", + gtf="references/annotation.gtf.gz", output: "references/annotation.mapping.tsv.gz", params: From e4642674ed0ef9ddb152b6e653a57438d6e0b5be Mon Sep 17 00:00:00 2001 From: Ryan Dale <115406+daler@users.noreply.github.com> Date: Sat, 11 Oct 2025 15:57:52 +0000 Subject: [PATCH 135/196] temporarily name-sort PE bams for featurecounts --- workflows/rnaseq/Snakefile | 56 ++++++++++++++++++-------------------- 1 file changed, 27 insertions(+), 29 deletions(-) diff --git a/workflows/rnaseq/Snakefile b/workflows/rnaseq/Snakefile index 8ccb0ba4..b610419c 100644 --- a/workflows/rnaseq/Snakefile +++ b/workflows/rnaseq/Snakefile @@ -552,14 +552,36 @@ rule markduplicates: "&> {log}" -rule featurecounts: +rule namesorted_bam: input: - annotation="references/annotation.gtf", bam=rules.markduplicates.output.bam, output: - "data/rnaseq_samples/{sample}/{sample}_featurecounts.txt", + temporary( + "data/rnaseq_samples/{sample}/{sample}.cutadapt.markdups.namesort.bam" + ), + threads: 1 + resources: + mem="16g", + runtime="2h", + shell: + "samtools sort -n -o {output} {input}" + + +rule featurecounts: + input: + 
annotation="references/annotation.gtf.gz", + bam=expand( + ( + rules.namesorted_bam.output + if is_paired + else rules.markduplicates.output.bam + ), + sample=SAMPLES, + ), + output: + "data/rnaseq_aggregation/featurecounts.txt", log: - "data/rnaseq_samples/{sample}/{sample}_featurecounts.txt.log", + "data/rnaseq_aggregation/featurecounts.txt.log", threads: 8 resources: mem="16g", @@ -584,30 +606,6 @@ rule featurecounts: ) -rule aggregate_featurecounts: - input: - expand( - "data/rnaseq_samples/{sample}/{sample}_featurecounts.txt", sample=SAMPLES - ), - output: - "data/rnaseq_aggregation/featurecounts.txt", - log: - "data/rnaseq_aggregation/featurecounts.txt.log", - threads: 1 - resources: - mem="8g", - runtime="1h", - run: - for i, file in enumerate(input): - df = pd.read_csv(file, sep="\t", comment="#") - df = df.set_index("Geneid", drop=False) - if i == 0: - final = df - continue - final[df.columns[-1]] = df[df.columns[-1]] - final.to_csv(output[0], sep="\t", index=False) - - rule rrna_libsizes_table: input: rrna=expand( @@ -877,7 +875,7 @@ rule multiqc: expand(rules.bigwig_pos.output, sample=SAMPLES), expand(rules.bigwig_neg.output, sample=SAMPLES), rules.rrna_libsizes_table.output, - rules.aggregate_featurecounts.output, + rules.featurecounts.output, ), config="config/multiqc_config.yaml", output: From 087e008f94a78d2eec923575b2782ef7a71268ac Mon Sep 17 00:00:00 2001 From: Ryan Dale <115406+daler@users.noreply.github.com> Date: Tue, 14 Oct 2025 22:16:00 +0000 Subject: [PATCH 136/196] improve mappings.tsv generation --- lib/utils.py | 63 ++++++++++++++++++++++++++++++++++++++ workflows/rnaseq/Snakefile | 41 ++++++------------------- 2 files changed, 73 insertions(+), 31 deletions(-) diff --git a/lib/utils.py b/lib/utils.py index c7e4bf7b..8593f534 100644 --- a/lib/utils.py +++ b/lib/utils.py @@ -4,6 +4,7 @@ import gzip import os import re +import sys import subprocess import warnings from collections.abc import Iterable @@ -15,6 +16,8 @@ from Bio import SeqIO 
from snakemake.io import expand, regex_from_filepattern from snakemake.shell import shell +import gffutils +import csv # Small helper functions @@ -1190,4 +1193,64 @@ def wrapper_for(path): def detect_sra(sampletable): return 'Run' in sampletable.columns and any(sampletable['Run'].str.startswith('SRR')) + +def mappings_tsv(gtf, tsv, exclude_featuretypes=None, include_featuretypes=None, include_attributes=None): + """ + Create a TSV file of attributes found in a GTF file. + + Parameters + ---------- + + gtf, tsv : str + Input and output filenames respectively + + exclude_featuretypes, include_featuretypes : list + Mutually exclusive; use these to restrict the features considered. + E.g., we likely don't need entries for start_codon if those are in the + GTF. + + include_attributes : list + Restrict the attributes reported in the TSV. Should at least have + a column for gene ID and transcript ID in order for downstream RNA-seq + work. + """ + + if exclude_featuretypes and include_featuretypes: + raise ValueError("Both include_featuretypes and exclude_featuretypes were specified.") + + res = [] + keys = set(['__featuretype__']) + seen = set() + for f in gffutils.DataIterator(gtf): + ft = f.featuretype + if exclude_featuretypes and ft in exclude_featuretypes: + continue + if include_featuretypes and ft not in include_featuretypes: + continue + d = dict(f.attributes) + keys.update(d.keys()) + d["__featuretype__"] = ft + h = hash(str(d)) + if h in seen: + continue + seen.update([h]) + res.append(d) + + def unlist_dict(d): + for k, v in d.items(): + if isinstance(v, list): + d[k] = "|".join(v) + return d + + if include_attributes: + sorted_keys = sorted(include_attributes) + else: + sorted_keys = sorted(keys) + with open(tsv, 'w') as fout: + writer = csv.DictWriter(fout, fieldnames=sorted_keys, restval="", delimiter='\t') + writer.writeheader() + for row in res: + writer.writerow(unlist_dict(row)) + + # vim: ft=python diff --git a/workflows/rnaseq/Snakefile 
b/workflows/rnaseq/Snakefile index b610419c..ab82deeb 100644 --- a/workflows/rnaseq/Snakefile +++ b/workflows/rnaseq/Snakefile @@ -31,6 +31,7 @@ localrules: rule all: input: "data/rnaseq_aggregation/multiqc.html", + "references/annotation.mapping.tsv", rule symlinks: @@ -268,46 +269,24 @@ rule chromsizes: rule mappings: - """ - Creates gzipped TSV mapping between attributes in the GTF. - """ input: gtf="references/annotation.gtf.gz", output: - "references/annotation.mapping.tsv.gz", - params: - include_featuretypes=lambda wildcards, output: conversion_kwargs[ - output[0] - ].get("include_featuretypes", []), + tsv="references/annotation.mapping.tsv", resources: mem="2g", runtime="2h", run: - import gffutils - - gffutils.constants.always_return_list = False - - include_featuretypes = params.include_featuretypes - - res = [] - for f in gffutils.DataIterator(input[0]): - - ft = f.featuretype - - if include_featuretypes and (ft not in include_featuretypes): - continue - - d = dict(f.attributes) - d["__featuretype__"] = ft - res.append(d) - - df = pandas.DataFrame(res) + mappings_args = dict( + exclude_featuretypes=None, + include_featuretypes=None, + include_attributes=None, + ) + print(config["annotation"].get("mappings", {})) - # Depending on how many attributes there were and the - # include_featuretypes settings, this may take a while. 
- df = df.drop_duplicates() + mappings_args.update(config["annotation"].get("mappings", {})) - df.to_csv(output[0], sep="\t", index=False, compression="gzip") + utils.mappings_tsv(input.gtf, output.tsv, **mappings_args) rule symlink_targets: From a2cc1855a9ed682f37df50438f8610b5b3578697 Mon Sep 17 00:00:00 2001 From: Ryan Dale <115406+daler@users.noreply.github.com> Date: Tue, 14 Oct 2025 22:16:33 +0000 Subject: [PATCH 137/196] minor refactoring in utils.py --- lib/utils.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/lib/utils.py b/lib/utils.py index 8593f534..b74fc7e4 100644 --- a/lib/utils.py +++ b/lib/utils.py @@ -265,7 +265,7 @@ def extract_wildcards(pattern, target): return m.groupdict() -def _is_gzipped(fn): +def is_gzipped(fn): """ Filename-independent method of checking if a file is gzipped or not. Uses the magic number. @@ -280,7 +280,7 @@ def openfile(tmp, mode): """ Returns an open file handle; auto-detects gzipped files. """ - if _is_gzipped(tmp): + if is_gzipped(tmp): return gzip.open(tmp, mode) else: return open(tmp, mode) @@ -783,6 +783,8 @@ def twobit_to_fasta(tmpfiles, outfile): shell("cat {fastas} | gzip -c > {outfile}") shell("rm {fastas}") +def default_postprocess(origfn, newfn): + shell("mv {origfn} {newfn}") def download_and_postprocess(urls, postprocess, outfile, log): """ @@ -865,8 +867,6 @@ def func(infiles, outfile, *args, **kwargs): """ - def default_postprocess(origfn, newfn): - shell("mv {origfn} {newfn}") if not isinstance(postprocess, list): postprocess = [postprocess] @@ -990,7 +990,7 @@ def default_postprocess(origfn, newfn): for i in to_delete: if os.path.exists(i): shell("rm {i}") - if not _is_gzipped(outfile): + if not is_gzipped(outfile): raise ValueError(f"{outfile} does not appear to be gzipped.") @@ -1181,7 +1181,7 @@ def gff2gtf(gff, gtf): """ Converts a gff file to a gtf format using the gffread function from Cufflinks """ - if _is_gzipped(gff[0]): + if is_gzipped(gff[0]): shell("gzip -d 
-S .gz.0.tmp {gff} -c | gffread - -T -o- | gzip -c > {gtf}") else: shell("gffread {gff} -T -o- | gzip -c > {gtf}") From a4485ec0ccbbd40085b2afcea7cf9582d9664432 Mon Sep 17 00:00:00 2001 From: Ryan Dale <115406+daler@users.noreply.github.com> Date: Tue, 14 Oct 2025 22:17:30 +0000 Subject: [PATCH 138/196] fasta -> genome and gtf -> annotation in configs --- workflows/rnaseq/Snakefile | 10 +++++----- workflows/rnaseq/config/config.yaml | 4 ++-- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/workflows/rnaseq/Snakefile b/workflows/rnaseq/Snakefile index ab82deeb..b299dcdd 100644 --- a/workflows/rnaseq/Snakefile +++ b/workflows/rnaseq/Snakefile @@ -62,8 +62,8 @@ rule fasta: mem_mb="4g", runtime="2h", params: - urls=config["fasta"]["url"], - postprocess=config["fasta"].get("postprocess", None), + urls=config["genome"]["url"], + postprocess=config["genome"].get("postprocess", None), run: utils.download_and_postprocess( urls=params.urls, @@ -73,7 +73,7 @@ rule fasta: ) -rule gtf: +rule annotation: output: "references/annotation.gtf.gz", log: @@ -82,8 +82,8 @@ rule gtf: mem="4g", runtime="2h", params: - urls=config["gtf"]["url"], - postprocess=config["gtf"].get("postprocess", None), + urls=config["annotation"]["url"], + postprocess=config["annotation"].get("postprocess", None), run: utils.download_and_postprocess( urls=params.urls, diff --git a/workflows/rnaseq/config/config.yaml b/workflows/rnaseq/config/config.yaml index 657c92a4..9047b4ab 100644 --- a/workflows/rnaseq/config/config.yaml +++ b/workflows/rnaseq/config/config.yaml @@ -5,11 +5,11 @@ stranded: 'fr-firststrand' # for dUTP libraries # 'fr-secondstrand' # for ligation libraries # 'unstranded' # for libraries without strand specificity -fasta: +genome: url: "https://raw.githubusercontent.com/lcdb/lcdb-test-data/master/data/seq/dm6.small.fa" postprocess: 'lib.utils.gzipped' -gtf: +annotation: url: "https://raw.githubusercontent.com/lcdb/lcdb-test-data/master/data/annotation/dm6.small.gtf" 
postprocess: 'lib.utils.gzipped' From ba3c7ce21a752839fbb31570f0908d4da2cbb4ed Mon Sep 17 00:00:00 2001 From: Ryan Dale <115406+daler@users.noreply.github.com> Date: Tue, 14 Oct 2025 22:18:14 +0000 Subject: [PATCH 139/196] add faidx rule --- workflows/rnaseq/Snakefile | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/workflows/rnaseq/Snakefile b/workflows/rnaseq/Snakefile index b299dcdd..c0574925 100644 --- a/workflows/rnaseq/Snakefile +++ b/workflows/rnaseq/Snakefile @@ -72,6 +72,16 @@ rule fasta: log=log, ) +rule faidx: + input: + "references/genome.fa" + output: + "references/genome.fa.fai" + resources: + mem_mb="4g", + runtime="2h", + shell: + "samtools faidx {input}" rule annotation: output: From 8a320202ff0d6f985c4e34574725cfbd9a86e08d Mon Sep 17 00:00:00 2001 From: Ryan Dale <115406+daler@users.noreply.github.com> Date: Tue, 14 Oct 2025 22:19:09 +0000 Subject: [PATCH 140/196] pep8 on postprocess.utils --- lib/postprocess/utils.py | 64 ++++++++++++++++++++++++---------------- 1 file changed, 38 insertions(+), 26 deletions(-) diff --git a/lib/postprocess/utils.py b/lib/postprocess/utils.py index f8fc64a6..1e254ef1 100644 --- a/lib/postprocess/utils.py +++ b/lib/postprocess/utils.py @@ -44,7 +44,7 @@ def extract_from_zip(tmpfiles, outfile, path_in_zip): shutil.rmtree(extraction_dir) -def match_gtf_9th(tmpfiles, outfile, strmatch, optstrand = "None"): +def match_gtf_9th(tmpfiles, outfile, strmatch, optstrand="None"): """ Matches string to the 9th field of GTF and an optional strand that defaults to None; if the pattern is found and the provided strand match then the line is excluded @@ -63,21 +63,26 @@ def match_gtf_9th(tmpfiles, outfile, strmatch, optstrand = "None"): optstrand : str String to match to the strand. 
Default is None """ - regex_strmatch = re.compile(r'|'.join(strmatch)) + regex_strmatch = re.compile(r"|".join(strmatch)) - with gzip.open(outfile, 'wt') as fout: + with gzip.open(outfile, "wt") as fout: for tmpfn in tmpfiles: - with openfile(tmpfn, 'rt') as tmp: + with openfile(tmpfn, "rt") as tmp: for line in tmp: if line.startswith("#"): fout.write(line) else: - toks = line.split('\t') - if not (regex_strmatch.search(toks[8]) != None and toks[6] == optstrand): + toks = line.split("\t") + if not ( + regex_strmatch.search(toks[8]) != None + and toks[6] == optstrand + ): fout.write(line) + # match_gtf_9th(['/home/esnaultcm/Downloads/Rattus_norvegicus.Rnor_6.0.94.gtf.gz'], "test.gz", ['ENSRNOG00000046319'], '-') + def convert_gtf_chroms(tmpfiles, outfile, conv_table): """ Convert chrom names in GTF file according to conversion table. @@ -95,28 +100,32 @@ def convert_gtf_chroms(tmpfiles, outfile, conv_table): read lookup table, so it can be file://, a path relative to the snakefile, or an http://, https://, or ftp:// URL. 
""" - lookup = pd.read_csv( - conv_table, sep='\t', header=None, names=('a', 'b') - ).set_index('a')['b'].to_dict() + lookup = ( + pd.read_csv(conv_table, sep="\t", header=None, names=("a", "b")) + .set_index("a")["b"] + .to_dict() + ) - with gzip.open(outfile, 'wt') as fout: + with gzip.open(outfile, "wt") as fout: for tmpfn in tmpfiles: - with openfile(tmpfn, 'rt') as tmp: + with openfile(tmpfn, "rt") as tmp: for line in tmp: if not line.startswith("#"): - toks = line.split('\t') + toks = line.split("\t") chrom = toks[0] if chrom in lookup.keys(): - toks[0]= lookup[chrom] - line = '\t'.join(toks) + toks[0] = lookup[chrom] + line = "\t".join(toks) else: raise ValueError( 'Chromosome "{chrom}" not found in conversion table ' - '"{conv_table}"' - .format(chrom=chrom, conv_table=conv_table) + '"{conv_table}"'.format( + chrom=chrom, conv_table=conv_table + ) ) fout.write(line) + def convert_fasta_chroms(tmpfiles, outfile, conv_table): """ Convert chrom names in fasta file according to conversion table. @@ -135,26 +144,29 @@ def convert_fasta_chroms(tmpfiles, outfile, conv_table): snakefile, or an http://, https://, or ftp:// URL. 
""" - lookup = pd.read_csv( - conv_table, sep='\t', header=None, names=('a', 'b') - ).set_index('a')['b'].to_dict() + lookup = ( + pd.read_csv(conv_table, sep="\t", header=None, names=("a", "b")) + .set_index("a")["b"] + .to_dict() + ) - with gzip.open(outfile, 'wt') as fout: + with gzip.open(outfile, "wt") as fout: for tmpfn in tmpfiles: - with openfile(tmpfn, 'rt') as tmp: + with openfile(tmpfn, "rt") as tmp: for line in tmp: if line.startswith(">"): line = line.rstrip("\n") - toks = line.split(' ') + toks = line.split(" ") chrom = toks[0].lstrip(">") chrom = chrom.rstrip("\n") if chrom in lookup.keys(): - toks[0]= ">" + lookup[chrom] - line = ' '.join(toks) + "\n" + toks[0] = ">" + lookup[chrom] + line = " ".join(toks) + "\n" else: raise ValueError( 'Chromosome "{chrom}" not found in conversion table ' - '"{conv_table}"' - .format(chrom=chrom, conv_table=conv_table) + '"{conv_table}"'.format( + chrom=chrom, conv_table=conv_table + ) ) fout.write(line) From 69f3ed6533e10740c35a12a146b865ae41fe4393 Mon Sep 17 00:00:00 2001 From: Ryan Dale <115406+daler@users.noreply.github.com> Date: Tue, 14 Oct 2025 22:19:27 +0000 Subject: [PATCH 141/196] support gtf and fasta filtering on regexps to support hg19 from gencode, which doesn't provide primary assembly and associated gtf --- lib/postprocess/utils.py | 112 +++++++++++++++++++++++++++++++++++++-- 1 file changed, 107 insertions(+), 5 deletions(-) diff --git a/lib/postprocess/utils.py b/lib/postprocess/utils.py index 1e254ef1..18fa5296 100644 --- a/lib/postprocess/utils.py +++ b/lib/postprocess/utils.py @@ -1,16 +1,118 @@ -import sys +import gzip +import logging import os import re -import gzip -import zipfile -import shutil +import sys import tempfile +import zipfile + +import gffutils import pandas as pd +from snakemake.shell import shell here = os.path.dirname(os.path.abspath(__file__)) sys.path.insert(0, os.path.join(here, "../../lib")) -from utils import openfile +from .. 
import utils as u + +logger = logging.getLogger(__name__) +logging.basicConfig(format="%(asctime)s %(levelname)s %(message)s", level=logging.INFO) + + +def ensure_single_unzipped(tmpfiles, outfile): + """ + Sometimes it makes things easier in downstream code to assume there's + a single uncompressed file to work with. + """ + all_gzipped = all([u.is_gzipped(i) for i in tmpfiles]) + none_gzipped = all([not u.is_gzipped(i) for i in tmpfiles]) + + if all_gzipped: + shell("zcat {tmpfiles} > {outfile}") + return outfile + + elif none_gzipped: + shell("cat {tmpfiles} > {outfile}") + return outfile + else: + raise ValueError("Mixture of compressed and uncompressed files") + + +def _patterns(include_patterns, exclude_patterns): + """ + Return a function that will include/exclude strings based on the patterns + provided. + """ + + if include_patterns and exclude_patterns: + raise ValueError("include_patterns and exclude_patterns are mutually exclusive") + patterns = [] + if include_patterns: + for p in include_patterns: + patterns.append(re.compile(p)) + + def keep(s): + for p in patterns: + if p.search(s): + logger.info(f"Keeping {s} because it matches {p}") + return True + return False + + elif exclude_patterns: + for p in exclude_patterns: + patterns.append(re.compile(p)) + + def keep(s): + for p in patterns: + if p.search(s): + logger.info(f"Excluding {s} because it matches {p}") + return False + return True + + else: + raise ValueError( + "Expecting exactly one of include_patterns or exclude_patterns" + ) + + return keep + + +def filter_fasta_chroms( + tmpfiles, outfile, include_patterns=None, exclude_patterns=None +): + # samtools won't work with gzip (only bgzip) files, so the lowest common + # denominator is to use uncompressed. 
+ working_file = ensure_single_unzipped(tmpfiles, outfile + ".tmp") + if include_patterns and exclude_patterns: + raise ValueError("include_patterns and exclude_patterns are mutually exclusive") + + logger.info(f"Finding chrom names and putting them in {working_file}.record_names") + shell( + 'grep ">" {working_file} | cut -f1 -d " " | sed "s/>//g" > {working_file}.record_names' + ) + + keep = _patterns(include_patterns, exclude_patterns) + with open(outfile + ".keep", "w") as fout, open( + working_file + ".record_names", "r" + ) as fin: + for line in fin: + line = line.replace(">", "").strip() + chrom = line.split()[0] + if keep(chrom): + fout.write(chrom + "\n") + shell("samtools faidx -r {outfile}.keep {working_file} | bgzip -c > {outfile}") + # shell("rm {outfile}.tmp {outfile}.tmp.fai {outfile}.keep") + shell("rm {tmpfiles}") + + +def filter_gtf_chroms(tmpfiles, outfile, include_patterns=None, exclude_patterns=None): + working_file = ensure_single_unzipped(tmpfiles, outfile + ".tmp") + keep = _patterns(include_patterns, exclude_patterns) + with gzip.open(outfile, "wt") as fout: + for feature in gffutils.DataIterator(working_file): + if keep(feature.chrom): + fout.write(str(feature) + "\n") + shell("rm {tmpfiles}") def extract_from_zip(tmpfiles, outfile, path_in_zip): From b4870cd8a28fc40b33e6cc1defb62b50ecef7080 Mon Sep 17 00:00:00 2001 From: Ryan Dale <115406+daler@users.noreply.github.com> Date: Tue, 14 Oct 2025 22:20:06 +0000 Subject: [PATCH 142/196] add reference config templates for human --- .../Homo_sapiens/GENCODE.yaml | 10 + .../Homo_sapiens/GENCODE_v19.yaml | 176 ++++++++++++++++++ .../Homo_sapiens/GRCh37.yaml | 1 + .../Homo_sapiens/hg19.yaml | 1 + 4 files changed, 188 insertions(+) create mode 100644 include/reference_config_templates/Homo_sapiens/GENCODE.yaml create mode 100644 include/reference_config_templates/Homo_sapiens/GENCODE_v19.yaml create mode 120000 include/reference_config_templates/Homo_sapiens/GRCh37.yaml create mode 120000 
include/reference_config_templates/Homo_sapiens/hg19.yaml diff --git a/include/reference_config_templates/Homo_sapiens/GENCODE.yaml b/include/reference_config_templates/Homo_sapiens/GENCODE.yaml new file mode 100644 index 00000000..dd4ae34f --- /dev/null +++ b/include/reference_config_templates/Homo_sapiens/GENCODE.yaml @@ -0,0 +1,10 @@ +# This config is intended to always point to the latest GENCODE version. If +# there is a newer version, please update and submit a pull request. +# +# https://www.gencodegenes.org/human/ +# +genome: + url: "https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_49/GRCh38.primary_assembly.genome.fa.gz" + +annotation: + url: "https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_49/gencode.v49.primary_assembly.annotation.gtf.gz" diff --git a/include/reference_config_templates/Homo_sapiens/GENCODE_v19.yaml b/include/reference_config_templates/Homo_sapiens/GENCODE_v19.yaml new file mode 100644 index 00000000..c2e0bf6a --- /dev/null +++ b/include/reference_config_templates/Homo_sapiens/GENCODE_v19.yaml @@ -0,0 +1,176 @@ +# This is the last GENCODE release for hg19 / GRCh37. See +# https://www.gencodegenes.org/human/release_19.html. +# +# A primary assembly is not available like it is for GRCh38, so we make one by +# selecting the main chromosomes and unassembled contigs. It's not obvious +# which ones are the unassembled contigs, but the original fasta file has +# space-separated record names like this: +# +# >chr20 20 +# >chr21 21 +# >chr22 22 +# >chrX X +# >chrY Y +# >chrM MT +# >GL877870.2 HG1007_PATCH +# >GL877872.1 HG1032_PATCH +# >GL383535.1 HG104_HG975_PATCH +# >JH159133.1 HG1063_PATCH +# +# Spot-checking the entries, those that have PATCH in the line are assembly +# patches; those with HSCHR and HG*TEST are alt loci. None of those should be +# in a primary assembly. 
So the "include_pattern" list below was obtained with +# the following command: +# +# zcat GRCh37.p13.genome.fa.gz \ +# | grep -Ev "HS|PATCH|HG" \ +# | cut -f1 -d " " \ +# | sed "s/>//g" +# +# Spot-checking the remaining non-chr, they do all appear to be unassembled +# contigs, which we do want. +# +# So we can use this list of chroms to filter both the fasta as well as the gtf. +# +genome: + url: "https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_19/GRCh37.p13.genome.fa.gz" + postprocess: + function: "lib.postprocess.utils.filter_fasta_chroms" + kwargs: + include_patterns: + - chr.* + - GL000191.1 + - GL000192.1 + - GL000193.1 + - GL000194.1 + - GL000195.1 + - GL000196.1 + - GL000197.1 + - GL000198.1 + - GL000199.1 + - GL000200.1 + - GL000201.1 + - GL000202.1 + - GL000203.1 + - GL000204.1 + - GL000205.1 + - GL000206.1 + - GL000207.1 + - GL000208.1 + - GL000209.1 + - GL000210.1 + - GL000211.1 + - GL000212.1 + - GL000213.1 + - GL000214.1 + - GL000215.1 + - GL000216.1 + - GL000217.1 + - GL000218.1 + - GL000219.1 + - GL000220.1 + - GL000221.1 + - GL000222.1 + - GL000223.1 + - GL000224.1 + - GL000225.1 + - GL000226.1 + - GL000227.1 + - GL000228.1 + - GL000229.1 + - GL000230.1 + - GL000231.1 + - GL000232.1 + - GL000233.1 + - GL000234.1 + - GL000235.1 + - GL000236.1 + - GL000237.1 + - GL000238.1 + - GL000239.1 + - GL000240.1 + - GL000241.1 + - GL000242.1 + - GL000243.1 + - GL000244.1 + - GL000245.1 + - GL000246.1 + - GL000247.1 + - GL000248.1 + - GL000249.1 + + +annotation: + url: "https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_19/gencode.v19.chr_patch_hapl_scaff.annotation.gtf.gz" + postprocess: + function: "lib.postprocess.utils.filter_gtf_chroms" + kwargs: + include_patterns: + - chr.* + - GL000191.1 + - GL000192.1 + - GL000193.1 + - GL000194.1 + - GL000195.1 + - GL000196.1 + - GL000197.1 + - GL000198.1 + - GL000199.1 + - GL000200.1 + - GL000201.1 + - GL000202.1 + - GL000203.1 + - GL000204.1 + - GL000205.1 + - GL000206.1 + - 
GL000207.1 + - GL000208.1 + - GL000209.1 + - GL000210.1 + - GL000211.1 + - GL000212.1 + - GL000213.1 + - GL000214.1 + - GL000215.1 + - GL000216.1 + - GL000217.1 + - GL000218.1 + - GL000219.1 + - GL000220.1 + - GL000221.1 + - GL000222.1 + - GL000223.1 + - GL000224.1 + - GL000225.1 + - GL000226.1 + - GL000227.1 + - GL000228.1 + - GL000229.1 + - GL000230.1 + - GL000231.1 + - GL000232.1 + - GL000233.1 + - GL000234.1 + - GL000235.1 + - GL000236.1 + - GL000237.1 + - GL000238.1 + - GL000239.1 + - GL000240.1 + - GL000241.1 + - GL000242.1 + - GL000243.1 + - GL000244.1 + - GL000245.1 + - GL000246.1 + - GL000247.1 + - GL000248.1 + - GL000249.1 + +rrna: + url: + - 'https://www.arb-silva.de/fileadmin/silva_databases/release_138.2/Exports/SILVA_128_LSURef_tax_silva_trunc.fasta.gz' + - 'https://www.arb-silva.de/fileadmin/silva_databases/release_138.2/Exports/SILVA_128_SSURef_Nr99_tax_silva_trunc.fasta.gz' + postprocess: + function: 'lib.utils.filter_fastas' + args: 'Homo sapiens' diff --git a/include/reference_config_templates/Homo_sapiens/GRCh37.yaml b/include/reference_config_templates/Homo_sapiens/GRCh37.yaml new file mode 120000 index 00000000..99b7f940 --- /dev/null +++ b/include/reference_config_templates/Homo_sapiens/GRCh37.yaml @@ -0,0 +1 @@ +GENCODE_v19.yaml \ No newline at end of file diff --git a/include/reference_config_templates/Homo_sapiens/hg19.yaml b/include/reference_config_templates/Homo_sapiens/hg19.yaml new file mode 120000 index 00000000..99b7f940 --- /dev/null +++ b/include/reference_config_templates/Homo_sapiens/hg19.yaml @@ -0,0 +1 @@ +GENCODE_v19.yaml \ No newline at end of file From 40a64ff3ebaa29327218ba390f606f87cc67c2a3 Mon Sep 17 00:00:00 2001 From: Ryan Dale <115406+daler@users.noreply.github.com> Date: Tue, 14 Oct 2025 22:35:41 +0000 Subject: [PATCH 143/196] verbose arg for filtering --- lib/postprocess/utils.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/lib/postprocess/utils.py b/lib/postprocess/utils.py 
index 18fa5296..1963fa76 100644 --- a/lib/postprocess/utils.py +++ b/lib/postprocess/utils.py @@ -4,6 +4,7 @@ import re import sys import tempfile +import shutil import zipfile import gffutils @@ -38,7 +39,7 @@ def ensure_single_unzipped(tmpfiles, outfile): raise ValueError("Mixture of compressed and uncompressed files") -def _patterns(include_patterns, exclude_patterns): +def _patterns(include_patterns, exclude_patterns, verbose=False): """ Return a function that will include/exclude strings based on the patterns provided. @@ -51,10 +52,11 @@ def _patterns(include_patterns, exclude_patterns): for p in include_patterns: patterns.append(re.compile(p)) - def keep(s): + def keep(s): for p in patterns: if p.search(s): - logger.info(f"Keeping {s} because it matches {p}") + if verbose: + logger.info(f"Keeping {s} because it matches {p}") return True return False @@ -65,7 +67,8 @@ def keep(s): def keep(s): for p in patterns: if p.search(s): - logger.info(f"Excluding {s} because it matches {p}") + if verbose: + logger.info(f"Excluding {s} because it matches {p}") return False return True @@ -107,7 +110,7 @@ def filter_fasta_chroms( def filter_gtf_chroms(tmpfiles, outfile, include_patterns=None, exclude_patterns=None): working_file = ensure_single_unzipped(tmpfiles, outfile + ".tmp") - keep = _patterns(include_patterns, exclude_patterns) + keep = _patterns(include_patterns, exclude_patterns, verbose=False) with gzip.open(outfile, "wt") as fout: for feature in gffutils.DataIterator(working_file): if keep(feature.chrom): @@ -182,8 +185,6 @@ def match_gtf_9th(tmpfiles, outfile, strmatch, optstrand="None"): fout.write(line) -# match_gtf_9th(['/home/esnaultcm/Downloads/Rattus_norvegicus.Rnor_6.0.94.gtf.gz'], "test.gz", ['ENSRNOG00000046319'], '-') - def convert_gtf_chroms(tmpfiles, outfile, conv_table): """ From 999e12299fcc151c5be92d18e696927c6e8b98e9 Mon Sep 17 00:00:00 2001 From: Ryan Dale <115406+daler@users.noreply.github.com> Date: Fri, 17 Oct 2025 02:37:04 +0000 Subject: 
[PATCH 144/196] updates to decision log --- docs/decisions.rst | 97 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 97 insertions(+) diff --git a/docs/decisions.rst b/docs/decisions.rst index 5dcb3460..117deca7 100644 --- a/docs/decisions.rst +++ b/docs/decisions.rst @@ -448,3 +448,100 @@ understand the complete workflow. Taken together, it made more sense to eliminate the references workflow entirely, and port the rules to the respective workflows. + +featureCounts all-in-one or individually +---------------------------------------- + +featureCounts can accept a list of BAMs and run everything in one shot, or can +be run once per sample and then manually aggregated later. Previously, we +provided all BAMs. However, for paired-end BAMs, featureCounts will internally +name sort each BAM before counting. It does this serially. The result is +possibly substantial memory usage and a lot of time. + +One approach could be to temporarily name-sort BAMs in a separate rule, +conditional on paired-end reads, and the featureCounts rule would need to have +conditional input filenames as well. This adds a little bit of complexity for +the benefit of being able to more finely control resource usage. Another +approach would be to run featureCounts independently on each BAM, allowing it +to name-sort each one independently in parallel, and then manually aggregate +the featureCounts output of each. + +Since the conditional inclusion of a namesorted rule was straightforward (a +matter of choosing the input file for the featureCounts rule), it made the most +sense to run featureCounts once, providing it all samples. + +Selection of reference genomes and annotations +---------------------------------------------- + +Where possible, we select "primary" assemblies -- those with the canonical +chromosomes and unassembled contigs (scaffolds) but NOT haplotypes, alternate +loci, or assembly patches. + +`Heng Li's blog post +`__ on +the subject is a useful guideline.
To summarize, we want to exclude alt contigs +/ haplotypes because they may create multimapping issues, and we want to +include unassembled contigs because excluding them will artificially decrease +alignment percentage. + +Since lcdb-wf is intended to be used with arbitrary organisms, the PAR and +mitochondrial sequences mentioned there are not relevant in general. + +Ideally, we would have a tool that, given the URLs for raw fastq and gtf, + +1. Displays the set of chromosomes +2. Infers if there are any that look like rDNA or mtDNA +3. Ensures the GTF matches the fasta match chromosomes +4. Accepts a template config to assess to process + + +Annotations +----------- + +We use the most comprehensive annotations. For human and mouse, this is the +GENCODE "comprehensive" annotation for the primary assembly, which will include +many more than just protein-coding transcripts. For example, here are the +frequencies of ``transcript_type`` values in GENCODE v19's comprehensive +annotation: + +:: + + 1726632 protein_coding + 214952 nonsense_mediated_decay + 154780 processed_transcript + 135772 retained_intron + 54584 lincRNA + 44207 antisense + 22976 processed_pseudogene + 15313 pseudogene + 11202 unprocessed_pseudogene + 9477 miRNA + 7090 transcribed_unprocessed_pseudogene + 6149 misc_RNA + 5783 snRNA + 4521 snoRNA + 3148 sense_intronic + 1662 polymorphic_pseudogene + 1610 rRNA + 1430 unitary_pseudogene + 1417 sense_overlapping + 1117 IG_V_gene + 1091 transcribed_processed_pseudogene + 1035 non_stop_decay + 755 TR_V_gene + 681 IG_V_pseudogene + 300 TR_J_gene + 185 IG_C_gene + 152 IG_D_gene + 100 3prime_overlapping_ncrna + 99 TR_V_pseudogene + 80 IG_J_gene + 66 Mt_tRNA + 56 TR_C_gene + 36 IG_C_pseudogene + 12 TR_J_pseudogene + 12 TR_D_gene + 9 IG_J_pseudogene + 6 Mt_rRNA + 3 translated_processed_pseudogene + From 98a75b2727bd7ec1495d22fbc691505623518e05 Mon Sep 17 00:00:00 2001 From: Ryan Dale <115406+daler@users.noreply.github.com> Date: Fri, 17 Oct 2025 14:18:22 +0000 
Subject: [PATCH 145/196] add gencode_m25 config for mouse --- .../Mus_musculus/GENCODE_M25.yaml | 20 +++++++++++++++++++ 1 file changed, 20 insertions(+) create mode 100644 include/reference_config_templates/Mus_musculus/GENCODE_M25.yaml diff --git a/include/reference_config_templates/Mus_musculus/GENCODE_M25.yaml b/include/reference_config_templates/Mus_musculus/GENCODE_M25.yaml new file mode 100644 index 00000000..b959d3a5 --- /dev/null +++ b/include/reference_config_templates/Mus_musculus/GENCODE_M25.yaml @@ -0,0 +1,20 @@ +# This is the latest GENCODE release for GRCm38/mm10. +# +# Primary assembly and associated annotations are directly available from GENCODE, +# https://www.gencodegenes.org/mouse/release_M25.html + +species: "Mus musculus" + +genome: + url: "https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_mouse/release_M25/GRCm38.primary_assembly.genome.fa.gz" + +annotation: + url: "https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_mouse/release_M25/gencode.vM25.primary_assembly.annotation.gtf.gz" + +rrna: + url: + - 'https://www.arb-silva.de/fileadmin/silva_databases/release_138.2/Exports/SILVA_128_LSURef_tax_silva_trunc.fasta.gz' + - 'https://www.arb-silva.de/fileadmin/silva_databases/release_138.2/Exports/SILVA_128_SSURef_Nr99_tax_silva_trunc.fasta.gz' + postprocess: + function: 'lib.utils.filter_fastas' + args: 'Mus musculus' From a0610ed61a4c2aba40c49fd47f2ad4f5371e1c68 Mon Sep 17 00:00:00 2001 From: Ryan Dale <115406+daler@users.noreply.github.com> Date: Sun, 19 Oct 2025 00:32:15 +0000 Subject: [PATCH 146/196] move postprocess.utils to postprocess --- .../Homo_sapiens/GENCODE.yaml | 2 + .../Homo_sapiens/GENCODE_v19.yaml | 4 +- .../Mus_musculus/GENCODE_M25.yaml | 3 + lib/postprocess/__init__.py | 280 ++++++++++++++++++ lib/postprocess/utils.py | 275 ----------------- lib/utils.py | 2 - 6 files changed, 287 insertions(+), 279 deletions(-) delete mode 100644 lib/postprocess/utils.py diff --git 
a/include/reference_config_templates/Homo_sapiens/GENCODE.yaml b/include/reference_config_templates/Homo_sapiens/GENCODE.yaml index dd4ae34f..86d0538d 100644 --- a/include/reference_config_templates/Homo_sapiens/GENCODE.yaml +++ b/include/reference_config_templates/Homo_sapiens/GENCODE.yaml @@ -5,6 +5,8 @@ # genome: url: "https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_49/GRCh38.primary_assembly.genome.fa.gz" + postprocess: lib.postprocess.default annotation: url: "https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_49/gencode.v49.primary_assembly.annotation.gtf.gz" + postprocess: lib.postprocess.default diff --git a/include/reference_config_templates/Homo_sapiens/GENCODE_v19.yaml b/include/reference_config_templates/Homo_sapiens/GENCODE_v19.yaml index c2e0bf6a..f32dc7e9 100644 --- a/include/reference_config_templates/Homo_sapiens/GENCODE_v19.yaml +++ b/include/reference_config_templates/Homo_sapiens/GENCODE_v19.yaml @@ -35,7 +35,7 @@ genome: url: "https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_19/GRCh37.p13.genome.fa.gz" postprocess: - function: "lib.postprocess.utils.filter_fasta_chroms" + function: "lib.postprocess.filter_fasta_chroms" kwargs: include_patterns: - chr.* @@ -103,7 +103,7 @@ genome: annotation: url: "https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_19/gencode.v19.chr_patch_hapl_scaff.annotation.gtf.gz" postprocess: - function: "lib.postprocess.utils.filter_gtf_chroms" + function: "lib.postprocess.filter_gtf_chroms" kwargs: include_patterns: - chr.* diff --git a/include/reference_config_templates/Mus_musculus/GENCODE_M25.yaml b/include/reference_config_templates/Mus_musculus/GENCODE_M25.yaml index b959d3a5..a077e399 100644 --- a/include/reference_config_templates/Mus_musculus/GENCODE_M25.yaml +++ b/include/reference_config_templates/Mus_musculus/GENCODE_M25.yaml @@ -7,9 +7,12 @@ species: "Mus musculus" genome: url: 
"https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_mouse/release_M25/GRCm38.primary_assembly.genome.fa.gz" + postprocess: lib.postprocess.default + annotation: url: "https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_mouse/release_M25/gencode.vM25.primary_assembly.annotation.gtf.gz" + postprocess: lib.postprocess.default rrna: url: diff --git a/lib/postprocess/__init__.py b/lib/postprocess/__init__.py index b6e690fd..3d7fbbfe 100644 --- a/lib/postprocess/__init__.py +++ b/lib/postprocess/__init__.py @@ -1 +1,281 @@ +import gzip +import logging +import os +import re +import sys +import tempfile +import shutil +import zipfile + +import gffutils +import pandas as pd +from snakemake.shell import shell + +here = os.path.dirname(os.path.abspath(__file__)) +sys.path.insert(0, os.path.join(here, "../../lib")) +from .. import utils as u + from . import * + +logger = logging.getLogger(__name__) +logging.basicConfig(format="%(asctime)s %(levelname)s %(message)s", level=logging.INFO) + + +def default(origfn, newfn): + shell("mv {origfn} {newfn}") + + +def ensure_single_unzipped(tmpfiles, outfile): + """ + Sometimes it makes things easier in downstream code to assume there's + a single uncompressed file to work with. + """ + all_gzipped = all([u.is_gzipped(i) for i in tmpfiles]) + none_gzipped = all([not u.is_gzipped(i) for i in tmpfiles]) + + if all_gzipped: + shell("zcat {tmpfiles} > {outfile}") + return outfile + + elif none_gzipped: + shell("cat {tmpfiles} > {outfile}") + return outfile + + else: + raise ValueError("Mixture of compressed and uncompressed files") + + +def _patterns(include_patterns, exclude_patterns, verbose=False): + """ + Return a function that will include/exclude strings based on the patterns + provided. 
+ """ + + if include_patterns and exclude_patterns: + raise ValueError("include_patterns and exclude_patterns are mutually exclusive") + patterns = [] + if include_patterns: + for p in include_patterns: + patterns.append(re.compile(p)) + + def keep(s): + for p in patterns: + if p.search(s): + if verbose: + logger.info(f"Keeping {s} because it matches {p}") + return True + return False + + elif exclude_patterns: + for p in exclude_patterns: + patterns.append(re.compile(p)) + + def keep(s): + for p in patterns: + if p.search(s): + if verbose: + logger.info(f"Excluding {s} because it matches {p}") + return False + return True + + else: + raise ValueError( + "Expecting exactly one of include_patterns or exclude_patterns" + ) + + return keep + + +def filter_fasta_chroms( + tmpfiles, outfile, include_patterns=None, exclude_patterns=None +): + # samtools won't work with gzip (only bgzip) files, so the lowest common + # denominator is to use uncompressed. + working_file = ensure_single_unzipped(tmpfiles, outfile + ".tmp") + if include_patterns and exclude_patterns: + raise ValueError("include_patterns and exclude_patterns are mutually exclusive") + + logger.info(f"Finding chrom names and putting them in {working_file}.record_names") + shell( + 'grep ">" {working_file} | cut -f1 -d " " | sed "s/>//g" > {working_file}.record_names' + ) + + keep = _patterns(include_patterns, exclude_patterns) + with open(outfile + ".keep", "w") as fout, open( + working_file + ".record_names", "r" + ) as fin: + for line in fin: + line = line.replace(">", "").strip() + chrom = line.split()[0] + if keep(chrom): + fout.write(chrom + "\n") + shell("samtools faidx -r {outfile}.keep {working_file} | bgzip -c > {outfile}") + # shell("rm {outfile}.tmp {outfile}.tmp.fai {outfile}.keep") + shell("rm {tmpfiles}") + + +def filter_gtf_chroms(tmpfiles, outfile, include_patterns=None, exclude_patterns=None): + working_file = ensure_single_unzipped(tmpfiles, outfile + ".tmp") + keep = 
_patterns(include_patterns, exclude_patterns, verbose=False) + with gzip.open(outfile, "wt") as fout: + for feature in gffutils.DataIterator(working_file): + if keep(feature.chrom): + fout.write(str(feature) + "\n") + shell("rm {tmpfiles}") + + +def extract_from_zip(tmpfiles, outfile, path_in_zip): + """ + Parameters + ---------- + + tmpfiles : list + One-item list containing zip file + + outfile : str + gzipped output file to create + + path_in_zip : str + Path within zipfile to extract. You can identify the path using unzip + -l x.zip from bash. + """ + assert len(tmpfiles) == 1, f"expected single zip file, got {tmpfiles}" + + extraction_dir = tempfile.mkdtemp() + + with zipfile.ZipFile(tmpfiles[0], "r") as z: + z.extract(path_in_zip, path=extraction_dir) + + full_path_to_extracted = os.path.join(extraction_dir, path_in_zip) + + with open(full_path_to_extracted, "rb") as fin: + with gzip.open(outfile, "wb") as fout: + shutil.copyfileobj(fin, fout) + + shutil.rmtree(extraction_dir) + + +def match_gtf_9th(tmpfiles, outfile, strmatch, optstrand="None"): + """ + Matches string to the 9th field of GTF and an optional strand that defaults to None; + if the pattern is found and the provided strand match then the line is excluded + + Parameters + ---------- + tmpfiles : str + GTF files + + outfile : str + gzipped output GTF file + + strmatch : list + List of strings to match in the 9th field of the GTF. Must be list + + optstrand : str + String to match to the strand. Default is None + """ + regex_strmatch = re.compile(r"|".join(strmatch)) + + with gzip.open(outfile, "wt") as fout: + for tmpfn in tmpfiles: + with openfile(tmpfn, "rt") as tmp: + for line in tmp: + if line.startswith("#"): + fout.write(line) + else: + toks = line.split("\t") + if not ( + regex_strmatch.search(toks[8]) != None + and toks[6] == optstrand + ): + fout.write(line) + + + +def convert_gtf_chroms(tmpfiles, outfile, conv_table): + """ + Convert chrom names in GTF file according to conversion table. 
+ + Parameters + ---------- + tmpfiles : str + GTF files to look through + + outfile : str + gzipped output GTF file + + conv_table : str + Lookup table file for the chromosome name conversion. Uses pandas to + read lookup table, so it can be file://, a path relative to the + snakefile, or an http://, https://, or ftp:// URL. + """ + lookup = ( + pd.read_csv(conv_table, sep="\t", header=None, names=("a", "b")) + .set_index("a")["b"] + .to_dict() + ) + + with gzip.open(outfile, "wt") as fout: + for tmpfn in tmpfiles: + with openfile(tmpfn, "rt") as tmp: + for line in tmp: + if not line.startswith("#"): + toks = line.split("\t") + chrom = toks[0] + if chrom in lookup.keys(): + toks[0] = lookup[chrom] + line = "\t".join(toks) + else: + raise ValueError( + 'Chromosome "{chrom}" not found in conversion table ' + '"{conv_table}"'.format( + chrom=chrom, conv_table=conv_table + ) + ) + fout.write(line) + + +def convert_fasta_chroms(tmpfiles, outfile, conv_table): + """ + Convert chrom names in fasta file according to conversion table. + + Parameters + ---------- + tmpfiles : str + fasta files to look through + + outfile : str + gzipped output fasta file + + conv_table : str + Lookup table file for the chromosome name conversion. Uses pandas to + read lookup table, so it can be file://, a path relative to the + snakefile, or an http://, https://, or ftp:// URL. 
+ """ + + lookup = ( + pd.read_csv(conv_table, sep="\t", header=None, names=("a", "b")) + .set_index("a")["b"] + .to_dict() + ) + + with gzip.open(outfile, "wt") as fout: + for tmpfn in tmpfiles: + with openfile(tmpfn, "rt") as tmp: + for line in tmp: + if line.startswith(">"): + line = line.rstrip("\n") + toks = line.split(" ") + chrom = toks[0].lstrip(">") + chrom = chrom.rstrip("\n") + if chrom in lookup.keys(): + toks[0] = ">" + lookup[chrom] + line = " ".join(toks) + "\n" + else: + raise ValueError( + 'Chromosome "{chrom}" not found in conversion table ' + '"{conv_table}"'.format( + chrom=chrom, conv_table=conv_table + ) + ) + fout.write(line) diff --git a/lib/postprocess/utils.py b/lib/postprocess/utils.py deleted file mode 100644 index 1963fa76..00000000 --- a/lib/postprocess/utils.py +++ /dev/null @@ -1,275 +0,0 @@ -import gzip -import logging -import os -import re -import sys -import tempfile -import shutil -import zipfile - -import gffutils -import pandas as pd -from snakemake.shell import shell - -here = os.path.dirname(os.path.abspath(__file__)) -sys.path.insert(0, os.path.join(here, "../../lib")) -from .. import utils as u - -logger = logging.getLogger(__name__) -logging.basicConfig(format="%(asctime)s %(levelname)s %(message)s", level=logging.INFO) - - -def ensure_single_unzipped(tmpfiles, outfile): - """ - Sometimes it makes things easier in downstream code to assume there's - a single uncompressed file to work with. - """ - all_gzipped = all([u.is_gzipped(i) for i in tmpfiles]) - none_gzipped = all([not u.is_gzipped(i) for i in tmpfiles]) - - if all_gzipped: - shell("zcat {tmpfiles} > {outfile}") - return outfile - - elif none_gzipped: - shell("cat {tmpfiles} > {outfile}") - return outfile - - else: - raise ValueError("Mixture of compressed and uncompressed files") - - -def _patterns(include_patterns, exclude_patterns, verbose=False): - """ - Return a function that will include/exclude strings based on the patterns - provided. 
- """ - - if include_patterns and exclude_patterns: - raise ValueError("include_patterns and exclude_patterns are mutually exclusive") - patterns = [] - if include_patterns: - for p in include_patterns: - patterns.append(re.compile(p)) - - def keep(s): - for p in patterns: - if p.search(s): - if verbose: - logger.info(f"Keeping {s} because it matches {p}") - return True - return False - - elif exclude_patterns: - for p in exclude_patterns: - patterns.append(re.compile(p)) - - def keep(s): - for p in patterns: - if p.search(s): - if verbose: - logger.info(f"Excluding {s} because it matches {p}") - return False - return True - - else: - raise ValueError( - "Expecting exactly one of include_patterns or exclude_patterns" - ) - - return keep - - -def filter_fasta_chroms( - tmpfiles, outfile, include_patterns=None, exclude_patterns=None -): - # samtools won't work with gzip (only bgzip) files, so the lowest common - # denominator is to use uncompressed. - working_file = ensure_single_unzipped(tmpfiles, outfile + ".tmp") - if include_patterns and exclude_patterns: - raise ValueError("include_patterns and exclude_patterns are mutually exclusive") - - logger.info(f"Finding chrom names and putting them in {working_file}.record_names") - shell( - 'grep ">" {working_file} | cut -f1 -d " " | sed "s/>//g" > {working_file}.record_names' - ) - - keep = _patterns(include_patterns, exclude_patterns) - with open(outfile + ".keep", "w") as fout, open( - working_file + ".record_names", "r" - ) as fin: - for line in fin: - line = line.replace(">", "").strip() - chrom = line.split()[0] - if keep(chrom): - fout.write(chrom + "\n") - shell("samtools faidx -r {outfile}.keep {working_file} | bgzip -c > {outfile}") - # shell("rm {outfile}.tmp {outfile}.tmp.fai {outfile}.keep") - shell("rm {tmpfiles}") - - -def filter_gtf_chroms(tmpfiles, outfile, include_patterns=None, exclude_patterns=None): - working_file = ensure_single_unzipped(tmpfiles, outfile + ".tmp") - keep = 
_patterns(include_patterns, exclude_patterns, verbose=False) - with gzip.open(outfile, "wt") as fout: - for feature in gffutils.DataIterator(working_file): - if keep(feature.chrom): - fout.write(str(feature) + "\n") - shell("rm {tmpfiles}") - - -def extract_from_zip(tmpfiles, outfile, path_in_zip): - """ - Parameters - ---------- - - tmpfiles : list - One-item list containing zip file - - outfile : str - gzipped output file to create - - path_in_zip : str - Path within zipfile to extract. You can identify the path using unzip - -l x.zip from bash. - """ - assert len(tmpfiles) == 1, f"expected single zip file, got {tmpfiles}" - - extraction_dir = tempfile.mkdtemp() - - with zipfile.ZipFile(tmpfiles[0], "r") as z: - z.extract(path_in_zip, path=extraction_dir) - - full_path_to_extracted = os.path.join(extraction_dir, path_in_zip) - - with open(full_path_to_extracted, "rb") as fin: - with gzip.open(outfile, "wb") as fout: - shutil.copyfileobj(fin, fout) - - shutil.rmtree(extraction_dir) - - -def match_gtf_9th(tmpfiles, outfile, strmatch, optstrand="None"): - """ - Matches string to the 9th field of GTF and an optional strand that defaults to None; - if the pattern is found and the provided strand match then the line is excluded - - Parameters - ---------- - tmpfiles : str - GTF files - - outfile : str - gzipped output GTF file - - strmatch : list - List of strings to match in the 9th field of the GTF. Must be list - - optstrand : str - String to match to the strand. Default is None - """ - regex_strmatch = re.compile(r"|".join(strmatch)) - - with gzip.open(outfile, "wt") as fout: - for tmpfn in tmpfiles: - with openfile(tmpfn, "rt") as tmp: - for line in tmp: - if line.startswith("#"): - fout.write(line) - else: - toks = line.split("\t") - if not ( - regex_strmatch.search(toks[8]) != None - and toks[6] == optstrand - ): - fout.write(line) - - - -def convert_gtf_chroms(tmpfiles, outfile, conv_table): - """ - Convert chrom names in GTF file according to conversion table. 
- - Parameters - ---------- - tmpfiles : str - GTF files to look through - - outfile : str - gzipped output GTF file - - conv_table : str - Lookup table file for the chromosome name conversion. Uses pandas to - read lookup table, so it can be file://, a path relative to the - snakefile, or an http://, https://, or ftp:// URL. - """ - lookup = ( - pd.read_csv(conv_table, sep="\t", header=None, names=("a", "b")) - .set_index("a")["b"] - .to_dict() - ) - - with gzip.open(outfile, "wt") as fout: - for tmpfn in tmpfiles: - with openfile(tmpfn, "rt") as tmp: - for line in tmp: - if not line.startswith("#"): - toks = line.split("\t") - chrom = toks[0] - if chrom in lookup.keys(): - toks[0] = lookup[chrom] - line = "\t".join(toks) - else: - raise ValueError( - 'Chromosome "{chrom}" not found in conversion table ' - '"{conv_table}"'.format( - chrom=chrom, conv_table=conv_table - ) - ) - fout.write(line) - - -def convert_fasta_chroms(tmpfiles, outfile, conv_table): - """ - Convert chrom names in fasta file according to conversion table. - - Parameters - ---------- - tmpfiles : str - fasta files to look through - - outfile : str - gzipped output fasta file - - conv_table : str - Lookup table file for the chromosome name conversion. Uses pandas to - read lookup table, so it can be file://, a path relative to the - snakefile, or an http://, https://, or ftp:// URL. 
- """ - - lookup = ( - pd.read_csv(conv_table, sep="\t", header=None, names=("a", "b")) - .set_index("a")["b"] - .to_dict() - ) - - with gzip.open(outfile, "wt") as fout: - for tmpfn in tmpfiles: - with openfile(tmpfn, "rt") as tmp: - for line in tmp: - if line.startswith(">"): - line = line.rstrip("\n") - toks = line.split(" ") - chrom = toks[0].lstrip(">") - chrom = chrom.rstrip("\n") - if chrom in lookup.keys(): - toks[0] = ">" + lookup[chrom] - line = " ".join(toks) + "\n" - else: - raise ValueError( - 'Chromosome "{chrom}" not found in conversion table ' - '"{conv_table}"'.format( - chrom=chrom, conv_table=conv_table - ) - ) - fout.write(line) diff --git a/lib/utils.py b/lib/utils.py index b74fc7e4..0ae3edc7 100644 --- a/lib/utils.py +++ b/lib/utils.py @@ -783,8 +783,6 @@ def twobit_to_fasta(tmpfiles, outfile): shell("cat {fastas} | gzip -c > {outfile}") shell("rm {fastas}") -def default_postprocess(origfn, newfn): - shell("mv {origfn} {newfn}") def download_and_postprocess(urls, postprocess, outfile, log): """ From 999fde27a5ccd7d11be894f61f97389bb98a332d Mon Sep 17 00:00:00 2001 From: Ryan Dale <115406+daler@users.noreply.github.com> Date: Sun, 19 Oct 2025 00:32:32 +0000 Subject: [PATCH 147/196] match chipseq to rnaseq fasta->genome --- workflows/chipseq/Snakefile | 4 ++-- workflows/chipseq/config/config.yaml | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/workflows/chipseq/Snakefile b/workflows/chipseq/Snakefile index 8b2b9004..4f347eb7 100644 --- a/workflows/chipseq/Snakefile +++ b/workflows/chipseq/Snakefile @@ -50,8 +50,8 @@ rule fasta: mem_mb="4g", runtime="2h", params: - urls=config["fasta"]["url"], - postprocess=config["fasta"].get("postprocess", None), + urls=config["genome"]["url"], + postprocess=config["genome"].get("postprocess", None), run: utils.download_and_postprocess( urls=params.urls, diff --git a/workflows/chipseq/config/config.yaml b/workflows/chipseq/config/config.yaml index 268dcf59..d20f6e36 100644 --- 
a/workflows/chipseq/config/config.yaml +++ b/workflows/chipseq/config/config.yaml @@ -1,6 +1,6 @@ sampletable: 'config/sampletable.tsv' -fasta: +genome: url: "https://raw.githubusercontent.com/lcdb/lcdb-test-data/master/data/seq/dm6.small.fa" postprocess: 'lib.utils.gzipped' From 854c362d9ddd1bb6ee5380d24c509a9cfe96189a Mon Sep 17 00:00:00 2001 From: Ryan Dale <115406+daler@users.noreply.github.com> Date: Sun, 19 Oct 2025 02:39:41 +0000 Subject: [PATCH 148/196] minor cleanups in rnaseq snakefile --- .../Homo_sapiens/GENCODE.yaml | 2 ++ .../Homo_sapiens/GENCODE_v19.yaml | 11 +++----- .../Mus_musculus/GENCODE_M25.yaml | 10 +------ lib/utils.py | 7 ++--- workflows/rnaseq/Snakefile | 26 ++++++++++++++----- 5 files changed, 29 insertions(+), 27 deletions(-) diff --git a/include/reference_config_templates/Homo_sapiens/GENCODE.yaml b/include/reference_config_templates/Homo_sapiens/GENCODE.yaml index 86d0538d..507877bb 100644 --- a/include/reference_config_templates/Homo_sapiens/GENCODE.yaml +++ b/include/reference_config_templates/Homo_sapiens/GENCODE.yaml @@ -3,6 +3,8 @@ # # https://www.gencodegenes.org/human/ # +organism: "Homo sapiens" + genome: url: "https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_49/GRCh38.primary_assembly.genome.fa.gz" postprocess: lib.postprocess.default diff --git a/include/reference_config_templates/Homo_sapiens/GENCODE_v19.yaml b/include/reference_config_templates/Homo_sapiens/GENCODE_v19.yaml index f32dc7e9..18648cd6 100644 --- a/include/reference_config_templates/Homo_sapiens/GENCODE_v19.yaml +++ b/include/reference_config_templates/Homo_sapiens/GENCODE_v19.yaml @@ -32,6 +32,9 @@ # # So we can use this list of chroms to filter both the fasta as well as the gtf. 
# + +organism: "Homo sapiens" + genome: url: "https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_19/GRCh37.p13.genome.fa.gz" postprocess: @@ -166,11 +169,3 @@ annotation: - GL000247.1 - GL000248.1 - GL000249.1 - -rrna: - url: - - 'https://www.arb-silva.de/fileadmin/silva_databases/release_138.2/Exports/SILVA_128_LSURef_tax_silva_trunc.fasta.gz' - - 'https://www.arb-silva.de/fileadmin/silva_databases/release_138.2/Exports/SILVA_128_SSURef_Nr99_tax_silva_trunc.fasta.gz' - postprocess: - function: 'lib.utils.filter_fastas' - args: 'Homo sapiens' diff --git a/include/reference_config_templates/Mus_musculus/GENCODE_M25.yaml b/include/reference_config_templates/Mus_musculus/GENCODE_M25.yaml index a077e399..99120cbf 100644 --- a/include/reference_config_templates/Mus_musculus/GENCODE_M25.yaml +++ b/include/reference_config_templates/Mus_musculus/GENCODE_M25.yaml @@ -3,7 +3,7 @@ # Primary assembly and associated annotations are directly available from GENCODE, # https://www.gencodegenes.org/mouse/release_M25.html -species: "Mus musculus" +organism: "Mus musculus" genome: url: "https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_mouse/release_M25/GRCm38.primary_assembly.genome.fa.gz" @@ -13,11 +13,3 @@ genome: annotation: url: "https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_mouse/release_M25/gencode.vM25.primary_assembly.annotation.gtf.gz" postprocess: lib.postprocess.default - -rrna: - url: - - 'https://www.arb-silva.de/fileadmin/silva_databases/release_138.2/Exports/SILVA_128_LSURef_tax_silva_trunc.fasta.gz' - - 'https://www.arb-silva.de/fileadmin/silva_databases/release_138.2/Exports/SILVA_128_SSURef_Nr99_tax_silva_trunc.fasta.gz' - postprocess: - function: 'lib.utils.filter_fastas' - args: 'Mus musculus' diff --git a/lib/utils.py b/lib/utils.py index 0ae3edc7..a0f90a7f 100644 --- a/lib/utils.py +++ b/lib/utils.py @@ -723,7 +723,7 @@ def strand_arg_lookup(config, lookup): return lookup[config.stranded] -def filter_fastas(tmpfiles, outfile, pattern): 
+def filter_rrna_fastas(tmpfiles, outfile, pattern): """ Extract records from fasta file(s) given a search pattern. @@ -742,7 +742,8 @@ def filter_fastas(tmpfiles, outfile, pattern): Look for this string in each record's description """ - + if pattern is None: + raise ValueError("Pattern cannot be None") def gen(): for tmp in tmpfiles: handle = gzip.open(tmp, "rt") @@ -751,7 +752,7 @@ def gen(): if pattern not in rec.description: continue rec.seq = rec.seq.back_transcribe() - rec.description = rec.name + # rec.description = rec.name yield rec with gzip.open(outfile, "wt") as fout: diff --git a/workflows/rnaseq/Snakefile b/workflows/rnaseq/Snakefile index c0574925..1e4f3ab2 100644 --- a/workflows/rnaseq/Snakefile +++ b/workflows/rnaseq/Snakefile @@ -112,12 +112,22 @@ rule rrna_fasta: mem="4g", runtime="2h", params: - urls=config["rrna"]["url"], - postprocess=config["rrna"].get("postprocess", None), + organism=config.get("organism", None), + silva_release="138.1", run: + # SILVA database fasta file with all species + urls=[ + f'https://www.arb-silva.de/fileadmin/silva_databases/release_{params.silva_release.replace('.', '_')}/Exports/SILVA_{params.silva_release}_LSURef_NR99_tax_silva.fasta.gz', + f'https://www.arb-silva.de/fileadmin/silva_databases/release_{params.silva_release.replace('.', '_')}/Exports/SILVA_{params.silva_release}_SSURef_NR99_tax_silva.fasta.gz', + ] + + # Keep only sequences for the configured organism utils.download_and_postprocess( - urls=params.urls, - postprocess=params.postprocess, + urls=urls, + postprocess={ + "function": "lib.utils.filter_rrna_fastas", + "args": params.organism, + }, outfile=output[0], log=log, ) @@ -127,7 +137,7 @@ rule unzip: input: "references/{prefix}.gz", output: - temporary("references/{prefix}"), + "references/{prefix}", resources: mem="4g", runtime="2h", @@ -176,7 +186,7 @@ rule star_index: "--runThreadN {threads} " "--genomeDir {genomedir} " "--genomeFastaFiles {input.fasta} " - # NOTE: GTF is optional + # NOTE: 
GTF is optional but highly recommended by STAR docs "--sjdbGTFfile {input.gtf} " # NOTE: STAR docs say that 100 should work well. "--sjdbOverhang 100 " @@ -198,11 +208,13 @@ rule transcriptome_fasta: gtf="references/annotation.gtf", output: "references/transcriptome.fa", + log: + "references/transcriptome.log", resources: mem="4g", runtime="2h", shell: - "gffread {input.gtf} -w {output} -g {input.fasta}" + "gffread {input.gtf} -w {output} -g {input.fasta} &> {log}" rule salmon_index: From e6ef55410aca7ad4ea122180fd31ed986cdbaa44 Mon Sep 17 00:00:00 2001 From: Ryan Dale <115406+daler@users.noreply.github.com> Date: Sun, 19 Oct 2025 02:40:08 +0000 Subject: [PATCH 149/196] more simplification of references --- workflows/rnaseq/config/config.yaml | 16 ---------------- workflows/rnaseq/run_test.sh | 2 +- 2 files changed, 1 insertion(+), 17 deletions(-) diff --git a/workflows/rnaseq/config/config.yaml b/workflows/rnaseq/config/config.yaml index 9047b4ab..2c34c6d9 100644 --- a/workflows/rnaseq/config/config.yaml +++ b/workflows/rnaseq/config/config.yaml @@ -4,19 +4,3 @@ sampletable: 'config/sampletable.tsv' stranded: 'fr-firststrand' # for dUTP libraries # 'fr-secondstrand' # for ligation libraries # 'unstranded' # for libraries without strand specificity - -genome: - url: "https://raw.githubusercontent.com/lcdb/lcdb-test-data/master/data/seq/dm6.small.fa" - postprocess: 'lib.utils.gzipped' - -annotation: - url: "https://raw.githubusercontent.com/lcdb/lcdb-test-data/master/data/annotation/dm6.small.gtf" - postprocess: 'lib.utils.gzipped' - -rrna: - url: - - 'https://www.arb-silva.de/fileadmin/silva_databases/release_128/Exports/SILVA_128_LSURef_tax_silva_trunc.fasta.gz' - - 'https://www.arb-silva.de/fileadmin/silva_databases/release_128/Exports/SILVA_128_SSURef_Nr99_tax_silva_trunc.fasta.gz' - postprocess: - function: 'lib.utils.filter_fastas' - args: 'Drosophila melanogaster' diff --git a/workflows/rnaseq/run_test.sh b/workflows/rnaseq/run_test.sh index 
7aacb413..fc76064e 100755 --- a/workflows/rnaseq/run_test.sh +++ b/workflows/rnaseq/run_test.sh @@ -1,3 +1,3 @@ set -e python -m doctest ../../ci/preprocessor.py -python ../../ci/preprocessor.py Snakefile > Snakefile.test && snakemake -s Snakefile.test "$@" +python ../../ci/preprocessor.py Snakefile > Snakefile.test && snakemake -s Snakefile.test --configfile ../../include/reference_config_templates/test.yaml "$@" From b37edbf5230bcb772e6e55e59c63088d5916aa03 Mon Sep 17 00:00:00 2001 From: Ryan Dale <115406+daler@users.noreply.github.com> Date: Sun, 19 Oct 2025 02:40:22 +0000 Subject: [PATCH 150/196] decision log updates --- docs/decisions.rst | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/docs/decisions.rst b/docs/decisions.rst index 117deca7..71d2a20b 100644 --- a/docs/decisions.rst +++ b/docs/decisions.rst @@ -545,3 +545,19 @@ annotation: 6 Mt_rRNA 3 translated_processed_pseudogene +Erring on the side of too many annotations (i.e., using the comprehensive +annotation instead of a curated version) will result in more features, which at +face value might make the FDR adjustment more harsh in DESeq2. But DESeq2's +independent filtering (not even testing those features with so few reads that +they would not reach significance) guards against this. + +Zipping/unzipping references +---------------------------- +STAR requires uncompressed FASTA and GTF files to build the +index (gffread likewise needs the uncompressed genome fasta and +GTF to extract the transcriptome). Making the uncompressed +files temporary means running the risk of another rule needing +them and re-triggering costly STAR alignment. The extra +storage cost of leaving an uncompressed fasta (~3 GB) around is +minimal compared to the scale of all other data, and guards +against inadvertently re-running all alignment jobs. 
From a82e10c85a00e3dfb56d0dfd792b74745889b4de Mon Sep 17 00:00:00 2001 From: Ryan Dale <115406+daler@users.noreply.github.com> Date: Sun, 19 Oct 2025 02:40:35 +0000 Subject: [PATCH 151/196] add test config --- include/reference_config_templates/test.yaml | 9 +++++++++ 1 file changed, 9 insertions(+) create mode 100644 include/reference_config_templates/test.yaml diff --git a/include/reference_config_templates/test.yaml b/include/reference_config_templates/test.yaml new file mode 100644 index 00000000..ceb36877 --- /dev/null +++ b/include/reference_config_templates/test.yaml @@ -0,0 +1,9 @@ +organism: 'Drosophila melanogaster' + +genome: + url: "https://raw.githubusercontent.com/lcdb/lcdb-test-data/master/data/seq/dm6.small.fa" + postprocess: 'lib.utils.gzipped' + +annotation: + url: "https://raw.githubusercontent.com/lcdb/lcdb-test-data/master/data/annotation/dm6.small.gtf" + postprocess: 'lib.utils.gzipped' From 2910b85d6319de4bb6ac37a4bbe937e5f91cb575 Mon Sep 17 00:00:00 2001 From: Ryan Dale <115406+daler@users.noreply.github.com> Date: Sun, 19 Oct 2025 02:41:16 +0000 Subject: [PATCH 152/196] snakefmt --- workflows/rnaseq/Snakefile | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/workflows/rnaseq/Snakefile b/workflows/rnaseq/Snakefile index 1e4f3ab2..6c377137 100644 --- a/workflows/rnaseq/Snakefile +++ b/workflows/rnaseq/Snakefile @@ -72,17 +72,19 @@ rule fasta: log=log, ) + rule faidx: input: - "references/genome.fa" + "references/genome.fa", output: - "references/genome.fa.fai" + "references/genome.fa.fai", resources: mem_mb="4g", runtime="2h", shell: "samtools faidx {input}" + rule annotation: output: "references/annotation.gtf.gz", @@ -116,9 +118,9 @@ rule rrna_fasta: silva_release="138.1", run: # SILVA database fasta file with all species - urls=[ - f'https://www.arb-silva.de/fileadmin/silva_databases/release_{params.silva_release.replace('.', '_')}/Exports/SILVA_{params.silva_release}_LSURef_NR99_tax_silva.fasta.gz', - 
f'https://www.arb-silva.de/fileadmin/silva_databases/release_{params.silva_release.replace('.', '_')}/Exports/SILVA_{params.silva_release}_SSURef_NR99_tax_silva.fasta.gz', + urls = [ + f"https://www.arb-silva.de/fileadmin/silva_databases/release_{params.silva_release.replace('.', '_')}/Exports/SILVA_{params.silva_release}_LSURef_NR99_tax_silva.fasta.gz", + f"https://www.arb-silva.de/fileadmin/silva_databases/release_{params.silva_release.replace('.', '_')}/Exports/SILVA_{params.silva_release}_SSURef_NR99_tax_silva.fasta.gz", ] # Keep only sequences for the configured organism From f2ddbe486602a8e432c464c1299002b13abed824 Mon Sep 17 00:00:00 2001 From: Ryan Dale <115406+daler@users.noreply.github.com> Date: Fri, 24 Oct 2025 00:38:16 +0000 Subject: [PATCH 153/196] reconfigure tests --- .circleci/config.yml | 98 +++++++++++------------ test/test_configs/test_rnaseq_config.yaml | 27 ++----- test/test_configs/test_sra_config.yaml | 5 +- 3 files changed, 54 insertions(+), 76 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index bee8727d..80767ded 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -108,6 +108,8 @@ variables: time conda env create -n $LCDBWF_ENV --file env.yml time conda env create -n $LCDBWF_ENV_R --file env-r.yml + conda env export -n $LCDBWF_ENV > /opt/miniconda/env.yml + conda env export -n $LCDBWF_ENV_R > /opt/miniconda/env.yml fi # -------------------------------------------------------------------------- @@ -135,32 +137,31 @@ variables: tree $ORIG set +x - # Separately copy over some test-specific files + # Separately copy over some test-specific files that are not part of deploying cp $ORIG/workflows/chipseq/run_test.sh $DEPLOY/workflows/chipseq/run_test.sh cp $ORIG/workflows/rnaseq/run_test.sh $DEPLOY/workflows/rnaseq/run_test.sh cp $ORIG/workflows/rnaseq/run_downstream_test.sh $DEPLOY/workflows/rnaseq/run_downstream_test.sh mkdir $DEPLOY/ci mkdir $DEPLOY/test - cp $ORIG/test/lcdb-wf-test 
$DEPLOY/test/lcdb-wf-test - cp $ORIG/test/workflow_test_params.yaml $DEPLOY/test/workflow_test_params.yaml cp $ORIG/ci/get-data.py $DEPLOY/ci/get-data.py # the ./run_test.sh scripts run this cp $ORIG/ci/preprocessor.py $DEPLOY/ci/preprocessor.py - # download example data + # Now we can download example data cd $DEPLOY - test/lcdb-wf-test data --kind=all --verbose + ci/get-data.py # -------------------------------------------------------------------------- # Run the doctests across the included modules pytest-step: &pytest-step run: - name: Run pytest suite and testthat suite + name: Run pytest suite and R testthat suite command: | source /opt/miniforge/etc/profile.d/conda.sh conda activate $LCDBWF_ENV + # run unit tests and doctests for the modules in lib test/lcdb-wf-test unit_tests --pytest @@ -183,8 +184,9 @@ variables: cd $DEPLOY/workflows/chipseq source /opt/miniforge/etc/profile.d/conda.sh conda activate $LCDBWF_ENV - $DEPLOY/test/lcdb-wf-test chipseq --run-workflow --use-conda -j2 -k -p - $DEPLOY/test/lcdb-wf-test chipseq --trackhub + cd $DEPLOY/workflows/chipseq + ./run_test.sh --use-conda -j2 -k -p + python chipseq_trackhub.py config/config.yaml config/hub_config.yaml # -------------------------------------------------------------------------- # Previous versions had an error where chipseq peaks needed to be defined for @@ -194,10 +196,9 @@ variables: run: name: chipseq misc command: | - cd $DEPLOY/workflows/chipseq source /opt/miniforge/etc/profile.d/conda.sh conda activate $LCDBWF_ENV - + cd $DEPLOY/workflows/chipseq ./run_test.sh --use-conda -j2 -k -p \ --configfile $ORIG/test/test_configs/test_chipseq_regression.yaml \ --config sampletable=$ORIG/test/test_configs/chipseq_one_run.tsv \ @@ -227,15 +228,18 @@ variables: cd $DEPLOY source /opt/miniforge/etc/profile.d/conda.sh conda activate $LCDBWF_ENV - $DEPLOY/test/lcdb-wf-test rnaseq --run-workflow -n - $DEPLOY/test/lcdb-wf-test rnaseq --run-workflow --use-conda -j2 -k -p --orig $ORIG - 
$DEPLOY/test/lcdb-wf-test rnaseq --trackhub --orig $ORIG + cd workflows/rnaseq + + ./run_test.sh -n \ + --configfile $ORIG/test/test_configs/test_rnaseq_config.yaml + + ./run_test.sh --use-conda -j2 -k -p \ + --configfile $ORIG/test/test_configs/test_rnaseq_config.yaml - # This run the preprocessor on the Rmd files and stores them - # in a new download-test directory (see the comments in the script - # for details) - $DEPLOY/test/lcdb-wf-test rnaseq --downstream + python rnaseq_trackhub.py config/config.yaml config/hub_config.yaml + + ./run_downstream_test.sh # bundle up the entire directory to be used as an artifact tar -zcf /tmp/downstream.tar.gz workflows/rnaseq/downstream-test/ @@ -256,13 +260,22 @@ variables: source /opt/miniforge/etc/profile.d/conda.sh conda activate $LCDBWF_ENV - # Check the help for test/lcdb-wf-test to see what args these - # provide; some of them use the --until argument to restrict the - # rules that are run. Note the use of --orig $ORIG to use the test - # configs from the original clone rather than the deployed directory. - $DEPLOY/test/lcdb-wf-test rnaseq --run-workflow --sra-pe -k -p -j2 --use-conda --orig $ORIG - $DEPLOY/test/lcdb-wf-test rnaseq --run-workflow --sra-se -k -p -j2 --use-conda --orig $ORIG - $DEPLOY/test/lcdb-wf-test rnaseq --run-workflow --pe -k -p -j2 --use-conda --orig $ORIG + cd workflows/rnaseq + + # SRA test + ./run_test.sh -k -p -j2 --use-conda \ + --configfile $ORIG/test/test_configs/test_rnaseq_config.yaml \ + --config sampletable=$ORIG/test/test_configs/test_sra_sampletable.tsv + + # SRA SE only + ./run_test.sh -k -p -j2 --use-conda \ + --configfile $ORIG/test/test_configs/test_rnaseq_config.yaml \ + --config sampletable=$ORIG/test/test_configs/test_sra_sampletable_SE_only.tsv + + # PE + ./run_test.sh -k -p -j2 --use-conda \ + --configfile $ORIG/test/test_configs/test_rnaseq_config.yaml \ + --config sampletable=$ORIG/test/test_configs/test_pe_sampletable.tsv @@ -305,10 +318,13 @@ jobs: # themselves. 
- *save_cache + # These files were created during conda setup, and become part of the + # cache. So we should get them as artifacts regardless of if the conda + # setup ran this time. - store_artifacts: - path: /tmp/lcdb-wf-test/env.yaml + path: /opt/miniforge/env.yml - store_artifacts: - path: /tmp/lcdb-wf-test/env-r.yaml + path: /opt/miniforge/env-r.yml pytest: <<: *defaults resource_class: small @@ -328,7 +344,7 @@ jobs: - *get-data - *chipseq-step - store_artifacts: - path: /tmp/lcdb-wf-test/workflows/chipseq/data/chipseq_aggregation/multiqc.html + path: $DEST/workflows/chipseq/data/chipseq_aggregation/multiqc.html chipseq-misc: <<: *defaults @@ -363,7 +379,6 @@ jobs: path: /tmp/gene-patterns.html destination: gene-patterns.html - rnaseq-misc: <<: *defaults steps: @@ -373,7 +388,6 @@ jobs: - *get-data - *rnaseq-misc-step - build-docs: <<: *defaults resource_class: small @@ -402,24 +416,6 @@ jobs: - store_artifacts: path: /tmp/docs.tar.gz - report-env: - <<: *defaults - resource_class: small - steps: - - checkout - - *restore_cache - - *set-path - - run: - name: Report environment - command: | - source /opt/miniforge/etc/profile.d/conda.sh - conda env export -n lcdb-wf-test > /tmp/env.yaml - conda env export -n lcdb-wf-test-r > /tmp/env-r.yaml - - store_artifacts: - path: /tmp/env.yaml - - store_artifacts: - path: /tmp/env-r.yaml - # ---------------------------------------------------------------------------- # This section configures the dependencies across jobs. 
workflows: @@ -438,6 +434,7 @@ workflows: requires: - initial-setup - pytest + - chipseq - rnaseq: requires: - initial-setup @@ -446,12 +443,7 @@ workflows: requires: - initial-setup - pytest + - rnaseq - build-docs: requires: - initial-setup - - report-env: - requires: - - rnaseq - - rnaseq-misc - - chipseq - - chipseq-misc diff --git a/test/test_configs/test_rnaseq_config.yaml b/test/test_configs/test_rnaseq_config.yaml index 2cbd3d66..ff043f40 100644 --- a/test/test_configs/test_rnaseq_config.yaml +++ b/test/test_configs/test_rnaseq_config.yaml @@ -1,27 +1,16 @@ -fasta: - url: "https://raw.githubusercontent.com/lcdb/lcdb-test-data/master/data/seq/dm6.small.fa" - postprocess: 'lib.utils.gzipped' - -gtf: - url: "https://raw.githubusercontent.com/lcdb/lcdb-test-data/master/data/annotation/dm6.small.gtf" - postprocess: 'lib.utils.gzipped' - -rrna: - url: - - 'https://www.arb-silva.de/fileadmin/silva_databases/release_128/Exports/SILVA_128_LSURef_tax_silva_trunc.fasta.gz' - - 'https://www.arb-silva.de/fileadmin/silva_databases/release_128/Exports/SILVA_128_SSURef_Nr99_tax_silva_trunc.fasta.gz' - postprocess: - function: 'lib.utils.filter_fastas' - args: 'Drosophila melanogaster' - +organism: Drosophila melanogaster sampletable: 'config/sampletable.tsv' -patterns: 'config/rnaseq_patterns.yaml' - # See https://rnabio.org/module-09-appendix/0009/12/01/StrandSettings for more info. 
stranded: 'fr-firststrand' # for dUTP libraries # 'fr-secondstrand' # for ligation libraries # 'unstranded' # for libraries without strand specificity -aligner: 'star' +genome: + url: "https://raw.githubusercontent.com/lcdb/lcdb-test-data/master/data/seq/dm6.small.fa" + postprocess: 'lib.utils.gzipped' + +annotation: + url: "https://raw.githubusercontent.com/lcdb/lcdb-test-data/master/data/annotation/dm6.small.gtf" + postprocess: 'lib.utils.gzipped' diff --git a/test/test_configs/test_sra_config.yaml b/test/test_configs/test_sra_config.yaml index f3f92cc4..427cae90 100644 --- a/test/test_configs/test_sra_config.yaml +++ b/test/test_configs/test_sra_config.yaml @@ -1,7 +1,4 @@ -patterns: 'config/rnaseq_patterns.yaml' - -# Which key in the `references` dict below to use -organism: 'human' +organism: 'Homo sapiens' # If not specified here, use the environment variable REFERENCES_DIR. references_dir: 'references_data' From 1291a2373de6ad8ee22d72472ee76773f87c8b79 Mon Sep 17 00:00:00 2001 From: Ryan Dale <115406+daler@users.noreply.github.com> Date: Fri, 24 Oct 2025 00:43:07 +0000 Subject: [PATCH 154/196] rm lcdb-wf-test --- docs/decisions.rst | 23 +- test/lcdb-wf-test | 584 --------------------------------------------- 2 files changed, 15 insertions(+), 592 deletions(-) delete mode 100755 test/lcdb-wf-test diff --git a/docs/decisions.rst b/docs/decisions.rst index 71d2a20b..38e1a2f7 100644 --- a/docs/decisions.rst +++ b/docs/decisions.rst @@ -553,11 +553,18 @@ they would not reach significance) guards against this. Zipping/unzipping references ---------------------------- -STAR requires uncogffread references/annotation.gtf -w -references/transcriptome.fa -g references/genome.fampressed -FASTA and GTF files to build the index. Making uncompressed -temporary means running the risk of another rule needing -uncompressed to trigger costly STAR alignment. 
The extra -storage cost of leaving an uncompressed fasta (~3 GB) around is -minimal compared to the scale of all other data, and guards -against inadvertently re-running all alignment jobs. + +STAR requires uncompressed FASTA and GTF files to build the index. Making +uncompressed files temporary means running the risk of another rule needing +uncompressed to trigger costly STAR alignment. The extra storage cost of +leaving an uncompressed fasta (~3 GB) around is minimal compared to the scale +of all other data, and guards against inadvertently re-running all alignment +jobs. + +Test framework +-------------- + +I had previously thought that the CircleCI tests were annoying to run and +reproduce locally, so the ``tests/lcdb-wf-test`` script was born. Turns out +that got rather complicated, and ended up being just as annoying. In the spirit +of reducing complexity, that test harness script is removed. diff --git a/test/lcdb-wf-test b/test/lcdb-wf-test deleted file mode 100755 index 8e8525fb..00000000 --- a/test/lcdb-wf-test +++ /dev/null @@ -1,584 +0,0 @@ -#!/usr/bin/env python - -""" -This script aims to make it more convenient to run various tests using -different configs. - -Below are configured various tests that are exposed to the commandline as -subcommands. These in turn support other commandline args to run a specific -test under that subcommand. - -The command-line help is the authoritative source for commands. Since it is -partly autogenerated, be sure to check it out by running with -h from the -command line. - -Here is a high-level description of what's going on here, which is not in the -command-line help: - -The Runner class, at creation time, sets up a top-level ArgumentParser with -args used throughout, like which env to use, or which dir to consider as the -"original" directory (for testing cases where we've deployed somewhere but we -want to use the test configs from the originally-cloned repo). - -The Runner class also has `_cmd_` methods. 
At runtime, the -Runner's ArgumentParser inspects the Runner to see what `_cmd_*` methods it -has, and adds subcommands for each one it finds. - -It's the job of each of those methods to make an ArgumentParser, parse the -args, and do the right thing. - -Since there are a lot of RNA-seq tests, and they use different parameters (like -different config files, and restricting the run to a sub-dag), these are -configured in the workflow_test_params.yaml file and the ArgumentParser is -automatically populated with these arguments. - -You can always see the CI tests (currently in .circleci/config.yml at the -top-level of the repo) for how this tool is used. - -""" - -import os -import shlex -from textwrap import dedent -import subprocess as sp -import sys -from pathlib import Path -import argparse -import yaml - -HERE = Path(__file__).resolve().parent -TOPLEVEL = Path(__file__).resolve().parent.parent - -WORKFLOW_ARGS = yaml.safe_load(open(TOPLEVEL / "test" / "workflow_test_params.yaml")) - - -def print_header(name): - print("-" * 80) - print("lcdb-wf-test: ", name) - print("-" * 80) - - -class Runner(object): - """ - To add a new command, create a new method with a name starting with - "_cmd_", create a new ArgumentParser. - """ - - default_env = os.getenv("LCDBWF_ENV", str(TOPLEVEL / "env")) - default_env_r = os.getenv("LCDBWF_ENV_R", str(TOPLEVEL / "env-r")) - global_parser = argparse.ArgumentParser(add_help=False) - global_parser.add_argument( - "--env", default=default_env, - help=f"""Main conda environment to use. Override - by setting $LCDBWF_ENV or override that by explicity setting --env. Currently will use {default_env}""" - ) - global_parser.add_argument( - "--env-r", - default=default_env_r, - help=f"""Main R conda environment to use. Override by setting - $LCDBWF_ENV_R or override that by explicity setting --env-r. 
Currently - will use {default_env_r}""" - ) - global_parser.add_argument( - "--orig", - default=str(TOPLEVEL), - help=f"""If specified, you can use the special string '__ORIG__' in - command line arguments which will be filled in with the value provided - here. Mostly used in CI.""", - ) - - def __init__(self): - parser = argparse.ArgumentParser( - description=""" - Test runner for lcdb-wf. - - For any any tests that use Snakemake, you'll need to provide the - relevant extra arguments for Snakemake as well (-n, -j, - --use-conda, etc). These additional args are passed directly to - Snakemake. - - %(prog)s data --kind all - %(prog)s unit_tests --pytest - %(prog)s unit_tests --r-test - %(prog)s rnaseq --run-workflow - %(prog)s rnaseq --trackhub - %(prog)s rnaseq --downstream - %(prog)s chipseq --run-workflow - %(prog)s references --run-workflow --configfile=config/config.yaml - - DATA - ---- - %(prog)s data --kind all --verbose - - UNIT TESTS - ---------- - # Run the pytest unit tests on the lib/ - %(prog)s unit_tests --pytest - - # Run tests on lcdbwf R package - %(prog)s unit_tests --r-test - - # Ensure URLs in the configs exist - %(prog)s unit_tests --url-check - - # Ensure rnaseq.Rmd has matching sections in the docs - %(prog)s unit_tests --ensure-docs - - RNASEQ - ------ - # Run main workflow - %(prog)s rnaseq --run-workflow - - # Build RNA-seq trackhub from output of main workflow - %(prog)s rnaseq --trackhub - - # Run rnaseq.Rmd - %(prog)s rnaseq --downstream - - # Each of these runs a restricted subset of the workflow with - # customized configs; they should be run one at a time. - %(prog)s rnaseq --run-workflow --sra-pe - %(prog)s rnaseq --run-workflow --sra-se - %(prog)s rnaseq --run-workflow --strandedness-pe - %(prog)s rnaseq --run-workflow --strandedness-se - %(prog)s rnaseq --run-workflow --pe - - # Since there are a lot of parameters here, see - # "workflow_test_params.yaml" for how they are managed. 
- - """, - formatter_class=argparse.RawDescriptionHelpFormatter - ) - - # Introspection to build subcommands based on which `_cmd_*` methods - # are defined - choices = [i.replace("_cmd_", "") for i in dir(self) if i.startswith("_cmd_")] - - parser.add_argument("command", help="Subcommand to run", choices=choices) - - # Second arg is the subcommand; dispatch to the appropriate method - args = parser.parse_args(sys.argv[1:2]) - - if not hasattr(self, "_cmd_" + args.command): - print("Unrecognized command") - parser.print_help() - sys.exit(1) - - # Get it and then immediately call it. - subcommand = getattr(self, "_cmd_" + args.command) - subcommand() - - def _cmd_data(self): - """ - Subcommand for downloading test data - """ - - parser = argparse.ArgumentParser( - description="Download data", - parents=[self.global_parser], - ) - - parser.add_argument( - "--kind", - default="all", - choices=["all", "rnaseq", "chipseq"], - help="Kind of data to download", - ) - parser.add_argument( - "--branch", default="master", help="Branch from lcdb-test-data to use" - ) - parser.add_argument( - "--verbose", - action="store_true", - help="Be verbose about what's being downloaded", - ) - - args = parser.parse_args(sys.argv[2:]) - - repo = "lcdb-test-data" - URL = f"https://github.com/lcdb/{repo}/blob/{args.branch}/data/{{}}?raw=true" - - # This dict maps files in the `data` directory of test-data repo to - # a local path to which it should be downloaded, as expected by the - # various test configs and sampletables. Directories are made as - # needed. First one is commented as an example. 
- data_files = { - "rnaseq": [ - ( - # Path in test data repo on GitHub - "rnaseq_samples/sample1/sample1.small_R1.fastq.gz", - - # Download it to this path locally - "workflows/rnaseq/data/example_data/rnaseq_sample1.fq.gz", - ), - ( - "rnaseq_samples/sample2/sample2.small_R1.fastq.gz", - "workflows/rnaseq/data/example_data/rnaseq_sample2.fq.gz", - ), - ( - "rnaseq_samples/sample3/sample3.small_R1.fastq.gz", - "workflows/rnaseq/data/example_data/rnaseq_sample3.fq.gz", - ), - ( - "rnaseq_samples/sample4/sample4.small_R1.fastq.gz", - "workflows/rnaseq/data/example_data/rnaseq_sample4.fq.gz", - ), - ( - "rnaseq_samples/sample1/sample1.small_R1.fastq.gz", - "workflows/rnaseq/data/example_data/rnaseq_sample1PE_1.fq.gz", - ), - ( - "rnaseq_samples/sample1/sample1.small_R2.fastq.gz", - "workflows/rnaseq/data/example_data/rnaseq_sample1PE_2.fq.gz", - ), - ( - "rnaseq_samples/sample2/sample2.small_R1.fastq.gz", - "workflows/rnaseq/data/example_data/rnaseq_sample2PE_1.fq.gz", - ), - ( - "rnaseq_samples/sample2/sample2.small_R2.fastq.gz", - "workflows/rnaseq/data/example_data/rnaseq_sample2PE_2.fq.gz", - ), - ], - "chipseq": [ - ( - "chipseq_samples/input_1/input_1.tiny_R1.fastq.gz", - "workflows/chipseq/data/example_data/chipseq_input1.fq.gz", - ), - ( - "chipseq_samples/input_2/input_2.tiny_R1.fastq.gz", - "workflows/chipseq/data/example_data/chipseq_input2.fq.gz", - ), - ( - "chipseq_samples/input_3/input_3.tiny_R1.fastq.gz", - "workflows/chipseq/data/example_data/chipseq_input3.fq.gz", - ), - ( - "chipseq_samples/ip_1/ip_1.tiny_R1.fastq.gz", - "workflows/chipseq/data/example_data/chipseq_ip1.fq.gz", - ), - ( - "chipseq_samples/ip_2/ip_2.tiny_R1.fastq.gz", - "workflows/chipseq/data/example_data/chipseq_ip2.fq.gz", - ), - ( - "chipseq_samples/ip_3/ip_3.tiny_R1.fastq.gz", - "workflows/chipseq/data/example_data/chipseq_ip3.fq.gz", - ), - ( - "chipseq_samples/ip_4/ip_4.tiny_R1.fastq.gz", - "workflows/chipseq/data/example_data/chipseq_ip4.fq.gz", - ), - ], - } - - if args.kind 
== "all": - kinds = list(data_files.keys()) - else: - kinds = [args.kind] - for kind in kinds: - for fn, dest in data_files[kind]: - url = URL.format(fn) - if args.verbose: - print(f"downloading {url}") - if dest is None: - dest = fn - dest = Path(dest) - dest.parent.mkdir(parents=True, exist_ok=True) - sp.run( - f"wget -q -O- {url} > {dest}", shell=True, check=True, cwd=TOPLEVEL - ) - - def _cmd_unit_tests(self): - """ - Subcommand for unit tests -- these don't run Snakemake. - """ - parser = argparse.ArgumentParser( - description="Run various unit tests and checks", - parents=[self.global_parser], - ) - parser.add_argument( - "--pytest", - action="store_true", - help="Run pytest unit tests and module doctests on lib/ directory", - ) - parser.add_argument( - "--url-check", - action="store_true", - help="Ensure that URLs found in config files (e.g., to genome references) are still valid", - ) - parser.add_argument( - "--r-test", - action="store_true", - help="""Run devtools::test on the lcdbwf R package. 
Activates the - conda environment specified by --env-r just before running.""", - ) - - parser.add_argument( - "--ensure-docs", - action="store_true", - help="Ensure that all named R chunks are documented in the online help docs", - ) - - args = parser.parse_args(sys.argv[2:]) - - if args.pytest: - print_header("pytest") - sp.run(["pytest", "--doctest-modules", "lib"], check=True, cwd=TOPLEVEL) - - if args.url_check: - print_header("url check") - sys.path.insert(0, str(TOPLEVEL)) - from lib.utils import check_all_urls_found - - check_all_urls_found() - - if args.r_test: - print_header("R test") - p = sp.run( - 'eval "$(conda shell.bash hook)" ' - f"&& conda activate {args.env_r} " - '''&& Rscript -e "devtools::test('lib/lcdbwf', reporter=c('summary', 'fail'), export_all=TRUE)"''', - shell=True, - check=True, - executable="/bin/bash" - ) - if p.returncode: - sys.exit(1) - - if args.ensure_docs: - sp.run(["./ensure_docs.py"], check=True, cwd=TOPLEVEL / "ci") - - def _cmd_rnaseq(self): - """ - Subcommand for RNA-seq. There are many tests here, with different - config files and sampletables etc. So the possibilities are configured - over in workflow_test_params.yaml and auto-generated here. - """ - - parser = argparse.ArgumentParser( - description="Run rnaseq workflow and downstream tests", - parents=[self.global_parser], - ) - parser.add_argument( - "--run-workflow", - action="store_true", - help="""Run rnaseq workflow using the run_tesh.sh harness, which - edits the Snakefile to use test settings before running. Additional - args not specified here are passed to Snakemake, or use other flags - below to easily specify config sets.""", - ) - parser.add_argument( - "--trackhub", action="store_true", help="Build the rnaseq track hub" - ) - parser.add_argument( - "--downstream", - action="store_true", - help="""Run the downstream rnaseq.Rmd, via - workflows/rnaseq/run_downstream_test.sh. 
This runs the preprocessor - on the files to allow the use of # [TEST SETTINGS] comments; see - that script for details. Activates environment configured in - --env-r before running.""", - ) - - # Here we programmatically build the parser from the - # workflow_test_params.yaml file which configures arguments for each - # test. Here, the configured tests are added to a mutually-exclusive - # group to avoid inadvertently overwriting each others' config file - # params (in which case the test would not be the the thing you thought - # you were testing...). They all write their params to the - # args.additional_args attribute, which is passed to run_test.sh, which - # in turn passes them to Snakemake itself. - group = parser.add_mutually_exclusive_group() - workflow_prefix = "bash run_test.sh" - workflow_dir = TOPLEVEL / "workflows/rnaseq" - for key, val in WORKFLOW_ARGS["rnaseq"].items(): - group.add_argument( - "--" + key, - action="store_const", - default="", - dest="additional_args", - const=val["args"], - - # Be really explicit about what's being run, so you can run it - # yourself separately if you want (or for double-checking this - # is doing what you want it to do) - help=dedent( - f""" - {val['desc']} - - Runs the following, as configured in workflow_test_params.yaml: - - cd {workflow_dir} && {workflow_prefix} {val['args']} - """), - ) - - args, extra = parser.parse_known_args(sys.argv[2:]) - - if args.run_workflow: - print(args) - if args.additional_args: - extra.extend(shlex.split(args.additional_args)) - - extra = [i.replace("__ORIG__", args.orig) for i in extra] - strargs = " ".join(extra) - cmd = ( - 'eval "$(conda shell.bash hook)" ' - f"&& conda activate {args.env} " - f"&& (cd {workflow_dir} && {workflow_prefix} {strargs})" - ) - print_header(f"Running the following command:\n{cmd}") - sp.run( - cmd, - check=True, - shell=True, - executable="/bin/bash" - ) - if args.trackhub: - cmd = ( - 'eval "$(conda shell.bash hook)" ' - f"&& conda activate 
{args.env} " - f"&& (cd {workflow_dir} " - "&& python rnaseq_trackhub.py config/config.yaml config/hub_config.yaml)" - ) - print_header(f"Building trackhub with command: {cmd}") - - sp.run( - cmd, - shell=True, - check=True, - executable="/bin/bash" - ) - print("See workflows/rnaseq/staging for the built trackhub") - - if args.downstream: - print_header("running downstream rnaseq.Rmd") - sp.run( - 'eval "$(conda shell.bash hook)" ' - f"&& conda activate {args.env_r} " - "&& (cd workflows/rnaseq && bash run_downstream_test.sh)", - shell=True, - check=True, - executable="/bin/bash" - ) - - def _cmd_chipseq(self): - """ - This function handles the "chipseq" subcommand. - """ - - parser = argparse.ArgumentParser( - description="Run chipseq workflow", - parents=[self.global_parser], - ) - parser.add_argument( - "--run-workflow", - action="store_true", - help="""Run chipseq workflow using the run_tesh.sh harness, which - edits the Snakefile to use test settings before running. Additional - args not specified here are passed to Snakemake, or use other flags - below to easily specify config sets.""", - ) - parser.add_argument( - "--trackhub", action="store_true", help="Build the rnaseq track hub" - ) - args, extra = parser.parse_known_args(sys.argv[2:]) - workflow_prefix = "bash run_test.sh" - workflow_dir = TOPLEVEL / "workflows/chipseq" - - if args.run_workflow: - extra = [i.replace("__ORIG__", args.orig) for i in extra] - strargs = " ".join(extra) - cmd = ( - 'eval "$(conda shell.bash hook)" ' - f"&& conda activate {args.env} " - f"&& (cd {workflow_dir} && {workflow_prefix} {strargs})" - ) - print_header(f"Running the following command:\n{cmd}") - sp.run( - cmd, - shell=True, - check=True, - executable="/bin/bash" - ) - if args.trackhub: - cmd = ( - 'eval "$(conda shell.bash hook)" ' - f"&& conda activate {args.env} " - f"&& (cd {workflow_dir} " - "&& python chipseq_trackhub.py config/config.yaml config/hub_config.yaml)" - ) - print_header(f"Building trackhub with 
command: {cmd}") - - sp.run( - cmd, - shell=True, - check=True, - executable="/bin/bash" - ) - print("See workflows/chipseq/staging for the built trackhub") - - def _cmd_references(self): - parser = argparse.ArgumentParser( - description="Run references workflow", - parents=[self.global_parser], - ) - parser.add_argument( - "--run-workflow", - action="store_true", - help="""Run references workflow using the run_tesh.sh harness, which - edits the Snakefile to use test settings before running.""" - ) - args, extra = parser.parse_known_args(sys.argv[2:]) - - workflow_prefix = "bash run_test.sh" - workflow_dir = TOPLEVEL / "workflows/references" - if args.run_workflow: - extra = [i.replace("__ORIG__", args.orig) for i in extra] - strargs = " ".join(extra) - cmd = ( - 'eval "$(conda shell.bash hook)" ' - f"&& conda activate {args.env} " - f"&& (cd {workflow_dir} && {workflow_prefix} {strargs})" - ) - print_header(f"Running the following command:\n{cmd}") - sp.run( - cmd, - shell=True, - check=True, - executable="/bin/bash" - ) - - def _cmd_colocalization(self): - parser = argparse.ArgumentParser( - description="Run colocalization workflow", - parents=[self.global_parser], - ) - parser.add_argument( - "--run-workflow", - action="store_true", - help="""Run colocalization workflow using the run_test.sh harness""" - ) - args, extra = parser.parse_known_args(sys.argv[2:]) - workflow_prefix = "bash run_test.sh" - workflow_dir = TOPLEVEL / "workflows/colocalization" - if args.run_workflow: - extra = [i.replace("__ORIG__", args.orig) for i in extra] - strargs = " ".join(extra) - cmd = ( - 'eval "$(conda shell.bash hook)" ' - f"&& conda activate {args.env} " - f"&& (cd {workflow_dir} && {workflow_prefix} {strargs})" - ) - print_header(f"Running the following command:\n{cmd}") - sp.run( - cmd, - shell=True, - check=True, - executable="/bin/bash" - ) - -if __name__ == "__main__": - Runner() - -# vim: ft=python From 5372cfcd0741c57352d7c93037017ccfb0571294 Mon Sep 17 00:00:00 2001 
From: Ryan Dale <115406+daler@users.noreply.github.com> Date: Fri, 24 Oct 2025 01:46:36 +0000 Subject: [PATCH 155/196] fix some tests --- .circleci/config.yml | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 80767ded..996d6577 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -163,16 +163,17 @@ variables: conda activate $LCDBWF_ENV # run unit tests and doctests for the modules in lib - test/lcdb-wf-test unit_tests --pytest + pytest --doctest-modules lib # Ensure that the chunks in rnaseq.Rmd have matching documentation - test/lcdb-wf-test unit_tests --ensure-docs + (cd ci && ./ensure_docs.py) # find all URLs in reference configs and make sure they exist - test/lcdb-wf-test unit_tests --url-check + python -c "import sys; sys.path.insert(0, '$DEST'); from lib.utils import check_all_urls_found; check_all_urls_found()" # run R package unit tests using the R env - test/lcdb-wf-test unit_tests --r-test + conda activate $LCDBWF_ENV_R + Rscript -e "devtools::test('lib/lcdbwf', reporter=c('summary', 'fail'), export_all=TRUE)" # -------------------------------------------------------------------------- From 0394f850d64123e3df30f504d69e6ed6498dfdbf Mon Sep 17 00:00:00 2001 From: Ryan Dale <115406+daler@users.noreply.github.com> Date: Fri, 24 Oct 2025 02:03:33 +0000 Subject: [PATCH 156/196] update decision log --- docs/decisions.rst | 45 ++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 44 insertions(+), 1 deletion(-) diff --git a/docs/decisions.rst b/docs/decisions.rst index 38e1a2f7..1ceeb339 100644 --- a/docs/decisions.rst +++ b/docs/decisions.rst @@ -567,4 +567,47 @@ Test framework I had previously thought that the CircleCI tests were annoying to run and reproduce locally, so the ``tests/lcdb-wf-test`` script was born. Turns out that got rather complicated, and ended up being just as annoying. In the spirit -of reducing complexity, that test harness script is removed. 
+of reducing complexity, that test harness script is removed. In part, the new
+reference config simplification allows control over configs from the
+commandline, reducing the need to handle that from the test script.
+
+rRNA
+----
+Assessing ribosomal RNA contamination is an important QC step. Different
+annotation sources have different ways of indicating ribosomal RNA. For example,
+Ensembl GTF files typically have "transcript_biotype" attributes on transcript
+featuretypes and "gene_biotype" attributes on gene features, depending on
+version (older versions have separate rRNA featuretypes). FlyBase uses separate
+rRNA feature types. Dictyostelium does not have anything in the GTF. PomBase
+uses the "biotype" attribute.
+
+One way of handling this is to have post-processing steps that extract the rRNA
+features from a GTF (probably defaulting to assuming an Ensembl-like
+"gene_biotype" attribute) and convert them to `IntervalList format
+`__
+to pass to Picard CollectRnaSeqMetrics.
+
+Another way is to bypass the GTF altogether and align to rRNA directly, which is
+what we have historically done here. Previously, the reference configs would all
+need an rRNA entry that basically did the same thing for each organism, since
+every model organism we've worked with is in the SILVA database. It would
+download the full SILVA fasta (for large and small subunits), grep out the
+records for our species of interest, and build a bowtie2 index out of that. That
+means this method is more general, and arguably more complete, but has its own
+complexity: we need to download and filter the fasta, build the bowtie2 index,
+and aggregate the results into a MultiQC module.
+
+In the 2.0 refactor, rRNA fasta creation now only needs an organism name and the
+Snakefile does what was always in the references config, which is to use the
+post-process mechanism to filter the fasta.
+ + + + +Aligners +-------- + +Previously, HISAT2 and STAR were both supported; salmon and kallisto were both +supported. This created additional complexity in the references workflow and in +the configs. Now, we're just using STAR and salmon (for RNA-seq) and bowtie2 for +ChIP-seq. From 0d009b06356d4448e1530af3002104524ecebaa7 Mon Sep 17 00:00:00 2001 From: Ryan Dale <115406+daler@users.noreply.github.com> Date: Fri, 24 Oct 2025 02:08:55 +0000 Subject: [PATCH 157/196] another test fix --- .circleci/config.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.circleci/config.yml b/.circleci/config.yml index 996d6577..5dafe186 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -240,6 +240,7 @@ variables: python rnaseq_trackhub.py config/config.yaml config/hub_config.yaml + conda activate $LCDBWF_ENV_R ./run_downstream_test.sh # bundle up the entire directory to be used as an artifact From d6933363553b4a4badd5be250aed65cc541254fb Mon Sep 17 00:00:00 2001 From: Ryan Dale <115406+daler@users.noreply.github.com> Date: Sun, 26 Oct 2025 21:00:55 +0000 Subject: [PATCH 158/196] fix artifacts for rnaseq --- .circleci/config.yml | 18 ++++-------------- 1 file changed, 4 insertions(+), 14 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 5dafe186..3da87834 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -241,14 +241,13 @@ variables: python rnaseq_trackhub.py config/config.yaml config/hub_config.yaml conda activate $LCDBWF_ENV_R + + # This creates files in `workflows/rnaseq/downstream-test`: ./run_downstream_test.sh # bundle up the entire directory to be used as an artifact - tar -zcf /tmp/downstream.tar.gz workflows/rnaseq/downstream-test/ - cp workflows/rnaseq/downstream-test/rnaseq.html /tmp/rnaseq.html - cp workflows/rnaseq/downstream-test/functional-enrichment.html /tmp/functional-enrichment.html - cp workflows/rnaseq/downstream-test/gene-patterns.html /tmp/gene-patterns.html - cp 
workflows/rnaseq/data/rnaseq_aggregation/multiqc.html /tmp/rnaseq.html + tar -zcf /tmp/downstream.tar.gz downstream-test/ + cp data/rnaseq_aggregation/multiqc.html /tmp/rnaseq.html # -------------------------------------------------------------------------- # Various tests on RNA-seq workflow that don't warrant the overhead of a new @@ -368,18 +367,9 @@ jobs: - store_artifacts: path: /tmp/downstream.tar.gz destination: downstream.tar.gz - - store_artifacts: - path: /tmp/rnaseq.html - destination: rnaseq.html - store_artifacts: path: /tmp/multiqc.html destination: multiqc.html - - store_artifacts: - path: /tmp/functional-enrichment.html - destination: functional-enrichment.html - - store_artifacts: - path: /tmp/gene-patterns.html - destination: gene-patterns.html rnaseq-misc: <<: *defaults From da705f46447da93ff70ded3b22a50a920b189691 Mon Sep 17 00:00:00 2001 From: Ryan Dale <115406+daler@users.noreply.github.com> Date: Mon, 27 Oct 2025 13:50:41 +0000 Subject: [PATCH 159/196] fix export path --- .circleci/config.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 3da87834..3899fa03 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -108,8 +108,8 @@ variables: time conda env create -n $LCDBWF_ENV --file env.yml time conda env create -n $LCDBWF_ENV_R --file env-r.yml - conda env export -n $LCDBWF_ENV > /opt/miniconda/env.yml - conda env export -n $LCDBWF_ENV_R > /opt/miniconda/env.yml + conda env export -n $LCDBWF_ENV > /opt/miniforge/env.yml + conda env export -n $LCDBWF_ENV_R > /opt/miniforge/env.yml fi # -------------------------------------------------------------------------- From 828dcea5a35c68399e49d9c7c89b788a283029db Mon Sep 17 00:00:00 2001 From: Ryan Dale <115406+daler@users.noreply.github.com> Date: Mon, 27 Oct 2025 17:32:41 +0000 Subject: [PATCH 160/196] initial round of reference configs --- .../Danio_rerio/GRCz11.yaml | 10 ++++++++++ 
.../Drosophila_melanogaster/r6.65.yaml | 7 +++++++ .../Eschericia_coli.yaml/ASM584v2.yaml | 7 +++++++ .../Eschericia_coli.yaml/K-12_substr.yaml | 1 + .../Mus_musculus/GENCODE_M38.yaml | 12 ++++++++++++ .../Plodia_interpunctella/ilPloInte3.2.yaml | 7 +++++++ .../Rattus_norvegicus/GRCr8.yaml | 13 +++++++++++++ .../Saccharomyces_cerevisiae/R64-1-1.115.yaml | 1 + .../Saccharomyces_cerevisiae/S288C.yaml | 9 +++++++++ .../Schizosaccharomyces_pombe/ASM294v2.yaml | 7 +++++++ 10 files changed, 74 insertions(+) create mode 100644 include/reference_config_templates/Danio_rerio/GRCz11.yaml create mode 100644 include/reference_config_templates/Drosophila_melanogaster/r6.65.yaml create mode 100644 include/reference_config_templates/Eschericia_coli.yaml/ASM584v2.yaml create mode 120000 include/reference_config_templates/Eschericia_coli.yaml/K-12_substr.yaml create mode 100644 include/reference_config_templates/Mus_musculus/GENCODE_M38.yaml create mode 100644 include/reference_config_templates/Plodia_interpunctella/ilPloInte3.2.yaml create mode 100644 include/reference_config_templates/Rattus_norvegicus/GRCr8.yaml create mode 120000 include/reference_config_templates/Saccharomyces_cerevisiae/R64-1-1.115.yaml create mode 100644 include/reference_config_templates/Saccharomyces_cerevisiae/S288C.yaml create mode 100644 include/reference_config_templates/Schizosaccharomyces_pombe/ASM294v2.yaml diff --git a/include/reference_config_templates/Danio_rerio/GRCz11.yaml b/include/reference_config_templates/Danio_rerio/GRCz11.yaml new file mode 100644 index 00000000..909d27b8 --- /dev/null +++ b/include/reference_config_templates/Danio_rerio/GRCz11.yaml @@ -0,0 +1,10 @@ +organism: "Danio rerio" + +# Primary assembly (excludes haplotypes and alt regions) from Ensembl, +# soft-masked +genome: + url: "https://ftp.ensembl.org/pub/release-115/fasta/danio_rerio/dna/Danio_rerio.GRCz11.dna_sm.primary_assembly.fa.gz" + +# Ensembl provides a version with "chr" prefixes, but this one matches the fasta 
above. +annotation: + url: "https://ftp.ensembl.org/pub/release-115/gtf/danio_rerio/Danio_rerio.GRCz11.115.gtf.gz" diff --git a/include/reference_config_templates/Drosophila_melanogaster/r6.65.yaml b/include/reference_config_templates/Drosophila_melanogaster/r6.65.yaml new file mode 100644 index 00000000..92704845 --- /dev/null +++ b/include/reference_config_templates/Drosophila_melanogaster/r6.65.yaml @@ -0,0 +1,7 @@ +organism: "Drosophila melanogaster" + +genome: + url: "https://s3ftp.flybase.org/genomes/Drosophila_melanogaster/dmel_r6.65_FB2025_04/fasta/dmel-all-chromosome-r6.65.fasta.gz" + +annotation: + url: "https://s3ftp.flybase.org/genomes/Drosophila_melanogaster/dmel_r6.65_FB2025_04/gtf/dmel-all-r6.65.gtf.gz" diff --git a/include/reference_config_templates/Eschericia_coli.yaml/ASM584v2.yaml b/include/reference_config_templates/Eschericia_coli.yaml/ASM584v2.yaml new file mode 100644 index 00000000..97ee4c38 --- /dev/null +++ b/include/reference_config_templates/Eschericia_coli.yaml/ASM584v2.yaml @@ -0,0 +1,7 @@ +organism: "Escherichia coli" + +# From NCBI. 
+genome: + url: "ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/005/845/GCF_000005845.2_ASM584v2/GCF_000005845.2_ASM584v2_genomic.fna.gz" +annotation: + url: "https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/005/845/GCF_000005845.2_ASM584v2/GCF_000005845.2_ASM584v2_genomic.gtf.gz" diff --git a/include/reference_config_templates/Eschericia_coli.yaml/K-12_substr.yaml b/include/reference_config_templates/Eschericia_coli.yaml/K-12_substr.yaml new file mode 120000 index 00000000..e7f0926c --- /dev/null +++ b/include/reference_config_templates/Eschericia_coli.yaml/K-12_substr.yaml @@ -0,0 +1 @@ +ASM584v2.yaml \ No newline at end of file diff --git a/include/reference_config_templates/Mus_musculus/GENCODE_M38.yaml b/include/reference_config_templates/Mus_musculus/GENCODE_M38.yaml new file mode 100644 index 00000000..47d4fbb0 --- /dev/null +++ b/include/reference_config_templates/Mus_musculus/GENCODE_M38.yaml @@ -0,0 +1,12 @@ +# This is the latest release for GRCm39 (mm39). + +organism: "Mus musculus" + +genome: + url: "https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_mouse/release_M38/GRCm39.primary_assembly.genome.fa.gz" + +# Although there is a separate lncRNA annotation that does not specify that it +# is a subset, it does appear to be a subset because those features are in +# this primary assembly annotation. 
+annotation: + url: "https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_mouse/release_M38/gencode.vM38.primary_assembly.annotation.gtf.gz" diff --git a/include/reference_config_templates/Plodia_interpunctella/ilPloInte3.2.yaml b/include/reference_config_templates/Plodia_interpunctella/ilPloInte3.2.yaml new file mode 100644 index 00000000..c23f990c --- /dev/null +++ b/include/reference_config_templates/Plodia_interpunctella/ilPloInte3.2.yaml @@ -0,0 +1,7 @@ +organism: "Plodia interpunctella" + +genome: + url: "https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/027/563/975/GCF_027563975.2_ilPloInte3.2/GCF_027563975.2_ilPloInte3.2_genomic.fna.gz" + +annotation: + url: "https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/027/563/975/GCF_027563975.2_ilPloInte3.2/GCF_027563975.2_ilPloInte3.2_genomic.gtf.gz" diff --git a/include/reference_config_templates/Rattus_norvegicus/GRCr8.yaml b/include/reference_config_templates/Rattus_norvegicus/GRCr8.yaml new file mode 100644 index 00000000..b913d412 --- /dev/null +++ b/include/reference_config_templates/Rattus_norvegicus/GRCr8.yaml @@ -0,0 +1,13 @@ +organism: "Rattus norvegicus" + +# Although there are individual chromosome fastas with "primary" in the +# filename, there is no corresponding genome-wide fasta file with "primary" in +# the filename. However, the toplevel fasta here has "dna_sm:primary_assembly" +# in all of its record descriptions, so it does not appear to have haplotypes +# or alt regions. 
+genome: + url: "https://ftp.ensembl.org/pub/release-115/fasta/rattus_norvegicus/dna/Rattus_norvegicus.GRCr8.dna_sm.toplevel.fa.gz" + +annotation: + url: "https://ftp.ensembl.org/pub/release-115/gtf/rattus_norvegicus/Rattus_norvegicus.GRCr8.115.gtf.gz" + diff --git a/include/reference_config_templates/Saccharomyces_cerevisiae/R64-1-1.115.yaml b/include/reference_config_templates/Saccharomyces_cerevisiae/R64-1-1.115.yaml new file mode 120000 index 00000000..04eff7f4 --- /dev/null +++ b/include/reference_config_templates/Saccharomyces_cerevisiae/R64-1-1.115.yaml @@ -0,0 +1 @@ +S288C.yaml \ No newline at end of file diff --git a/include/reference_config_templates/Saccharomyces_cerevisiae/S288C.yaml b/include/reference_config_templates/Saccharomyces_cerevisiae/S288C.yaml new file mode 100644 index 00000000..4e0204d0 --- /dev/null +++ b/include/reference_config_templates/Saccharomyces_cerevisiae/S288C.yaml @@ -0,0 +1,9 @@ +# https://www.yeastgenome.org/strain/s288c + +# From Ensembl. According to README in this FTP dir, if there's no primary +# assembly then the toplevel is assumed to be the primary assembly. 
+genome: + url: "https://ftp.ensembl.org/pub/release-115/fasta/saccharomyces_cerevisiae/dna/Saccharomyces_cerevisiae.R64-1-1.dna_sm.toplevel.fa.gz" + +annotation: + url: "https://ftp.ensembl.org/pub/release-115/gtf/saccharomyces_cerevisiae/Saccharomyces_cerevisiae.R64-1-1.115.gtf.gz" diff --git a/include/reference_config_templates/Schizosaccharomyces_pombe/ASM294v2.yaml b/include/reference_config_templates/Schizosaccharomyces_pombe/ASM294v2.yaml new file mode 100644 index 00000000..ff0d37c8 --- /dev/null +++ b/include/reference_config_templates/Schizosaccharomyces_pombe/ASM294v2.yaml @@ -0,0 +1,7 @@ +organism: "Schizosaccharomyces pombe" +genome: + url: "http://ftp.ensemblgenomes.org/pub/fungi/release-62/fasta/schizosaccharomyces_pombe/dna/Schizosaccharomyces_pombe.ASM294v2.dna_sm.toplevel.fa.gz" + +annotation: + url: "https://ftp.ensemblgenomes.ebi.ac.uk/pub/fungi/release-62/gff3/schizosaccharomyces_pombe/Schizosaccharomyces_pombe.ASM294v2.62.gff3.gz" + postprocess: 'lib.postprocess.gff2gtf' From db65bb7e19a60a9af9873dc70ef7ee268d0fb8db Mon Sep 17 00:00:00 2001 From: Ryan Dale <115406+daler@users.noreply.github.com> Date: Mon, 27 Oct 2025 17:32:50 +0000 Subject: [PATCH 161/196] ci/get-data.py: pep8, run from any dir, verbose mode --- ci/get-data.py | 117 ++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 96 insertions(+), 21 deletions(-) diff --git a/ci/get-data.py b/ci/get-data.py index cd2d356b..984ed9de 100755 --- a/ci/get-data.py +++ b/ci/get-data.py @@ -1,37 +1,112 @@ #!/usr/bin/env python +import argparse import os + from snakemake.shell import shell from snakemake.utils import makedirs -shell.executable('/bin/bash') -BRANCH = 'master' -URL = 'https://github.com/lcdb/lcdb-test-data/blob/{0}/data/{{}}?raw=true'.format(BRANCH) +BRANCH = "master" +URL = "https://github.com/lcdb/lcdb-test-data/blob/{0}/data/{{}}?raw=true".format( + BRANCH +) + +TOPLEVEL = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) -def _download_file(fn, 
dest=None): +def _download_file(fn, dest=None, verbose=False): url = URL.format(fn) if dest is None: dest = fn + dest = os.path.join(TOPLEVEL, dest) makedirs(os.path.dirname(dest)) - basename = os.path.basename(fn) - shell('wget -q -O- {url} > {dest}') + if not verbose: + q = "-q" + else: + q = "" + shell(f"wget {q} -O- {url} > {dest}") + if verbose: + print(f"Saved {dest}") return dest -_download_file('rnaseq_samples/sample1/sample1.small_R1.fastq.gz', 'workflows/rnaseq/data/example_data/rnaseq_sample1.fq.gz') -_download_file('rnaseq_samples/sample2/sample2.small_R1.fastq.gz', 'workflows/rnaseq/data/example_data/rnaseq_sample2.fq.gz') -_download_file('rnaseq_samples/sample3/sample3.small_R1.fastq.gz', 'workflows/rnaseq/data/example_data/rnaseq_sample3.fq.gz') -_download_file('rnaseq_samples/sample4/sample4.small_R1.fastq.gz', 'workflows/rnaseq/data/example_data/rnaseq_sample4.fq.gz') +ap = argparse.ArgumentParser() +ap.add_argument("-v", "--verbose", action="store_true", help="Be verbose when downloading") +args = ap.parse_args() + +_download_file( + "rnaseq_samples/sample1/sample1.small_R1.fastq.gz", + "workflows/rnaseq/data/example_data/rnaseq_sample1.fq.gz", + args.verbose, +) +_download_file( + "rnaseq_samples/sample2/sample2.small_R1.fastq.gz", + "workflows/rnaseq/data/example_data/rnaseq_sample2.fq.gz", + args.verbose, +) +_download_file( + "rnaseq_samples/sample3/sample3.small_R1.fastq.gz", + "workflows/rnaseq/data/example_data/rnaseq_sample3.fq.gz", + args.verbose, +) +_download_file( + "rnaseq_samples/sample4/sample4.small_R1.fastq.gz", + "workflows/rnaseq/data/example_data/rnaseq_sample4.fq.gz", + args.verbose, +) -_download_file('rnaseq_samples/sample1/sample1.small_R1.fastq.gz', 'workflows/rnaseq/data/example_data/rnaseq_sample1PE_1.fq.gz') -_download_file('rnaseq_samples/sample1/sample1.small_R2.fastq.gz', 'workflows/rnaseq/data/example_data/rnaseq_sample1PE_2.fq.gz') -_download_file('rnaseq_samples/sample2/sample2.small_R1.fastq.gz', 
'workflows/rnaseq/data/example_data/rnaseq_sample2PE_1.fq.gz') -_download_file('rnaseq_samples/sample2/sample2.small_R2.fastq.gz', 'workflows/rnaseq/data/example_data/rnaseq_sample2PE_2.fq.gz') +_download_file( + "rnaseq_samples/sample1/sample1.small_R1.fastq.gz", + "workflows/rnaseq/data/example_data/rnaseq_sample1PE_1.fq.gz", + args.verbose, +) +_download_file( + "rnaseq_samples/sample1/sample1.small_R2.fastq.gz", + "workflows/rnaseq/data/example_data/rnaseq_sample1PE_2.fq.gz", + args.verbose, +) +_download_file( + "rnaseq_samples/sample2/sample2.small_R1.fastq.gz", + "workflows/rnaseq/data/example_data/rnaseq_sample2PE_1.fq.gz", + args.verbose, +) +_download_file( + "rnaseq_samples/sample2/sample2.small_R2.fastq.gz", + "workflows/rnaseq/data/example_data/rnaseq_sample2PE_2.fq.gz", + args.verbose, +) -_download_file('chipseq_samples/input_1/input_1.tiny_R1.fastq.gz', 'workflows/chipseq/data/example_data/chipseq_input1.fq.gz') -_download_file('chipseq_samples/ip_1/ip_1.tiny_R1.fastq.gz', 'workflows/chipseq/data/example_data/chipseq_ip1.fq.gz') -_download_file('chipseq_samples/input_2/input_2.tiny_R1.fastq.gz', 'workflows/chipseq/data/example_data/chipseq_input2.fq.gz') -_download_file('chipseq_samples/ip_2/ip_2.tiny_R1.fastq.gz', 'workflows/chipseq/data/example_data/chipseq_ip2.fq.gz') -_download_file('chipseq_samples/ip_3/ip_3.tiny_R1.fastq.gz', 'workflows/chipseq/data/example_data/chipseq_ip3.fq.gz') -_download_file('chipseq_samples/ip_4/ip_4.tiny_R1.fastq.gz', 'workflows/chipseq/data/example_data/chipseq_ip4.fq.gz') -_download_file('chipseq_samples/input_3/input_3.tiny_R1.fastq.gz', 'workflows/chipseq/data/example_data/chipseq_input3.fq.gz') +_download_file( + "chipseq_samples/input_1/input_1.tiny_R1.fastq.gz", + "workflows/chipseq/data/example_data/chipseq_input1.fq.gz", + args.verbose, +) +_download_file( + "chipseq_samples/ip_1/ip_1.tiny_R1.fastq.gz", + "workflows/chipseq/data/example_data/chipseq_ip1.fq.gz", + args.verbose, +) +_download_file( + 
"chipseq_samples/input_2/input_2.tiny_R1.fastq.gz", + "workflows/chipseq/data/example_data/chipseq_input2.fq.gz", + args.verbose, +) +_download_file( + "chipseq_samples/ip_2/ip_2.tiny_R1.fastq.gz", + "workflows/chipseq/data/example_data/chipseq_ip2.fq.gz", + args.verbose, +) +_download_file( + "chipseq_samples/ip_3/ip_3.tiny_R1.fastq.gz", + "workflows/chipseq/data/example_data/chipseq_ip3.fq.gz", + args.verbose, +) +_download_file( + "chipseq_samples/ip_4/ip_4.tiny_R1.fastq.gz", + "workflows/chipseq/data/example_data/chipseq_ip4.fq.gz", + args.verbose, +) +_download_file( + "chipseq_samples/input_3/input_3.tiny_R1.fastq.gz", + "workflows/chipseq/data/example_data/chipseq_input3.fq.gz", + args.verbose, +) From bc787f2e0f3abbcc0b457e8a55bf929421a37acb Mon Sep 17 00:00:00 2001 From: Ryan Dale <115406+daler@users.noreply.github.com> Date: Mon, 27 Oct 2025 17:33:18 +0000 Subject: [PATCH 162/196] .fai as input for transcriptome fasta --- workflows/rnaseq/Snakefile | 1 + 1 file changed, 1 insertion(+) diff --git a/workflows/rnaseq/Snakefile b/workflows/rnaseq/Snakefile index 6c377137..d62e2ec9 100644 --- a/workflows/rnaseq/Snakefile +++ b/workflows/rnaseq/Snakefile @@ -208,6 +208,7 @@ rule transcriptome_fasta: input: fasta="references/genome.fa", gtf="references/annotation.gtf", + fai="references/genome.fa.fai", output: "references/transcriptome.fa", log: From 1d1683ef864e23c57d03fd0724ce06270a68063e Mon Sep 17 00:00:00 2001 From: Ryan Dale <115406+daler@users.noreply.github.com> Date: Mon, 27 Oct 2025 17:33:34 +0000 Subject: [PATCH 163/196] unzipped references are marked temp() --- workflows/rnaseq/Snakefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/rnaseq/Snakefile b/workflows/rnaseq/Snakefile index d62e2ec9..f9e33b1c 100644 --- a/workflows/rnaseq/Snakefile +++ b/workflows/rnaseq/Snakefile @@ -139,7 +139,7 @@ rule unzip: input: "references/{prefix}.gz", output: - "references/{prefix}", + temporary("references/{prefix}"), resources: 
mem="4g", runtime="2h", From d73109e5fa94b5f8357b6e4fd2ec8a0688c3f971 Mon Sep 17 00:00:00 2001 From: Ryan Dale <115406+daler@users.noreply.github.com> Date: Mon, 27 Oct 2025 17:33:53 +0000 Subject: [PATCH 164/196] updates to decision log --- docs/decisions.rst | 167 +++++++++++++++++++++++++++------------------ 1 file changed, 99 insertions(+), 68 deletions(-) diff --git a/docs/decisions.rst b/docs/decisions.rst index 1ceeb339..4de9189f 100644 --- a/docs/decisions.rst +++ b/docs/decisions.rst @@ -9,18 +9,12 @@ Here are use-cases we have that are common enough to warrant supporting: **References should support multiple workflows (ChIP-seq, RNA-seq, etc)** -- This implies that the means the references dir should be in the ``workflows`` - directory or above. -- For example, this may mean a STAR index for RNA-seq, a bowtie2 index for rRNA - contamination, and another bowtie2 index for ChIP-seq. - **References should support different organisms in different workflows. There should be only one organism per workflow though.** - For example, ``workflows/mouse-rnaseq`` and ``workflows/human-rnaseq`` should be supported in the same project. - **References should be re-created for each project.** - Historically we had a central location for the references (shared by multiple @@ -32,13 +26,13 @@ should be only one organism per workflow though.** back what commands were run to generate the reference, including additional patching that may have taken place (as is supported by the references workflow). -- Re-using indexes is space- and time-efficient in the short term, but has - shown to be inefficient in time and reproducibility in the long term. +- Re-using indexes is space- and time-efficient in the short term, but experience has + shown it to be inefficient in time and reproducibility in the long term. - Keeping everything in the same deployment directory also helps with the archiving process. 
- We were hesitant to update the references in the central location due to being unsure of what was depending on them. -- Overall, making the decision that the time and space cost to re-make +- Overall, here we make the decision that the time and space cost to re-make references for each project is worth the gain in simplicity and isolation. Reference nomenclature and directory structure @@ -49,24 +43,28 @@ Options considered: 1. ``references`` (top-level of project, shared by all workflows) 2. ``workflows//references`` (workflow-specific) -The location ``workflows/references`` is functionally similar to top-level -``references`` (in a parent directory of individual workflows) but references -is no longer a workflow so it doesn't make sense to have it right in the -``workflows`` directory. +The possible location ``workflows/references`` is functionally similar to +top-level ``references`` (in a parent directory of individual workflows) but +references is no longer a workflow so it doesn't make sense to have it right in +the ``workflows`` directory. So this was excluded as an option. Recall that in lcdb-wf <2.0, we have organism and then tag. For example, we might have configurations available for different human genome assemblies (hg19, hg38) and in the central location we needed to differentiate between -them (e.g. ``references/human/hg19/``). +them (e.g. ``references/human/hg19/``), which we did with tags. -If we assume a single organism per workflow, and that the references are -workflow-specific, then we don't need any of this. +If we assume a single organism per workflow, which seems reasonable and that +the references are workflow-specific, then we don't need any of this. ``workflows//references/genome.fa`` for example should cover it. This becomes inefficient in the case where there are multiple workflows, all -for the same organism and all the same workflow type. 
However in such cases, -manually creating symlinks can get around this, and I think it's an acceptable -workaround for the benefit of simplified references more generally. +for the same organism and all the same workflow type. For example, a project +with chipseq and a two different RNA-seq experiments would have three copies of +the genome fasta. However in such cases, manually creating symlinks can get +around this if space is a problem, and I think it's an acceptable workaround +for the benefit of simplified references more generally. + +So we might have something like the following: :: @@ -117,8 +115,8 @@ can be quite close to the equivalent command-line call. Since rules in these Snakefiles are intended to be edited, it makes sense to keep them as close to the command-line as is reasonable. -Take the cutadapt rule, for example, where we typically would want to include -the adapters in the call, but it's not uncommon to add other arguments. Here +Take the cutadapt rule for example, where we typically would want to include +the adapters but it's not uncommon to add other arguments. Here we're working with a simplified, single-end version of it: .. code-block:: python @@ -128,8 +126,7 @@ we're working with a simplified, single-end version of it: fastq='{sample}.fastq.gz" output: fastq='{sample}.cutadapt.fastq.gz' - threads: - 8 + threads: 8 shell: "cutadapt " "-o {output[0]} " @@ -248,8 +245,9 @@ cutadapt depend on that. Here's the actual rule: "&> {log}" ) -Notice that we have some shared arguments as well as a PE-specific adapter -argument. Converting this one to params would be something like the following: +Notice that we have some shared arguments (``--nextseq-trim``, ``--overlap``, +``--minimum-length``) as well as a PE-specific adapter argument. Converting +this one to params would be something like the following: .. code-block:: python @@ -350,7 +348,8 @@ specific is handled there? 
Now it becomes a little harder to understand what's going on, and we may have gone too far in pulling everything out into params. So maybe an absolute -principle of "everything in params" is not useful. +principle of "everything must go in params" is not useful because it impacts +clarity. Let's take another example, the featureCounts rule for RNA-seq: @@ -393,13 +392,15 @@ Let's take another example, the featureCounts rule for RNA-seq: ) Here, it is important to have ``strand_arg`` be in the params. To understand -why, imagine if instead we determined that argument inside the ``run:`` block, -and then we changed the config file's stranded entry (``config["stranded"]``). -Then this rule would NOT re-run because the code didn't change -- Snakemake -does not *evaluate* the code in a ``run:`` block to determine if it changed. -However, it *does* evaluate the params. So in this case, it's necessary to keep -the strand argument detection in the params to take advantage of this behavior, -and correctly re-run the rule if the config's strand argument has changed. +why, imagine if we determined that argument inside the ``run:`` block instead +of in params, and then we changed the config file's stranded entry +(``config["stranded"]``). Even though we would want it to re-run (since the +config changed), this rule would NOT re-run because the *code* didn't change -- +Snakemake does not *evaluate* the code in a ``run:`` block to determine if it +changed. However, it *does* evaluate the params. So in this case, it's +necessary to keep the strand argument detection in the params to take advantage +of this behavior, and correctly re-run the rule if the config's strand argument +has changed. Next, we would want to decide whether *all* arguments should go in ``params:``. 
In this case, since we're sort of forced to split out ``strand_arg``, we might @@ -419,29 +420,34 @@ Guidelines: - Stranded arguments must be in params - SE/PE arguments should be handled inside a ``run:`` block - Any other arguments should be written in a ``shell:`` block or a ``shell()`` - call directly, to visually match the equivalent command-line call + call directly, to visually match the equivalent command-line call and to make + it clear what should be edited. Arguments for and against a separate references workflow -------------------------------------------------------- RNA-seq, ChIP-seq, and the upcoming variant calling all need to do something -with references, including possibly patching them. So we have to deal with this -inherent complexity. It initially made sense to put such common rules in the +with references, including possibly patching them. We have to deal with this +inherent complexity. It initially made sense to put common rules in the separate references workflow. However, only a subset of the rules in the references workflow are actually shared across RNA-seq and ChIP-seq -- currently, only the bowtie2 index (genome-wide ChIP-seq alignment; rRNA screening for RNA-seq), the fasta rule, -chromsizes, and the generic unzip rule. The others (gtf, mappings, -conversion_bed12, conversion_refflat, kallisto_index, salmon_index, -transcriptome_fasta, star_index, rrna) are all unique to RNA-seq. So the -current references workflow is actually mostly an RNA-seq-only references -workflow. +chromsizes, and the generic unzip rule. The other rules in the `__ on the subject is a useful guideline. To summarize, we want to exclude alt contigs / haplotypes because they may create multimapping issues, and we want to -include unassembled contigs because excluding them will artificially decrease +include unassembled contigs because excluding them would artificially decrease alignment percentage. 
Since lcdb-wf is intended to be used with arbitrary organisms, the PAR and mitochondrial sequences mentioned there are not relevant in general. -Ideally, we would have a tool that, given the URLs for raw fastq and gtf, - -1. Displays the set of chromosomes -2. Infers if there are any that look like rDNA or mtDNA -3. Ensures the GTF matches the fasta match chromosomes -4. Accepts a template config to assess to process - Annotations ----------- @@ -549,17 +551,30 @@ Erring on the side of too many annotations (i.e., using the comprehensive annotation instead of a curated version) will result in more features, which at face value might make the FDR adjustment more harsh in DESeq2. But DESeq2's independent filtering (not even testing those features with so few reads that -they would not reach significance) guards against this. +they would not reach significance) guards against this. So we stick with the +comprehensive annotations when available. Zipping/unzipping references ---------------------------- -STAR requires uncompressed FASTA and GTF files to build the index. Making -uncompressed files temporary means running the risk of another rule needing -uncompressed to trigger costly STAR alignment. The extra storage cost of -leaving an uncompressed fasta (~3 GB) around is minimal compared to the scale -of all other data, and guards against inadvertently re-running all alignment -jobs. +Some tools need uncompressed files, others are fine with compressed. For example, +STAR requires uncompressed FASTA and GTF files to build the index, but bowtie2 +can use a compressed fasta. gffread nees uncompressed FASTA and GTF to make +a transcriptome fasta. + +Previously, anything using a FASTA or GTF would use the uncompressed version, +and the ``unzip`` rule marked the uncompressed output as temporary. The problem +with this was when we wanted to make a change in featureCounts. 
Since this used +the temp uncompressed GTF file, the ``unzip`` rule needed to run again...but +that would then trigger the STAR rule to rerun, because it too used that temp +file and it was being changed (well, re-created but that's the same to +Snakemake). As a result, we had to spend the time/resource cost to realign +*everything* and all the downstream jobs after alignment, just to run +featureCounts. + +Making the featureCounts rule use the compressed GTF avoids this issue. Now, +just the transcriptome fasta and the STAR index need the uncompressed +references, and these are set in the ``unzip`` rule to be temporary. Test framework -------------- @@ -603,7 +618,6 @@ post-process mechanism to filter the fasta. - Aligners -------- @@ -611,3 +625,20 @@ Previously, HISAT2 and STAR were both supported; salmon and kallisto were both supported. This created additional complexity in the references workflow and in the configs. Now, we're just using STAR and salmon (for RNA-seq) and bowtie2 for ChIP-seq. + +Aligners don't seem to make that much of a difference, and officially +supporting just one (plus a psueodaligner for RNA-seq) makes the workflows and +config simpler. + +Reference genome and annotation sources +--------------------------------------- + +lcdb-wf has always been organism-agnostic. It would be nice to have a single +source of all genomics data such that we could pass an organism name and get +back the referencs. But even Ensembl and NCBI are not uniform in their support. +Sometimes primary assemblies are available; sometimes primary chromosome fastas +are available but the top-level is actually primary (rat, Ensembl); A GTF might +not be available (pombe, Ensembl); or only a toplevel assembly is available and +we need to remove the haplotypes and alt loci out (hg19, Ensembl). 
+ + From 919c45874901889328c935abb9bcfd83da7079b9 Mon Sep 17 00:00:00 2001 From: Ryan Dale <115406+daler@users.noreply.github.com> Date: Tue, 28 Oct 2025 18:29:13 +0000 Subject: [PATCH 165/196] gzip transcriptome fasta and mapping tsv --- workflows/rnaseq/Snakefile | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/workflows/rnaseq/Snakefile b/workflows/rnaseq/Snakefile index f9e33b1c..f5a6f801 100644 --- a/workflows/rnaseq/Snakefile +++ b/workflows/rnaseq/Snakefile @@ -31,7 +31,7 @@ localrules: rule all: input: "data/rnaseq_aggregation/multiqc.html", - "references/annotation.mapping.tsv", + "references/annotation.mapping.tsv.gz", rule symlinks: @@ -210,19 +210,21 @@ rule transcriptome_fasta: gtf="references/annotation.gtf", fai="references/genome.fa.fai", output: - "references/transcriptome.fa", + fa=temporary("references/transcriptome.fa"), + gz="references/transcriptome.fa.gz", log: "references/transcriptome.log", resources: mem="4g", runtime="2h", shell: - "gffread {input.gtf} -w {output} -g {input.fasta} &> {log}" + "gffread {input.gtf} -w {output.fa} -g {input.fasta} &> {log} " + "&& gzip -c {output.fa} > {output.gz} " rule salmon_index: input: - "references/transcriptome.fa", + "references/transcriptome.fa.gz", output: "references/salmon/versionInfo.json", log: @@ -297,21 +299,20 @@ rule mappings: input: gtf="references/annotation.gtf.gz", output: - tsv="references/annotation.mapping.tsv", + "references/annotation.mapping.tsv.gz", resources: - mem="2g", + mem="24g", runtime="2h", run: + tsv = output[0].replace(".gz", "") mappings_args = dict( exclude_featuretypes=None, include_featuretypes=None, include_attributes=None, ) - print(config["annotation"].get("mappings", {})) - mappings_args.update(config["annotation"].get("mappings", {})) - - utils.mappings_tsv(input.gtf, output.tsv, **mappings_args) + utils.mappings_tsv(input.gtf, tsv, **mappings_args) + shell("gzip {tsv}") rule symlink_targets: From 
a3b6b33260256fd00eece56f340f33cddba3b805 Mon Sep 17 00:00:00 2001 From: Ryan Dale <115406+daler@users.noreply.github.com> Date: Tue, 28 Oct 2025 18:30:24 +0000 Subject: [PATCH 166/196] pep8 --- lib/utils.py | 29 ++++++++++++++++++----------- 1 file changed, 18 insertions(+), 11 deletions(-) diff --git a/lib/utils.py b/lib/utils.py index a0f90a7f..97fed6cb 100644 --- a/lib/utils.py +++ b/lib/utils.py @@ -1,31 +1,33 @@ import binascii import collections import contextlib +import csv import gzip import os import re -import sys import subprocess +import sys import warnings from collections.abc import Iterable from itertools import product +import gffutils import pandas import pandas as pd import yaml from Bio import SeqIO from snakemake.io import expand, regex_from_filepattern from snakemake.shell import shell -import gffutils -import csv # Small helper functions + def render_r1_r2(pattern): - return expand(pattern, sample='{sample}', n=c.n) + return expand(pattern, sample="{sample}", n=c.n) + def render_r1_only(pattern): - return expand(pattern, sample='{sample}', n=1) + return expand(pattern, sample="{sample}", n=1) def resolve_name(name): @@ -744,6 +746,7 @@ def filter_rrna_fastas(tmpfiles, outfile, pattern): """ if pattern is None: raise ValueError("Pattern cannot be None") + def gen(): for tmp in tmpfiles: handle = gzip.open(tmp, "rt") @@ -866,7 +869,6 @@ def func(infiles, outfile, *args, **kwargs): """ - if not isinstance(postprocess, list): postprocess = [postprocess] @@ -1187,10 +1189,13 @@ def gff2gtf(gff, gtf): def wrapper_for(path): - return 'file:' + os.path.join('../..','wrappers', 'wrappers', path) + return "file:" + os.path.join("../..", "wrappers", "wrappers", path) + def detect_sra(sampletable): - return 'Run' in sampletable.columns and any(sampletable['Run'].str.startswith('SRR')) + return "Run" in sampletable.columns and any( + sampletable["Run"].str.startswith("SRR") + ) def mappings_tsv(gtf, tsv, exclude_featuretypes=None, 
include_featuretypes=None, include_attributes=None): @@ -1218,7 +1223,7 @@ def mappings_tsv(gtf, tsv, exclude_featuretypes=None, include_featuretypes=None, raise ValueError("Both include_featuretypes and exclude_featuretypes were specified.") res = [] - keys = set(['__featuretype__']) + keys = set(["__featuretype__"]) seen = set() for f in gffutils.DataIterator(gtf): ft = f.featuretype @@ -1245,8 +1250,10 @@ def unlist_dict(d): sorted_keys = sorted(include_attributes) else: sorted_keys = sorted(keys) - with open(tsv, 'w') as fout: - writer = csv.DictWriter(fout, fieldnames=sorted_keys, restval="", delimiter='\t') + with open(tsv, "w") as fout: + writer = csv.DictWriter( + fout, fieldnames=sorted_keys, restval="", delimiter="\t" + ) writer.writeheader() for row in res: writer.writerow(unlist_dict(row)) From 1c698b242bce47d9ed5146b0db9035ac5f84f5ce Mon Sep 17 00:00:00 2001 From: Ryan Dale <115406+daler@users.noreply.github.com> Date: Tue, 28 Oct 2025 18:30:38 +0000 Subject: [PATCH 167/196] include/exclude attributes in mappings --- lib/utils.py | 31 +++++++++++++++++++++++++++---- 1 file changed, 27 insertions(+), 4 deletions(-) diff --git a/lib/utils.py b/lib/utils.py index 97fed6cb..1363cdf2 100644 --- a/lib/utils.py +++ b/lib/utils.py @@ -1198,7 +1198,14 @@ def detect_sra(sampletable): ) -def mappings_tsv(gtf, tsv, exclude_featuretypes=None, include_featuretypes=None, include_attributes=None): +def mappings_tsv( + gtf, + tsv, + exclude_featuretypes=None, + include_featuretypes=None, + include_attributes=None, + exclude_attributes=None, +): """ Create a TSV file of attributes found in a GTF file. @@ -1213,14 +1220,20 @@ def mappings_tsv(gtf, tsv, exclude_featuretypes=None, include_featuretypes=None, E.g., we likely don't need entries for start_codon if those are in the GTF. - include_attributes : list - Restrict the attributes reported in the TSV. Should at least have + include_attributes, exclude_attributes : list + Mutually exclusive. 
Restrict the attributes reported in the TSV. Should at least have a column for gene ID and transcript ID in order for downstream RNA-seq work. """ if exclude_featuretypes and include_featuretypes: - raise ValueError("Both include_featuretypes and exclude_featuretypes were specified.") + raise ValueError( + "Both include_featuretypes and exclude_featuretypes were specified." + ) + if exclude_attributes and include_attributes: + raise ValueError( + "Both include_attributes and exclude_attributes were specified." + ) res = [] keys = set(["__featuretype__"]) @@ -1231,13 +1244,23 @@ continue if include_featuretypes and ft not in include_featuretypes: continue + d = dict(f.attributes) + + if include_attributes: + d = {k: v for k, v in d.items() if k in include_attributes} + if exclude_attributes: + d = {k: v for k, v in d.items() if k not in exclude_attributes} + keys.update(d.keys()) d["__featuretype__"] = ft + + # Exclude duplicates (rather than sorting and uniq-ing the file later) h = hash(str(d)) if h in seen: continue seen.update([h]) + res.append(d) def unlist_dict(d): From 6b788ed78753fe1620ee2ffcc5f0d2c65efa8e52 Mon Sep 17 00:00:00 2001 From: Ryan Dale <115406+daler@users.noreply.github.com> Date: Fri, 31 Oct 2025 15:55:38 +0000 Subject: [PATCH 168/196] fix default postprocess --- lib/utils.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/lib/utils.py b/lib/utils.py index 1363cdf2..03dbe62f 100644 --- a/lib/utils.py +++ b/lib/utils.py @@ -895,10 +895,13 @@ def func(infiles, outfile, *args, **kwargs): # # ] # + def _default(origfn, newfn): + shell("mv {origfn} {newfn}") + for i, postprocess_i in enumerate(postprocess): if postprocess_i is None: - func = default_postprocess + func = _default args = () kwargs = {} name = None From 2b7084f5d60ea63061e8877c1ae299a976460157 Mon Sep 17 00:00:00 2001 From: Ryan Dale <115406+daler@users.noreply.github.com> 
Date: Fri, 31 Oct 2025 15:57:17 +0000 Subject: [PATCH 169/196] use configurable references dir --- workflows/chipseq/Snakefile | 21 ++++----- workflows/rnaseq/Snakefile | 87 +++++++++++++++++++------------------ 2 files changed, 55 insertions(+), 53 deletions(-) diff --git a/workflows/chipseq/Snakefile b/workflows/chipseq/Snakefile index 4f347eb7..0c9688a0 100644 --- a/workflows/chipseq/Snakefile +++ b/workflows/chipseq/Snakefile @@ -17,6 +17,7 @@ is_paired = utils.detect_layout(sampletable) == "PE" n = ["1", "2"] if is_paired else ["1"] SAMPLES = sampletable.iloc[:, 0].values LABELS = sampletable.label.values +REFERENCES = config.get("references", "references") peaks = chipseq.add_bams_to_peak_calling(config) @@ -43,9 +44,9 @@ rule all: rule fasta: output: - "references/genome.fa.gz", + f"{REFERENCES}/genome.fa.gz", log: - "references/logs/genome.fa.gz.log", + f"{REFERENCES}/logs/genome.fa.gz.log", resources: mem_mb="4g", runtime="2h", @@ -63,11 +64,11 @@ rule fasta: rule chromsizes: input: - "references/genome.fa.gz", + f"{REFERENCES}/genome.fa.gz", output: - "references/genome.chromsizes", + f"{REFERENCES}/genome.chromsizes", log: - "references/logs/genome.chromsizes.log", + f"{REFERENCES}/logs/genome.chromsizes.log", params: # java_args='-Xmx2g' # [enable for test] java_args="-Xmx20g", # [disable for test] @@ -89,12 +90,12 @@ rule chromsizes: rule bowtie2_index: input: - "references/genome.fa.gz", + f"{REFERENCES}/genome.fa.gz", output: - "references/bowtie2/genome.1.bt2", - "references/bowtie2/genome.fa", + f"{REFERENCES}/bowtie2/genome.1.bt2", + f"{REFERENCES}/bowtie2/genome.fa", log: - "references/logs/bowtie2_genome.log", + f"{REFERENCES}/logs/bowtie2_genome.log", resources: mem="32g", disk="50g", @@ -227,7 +228,7 @@ rule bowtie2: n=n, allow_missing=True, ), - index="references/bowtie2/genome.1.bt2", + index=f"{REFERENCES}/bowtie2/genome.1.bt2", output: bam=temporary("data/chipseq_samples/{sample}/{sample}.cutadapt.bam"), log: diff --git 
a/workflows/rnaseq/Snakefile b/workflows/rnaseq/Snakefile index f5a6f801..9327b0ac 100644 --- a/workflows/rnaseq/Snakefile +++ b/workflows/rnaseq/Snakefile @@ -15,6 +15,7 @@ sampletable = sampletable.set_index(sampletable.columns[0], drop=False) is_paired = utils.detect_layout(sampletable) == "PE" n = ["1", "2"] if is_paired else ["1"] SAMPLES = sampletable.index +REFERENCES = config.get("references", "references") sample_dir = "data/rnaseq_samples" @@ -31,7 +32,7 @@ localrules: rule all: input: "data/rnaseq_aggregation/multiqc.html", - "references/annotation.mapping.tsv.gz", + f"{REFERENCES}/annotation.mapping.tsv.gz", rule symlinks: @@ -55,9 +56,9 @@ rule symlinks: rule fasta: output: - "references/genome.fa.gz", + f"{REFERENCES}/genome.fa.gz", log: - "references/logs/genome.fa.gz.log", + f"{REFERENCES}/logs/genome.fa.gz.log", resources: mem_mb="4g", runtime="2h", @@ -75,9 +76,9 @@ rule fasta: rule faidx: input: - "references/genome.fa", + f"{REFERENCES}/genome.fa", output: - "references/genome.fa.fai", + f"{REFERENCES}/genome.fa.fai", resources: mem_mb="4g", runtime="2h", @@ -87,9 +88,9 @@ rule faidx: rule annotation: output: - "references/annotation.gtf.gz", + f"{REFERENCES}/annotation.gtf.gz", log: - "references/logs/annotation.gtf.gz.log", + f"{REFERENCES}/logs/annotation.gtf.gz.log", resources: mem="4g", runtime="2h", @@ -107,9 +108,9 @@ rule annotation: rule rrna_fasta: output: - "references/rrna.fa.gz", + f"{REFERENCES}/rrna.fa.gz", log: - "references/logs/rrna.fa.log", + f"{REFERENCES}/logs/rrna.fa.log", resources: mem="4g", runtime="2h", @@ -137,9 +138,9 @@ rule rrna_fasta: rule unzip: input: - "references/{prefix}.gz", + f"{REFERENCES}/{{prefix}}.gz", output: - temporary("references/{prefix}"), + temporary(f"{REFERENCES}/{{prefix}}"), resources: mem="4g", runtime="2h", @@ -149,12 +150,12 @@ rule unzip: rule rrna_index: input: - "references/rrna.fa.gz", + f"{REFERENCES}/rrna.fa.gz", output: - "references/bowtie2/rrna.1.bt2", - 
"references/bowtie2/rrna.fa.gz", + f"{REFERENCES}/bowtie2/rrna.1.bt2", + f"{REFERENCES}/bowtie2/rrna.fa.gz", log: - "references/logs/bowtie2_rrna.log", + f"{REFERENCES}/logs/bowtie2_rrna.log", resources: mem="32g", disk="50g", @@ -168,12 +169,12 @@ rule rrna_index: rule star_index: input: - fasta="references/genome.fa", - gtf="references/annotation.gtf", + fasta=f"{REFERENCES}/genome.fa", + gtf=f"{REFERENCES}/annotation.gtf", output: - "references/star/Genome", + f"{REFERENCES}/star/Genome", log: - "references/logs/star.log", + f"{REFERENCES}/logs/star.log", threads: 8 resources: mem="64g", @@ -206,14 +207,14 @@ rule star_index: rule transcriptome_fasta: input: - fasta="references/genome.fa", - gtf="references/annotation.gtf", - fai="references/genome.fa.fai", + fasta=f"{REFERENCES}/genome.fa", + gtf=f"{REFERENCES}/annotation.gtf", + fai=f"{REFERENCES}/genome.fa.fai", output: - fa=temporary("references/transcriptome.fa"), - gz="references/transcriptome.fa.gz", + fa=temporary(f"{REFERENCES}/transcriptome.fa"), + gz=f"{REFERENCES}/transcriptome.fa.gz", log: - "references/transcriptome.log", + f"{REFERENCES}/transcriptome.log", resources: mem="4g", runtime="2h", @@ -224,13 +225,13 @@ rule transcriptome_fasta: rule salmon_index: input: - "references/transcriptome.fa.gz", + f"{REFERENCES}/transcriptome.fa.gz", output: - "references/salmon/versionInfo.json", + f"{REFERENCES}/salmon/versionInfo.json", log: - "references/logs/salmon.log", + f"{REFERENCES}/logs/salmon.log", params: - outdir="references/salmon", + outdir=f"{REFERENCES}/salmon", resources: mem="32g", runtime="2h", @@ -241,11 +242,11 @@ rule salmon_index: rule conversion_refflat: input: - "references/annotation.gtf.gz", + f"{REFERENCES}/annotation.gtf.gz", output: - "references/annotation.refflat", + f"{REFERENCES}/annotation.refflat", log: - "references/logs/annotation.refflat.log", + f"{REFERENCES}/logs/annotation.refflat.log", resources: mem="2g", runtime="2h", @@ -257,9 +258,9 @@ rule conversion_refflat: 
rule conversion_bed12: input: - "references/annotation.gtf.gz", + f"{REFERENCES}/annotation.gtf.gz", output: - "references/annotation.bed12", + f"{REFERENCES}/annotation.bed12", resources: mem="2g", runtime="2h", @@ -271,11 +272,11 @@ rule conversion_bed12: rule chromsizes: input: - "references/genome.fa.gz", + f"{REFERENCES}/genome.fa.gz", output: - "references/genome.chromsizes", + f"{REFERENCES}/genome.chromsizes", log: - "references/logs/genome.chromsizes.log", + f"{REFERENCES}/logs/genome.chromsizes.log", params: # java_args='-Xmx2g' # [enable for test] java_args="-Xmx20g", # [disable for test] @@ -297,9 +298,9 @@ rule chromsizes: rule mappings: input: - gtf="references/annotation.gtf.gz", + gtf=f"{REFERENCES}/annotation.gtf.gz", output: - "references/annotation.mapping.tsv.gz", + f"{REFERENCES}/annotation.mapping.tsv.gz", resources: mem="24g", runtime="2h", @@ -405,7 +406,7 @@ rule star: input: fastq=rules.cutadapt.output, index=rules.star_index.output, - annotation="references/annotation.gtf", + annotation=f"{REFERENCES}/annotation.gtf", output: bam=temporary("data/rnaseq_samples/{sample}/{sample}.cutadapt.bam"), sjout=temporary( @@ -463,7 +464,7 @@ rule star: rule rRNA: input: fastq="data/rnaseq_samples/{sample}/{sample}_R1.cutadapt.fastq.gz", - index="references/bowtie2/rrna.1.bt2", + index=f"{REFERENCES}/bowtie2/rrna.1.bt2", output: bam="data/rnaseq_samples/{sample}/rRNA/{sample}.cutadapt.rrna.bam", log: @@ -574,7 +575,7 @@ rule namesorted_bam: rule featurecounts: input: - annotation="references/annotation.gtf.gz", + annotation=f"{REFERENCES}/annotation.gtf.gz", bam=expand( ( rules.namesorted_bam.output @@ -691,7 +692,7 @@ rule preseq: rule salmon: input: fastq=rules.cutadapt.output, - index="references/salmon/versionInfo.json", + index=f"{REFERENCES}/salmon/versionInfo.json", output: "data/rnaseq_samples/{sample}/{sample}.salmon/quant.sf", log: From 0ea7cfcbadd5b464a400cec82ad9fc563ebb7005 Mon Sep 17 00:00:00 2001 From: Ryan Dale 
<115406+daler@users.noreply.github.com> Date: Fri, 31 Oct 2025 21:24:58 +0000 Subject: [PATCH 170/196] preflight checks --- lib/utils.py | 30 ++++++++++++++++++++++-------- workflows/chipseq/Snakefile | 1 + workflows/rnaseq/Snakefile | 1 + 3 files changed, 24 insertions(+), 8 deletions(-) diff --git a/lib/utils.py b/lib/utils.py index 03dbe62f..202cddd3 100644 --- a/lib/utils.py +++ b/lib/utils.py @@ -698,14 +698,28 @@ def preflight(config): check_unique_samplename(sampletable) if "orig_filename" in sampletable.columns: check_unique_fn(sampletable) - - -def rnaseq_preflight(c): - pass - - -def chipseq_preflight(c): - pass + if "genome" not in config: + raise ConfigurationError("Config is missing 'genome' key") + if "url" not in config["genome"]: + raise ConfigurationError("Config is missing 'url' key for 'genome'") + + +def rnaseq_preflight(config): + preflight(config) + if "annotation" not in config: + raise ConfigurationError("Config is missing 'annotation' key") + if "url" not in config["annotation"]: + raise ConfigurationError("Config is missing 'url' key for 'annotation'") + if "stranded" not in config: + raise ConfigurationError("Config is missing 'stranded' key") + if "organism" not in config: + raise ConfigurationError("Config is missing 'organism' key") + + +def chipseq_preflight(config): + preflight(config) + if "peaks" not in config: + config["peaks"] = [] def strand_arg_lookup(config, lookup): diff --git a/workflows/chipseq/Snakefile b/workflows/chipseq/Snakefile index 0c9688a0..83386e82 100644 --- a/workflows/chipseq/Snakefile +++ b/workflows/chipseq/Snakefile @@ -11,6 +11,7 @@ from lib import chipseq configfile: "config/config.yaml" +utils.chipseq_preflight(config) sampletable = pd.read_table(config["sampletable"], sep="\t", comment="#") sampletable = sampletable.set_index(sampletable.columns[0], drop=False) is_paired = utils.detect_layout(sampletable) == "PE" diff --git a/workflows/rnaseq/Snakefile b/workflows/rnaseq/Snakefile index 
9327b0ac..6c776dde 100644 --- a/workflows/rnaseq/Snakefile +++ b/workflows/rnaseq/Snakefile @@ -10,6 +10,7 @@ from lib import utils configfile: "config/config.yaml" +utils.rnaseq_preflight(config) sampletable = pd.read_table(config["sampletable"], sep="\t", comment="#") sampletable = sampletable.set_index(sampletable.columns[0], drop=False) is_paired = utils.detect_layout(sampletable) == "PE" From 0ead4c3783e302b05bcc28e585dbde7ae5259289 Mon Sep 17 00:00:00 2001 From: Ryan Dale <115406+daler@users.noreply.github.com> Date: Tue, 4 Nov 2025 10:23:22 -0500 Subject: [PATCH 171/196] rm no-longer used dependencies --- include/requirements.txt | 14 +------------- 1 file changed, 1 insertion(+), 13 deletions(-) diff --git a/include/requirements.txt b/include/requirements.txt index dfcb8601..98b67ee4 100644 --- a/include/requirements.txt +++ b/include/requirements.txt @@ -1,18 +1,13 @@ bedtools biopython -bowtie bowtie2 cutadapt>=3.0 deeptools epic2 -fastq-screen fastqc font-ttf-dejavu-sans-mono gffread gffutils -hisat2 -intervalstats -ipython macs3 multiqc pandas @@ -25,16 +20,14 @@ preseq pybedtools pyfaidx pysam -pytest -pytest-xdist python +pytest rseqc # earlier versions of salmon can segfault on Slurm salmon>=1.10.1 samtools -seaborn snakemake>8 sra-tools star @@ -43,11 +36,6 @@ trackhub ucsc-bedgraphtobigwig ucsc-bedsort ucsc-bedtobigbed -ucsc-bigwigmerge -ucsc-fetchchromsizes ucsc-genepredtobed ucsc-gtftogenepred -ucsc-liftover -ucsc-oligomatch ucsc-twobittofa -ucsc-wigtobigwig From dbb21a9a54fc27a83c0c009b834083c6db96f3d0 Mon Sep 17 00:00:00 2001 From: Ryan Dale <115406+daler@users.noreply.github.com> Date: Tue, 4 Nov 2025 21:24:15 +0000 Subject: [PATCH 172/196] fill in label column with sample names if missing --- workflows/chipseq/Snakefile | 1 + workflows/chipseq/config/sampletable.tsv | 18 +++++++++--------- 2 files changed, 10 insertions(+), 9 deletions(-) diff --git a/workflows/chipseq/Snakefile b/workflows/chipseq/Snakefile index 83386e82..f54dddcc 100644 
--- a/workflows/chipseq/Snakefile +++ b/workflows/chipseq/Snakefile @@ -14,6 +14,7 @@ configfile: "config/config.yaml" utils.chipseq_preflight(config) sampletable = pd.read_table(config["sampletable"], sep="\t", comment="#") sampletable = sampletable.set_index(sampletable.columns[0], drop=False) +sampletable["label"] = sampletable["label"].fillna(sampletable.iloc[:, 0]) is_paired = utils.detect_layout(sampletable) == "PE" n = ["1", "2"] if is_paired else ["1"] SAMPLES = sampletable.iloc[:, 0].values diff --git a/workflows/chipseq/config/sampletable.tsv b/workflows/chipseq/config/sampletable.tsv index 05212460..bb7e7831 100644 --- a/workflows/chipseq/config/sampletable.tsv +++ b/workflows/chipseq/config/sampletable.tsv @@ -1,11 +1,11 @@ # Samplenames with the same "label" will be considered technical replicates -samplename antibody biological_material replicate label orig_filename -input_1 input wingdisc-1 1 input-wingdisc-1 data/example_data/chipseq_input1.fq.gz -input_2 input wingdisc-2 2 input-wingdisc-2 data/example_data/chipseq_input2.fq.gz -ip_1 gaf wingdisc-1 1 gaf-wingdisc-1 data/example_data/chipseq_ip1.fq.gz -ip_2 gaf wingdisc-2 2 gaf-wingdisc-2 data/example_data/chipseq_ip2.fq.gz - +samplename label antibody biological_material replicate orig_filename +input-wingdisc-1 input wingdisc-1 1 data/example_data/chipseq_input1.fq.gz +input-wingdisc-2 input wingdisc-2 2 data/example_data/chipseq_input2.fq.gz +gaf-wingdisc-1 gaf wingdisc-1 1 data/example_data/chipseq_ip1.fq.gz +gaf-wingdisc-2 gaf wingdisc-2 2 data/example_data/chipseq_ip2.fq.gz + # Note here we are treating ip_3 and ip_4 as technical replicates for the sake of testing -ip_3 gaf embryo-1 1 gaf-embryo-1 data/example_data/chipseq_ip3.fq.gz -ip_4 gaf embryo-1 1 gaf-embryo-1 data/example_data/chipseq_ip4.fq.gz -input_3 input embryo-1 1 input-embryo-1 data/example_data/chipseq_input3.fq.gz +ip_3 gaf-embryo-1 gaf embryo-1 1 data/example_data/chipseq_ip3.fq.gz +ip_4 gaf-embryo-1 gaf embryo-1 1 
data/example_data/chipseq_ip4.fq.gz +input-embryo-1 input embryo-1 1 data/example_data/chipseq_input3.fq.gz From a73eb63e85850a45635042a885334bf9d56bcfd1 Mon Sep 17 00:00:00 2001 From: Ryan Dale <115406+daler@users.noreply.github.com> Date: Tue, 4 Nov 2025 21:27:18 +0000 Subject: [PATCH 173/196] rm plotfingerprint --- workflows/chipseq/Snakefile | 66 +------------------------------------ 1 file changed, 1 insertion(+), 65 deletions(-) diff --git a/workflows/chipseq/Snakefile b/workflows/chipseq/Snakefile index f54dddcc..d3568b58 100644 --- a/workflows/chipseq/Snakefile +++ b/workflows/chipseq/Snakefile @@ -421,61 +421,6 @@ rule bigwig: "&> {log}" -rule fingerprint: - input: - bams=lambda wc: expand( - "data/chipseq_merged/{label}/{label}.cutadapt.unique.nodups.merged.bam", - label=wc.ip_label, - ), - control=lambda wc: expand( - "data/chipseq_merged/{label}/{label}.cutadapt.unique.nodups.merged.bam", - label=chipseq.merged_input_for_ip(sampletable, wc.ip_label), - ), - bais=lambda wc: expand( - "data/chipseq_merged/{label}/{label}.cutadapt.unique.nodups.merged.bam.bai", - label=wc.ip_label, - ), - control_bais=lambda wc: expand( - "data/chipseq_merged/{label}/{label}.cutadapt.unique.nodups.merged.bam.bai", - label=chipseq.merged_input_for_ip(sampletable, wc.ip_label), - ), - output: - plot="data/chipseq_aggregation/fingerprints/{ip_label}/{ip_label}_fingerprint.png", - raw_counts="data/chipseq_aggregation/fingerprints/{ip_label}/{ip_label}_fingerprint.tab", - metrics="data/chipseq_aggregation/fingerprints/{ip_label}/{ip_label}_fingerprint.metrics", - threads: 8 - log: - "data/chipseq_aggregation/fingerprints/{ip_label}/{ip_label}_fingerprint.metrics.log", - threads: 1 - resources: - mem="32g", - runtime="2h", - run: - if len(input.control) == 0: - jsdsample_arg = "" - else: - jsdsample_arg = "--JSDsample " + str(input.control) - shell( - "plotFingerprint " - "--bamfiles {input.bams} " - "-p {threads} " - # The JSDsample argument is disabled for testing as it 
dramatically - # increases the run time. - "{jsdsample_arg} " # [disable for test] - "--outQualityMetrics {output.metrics} " - "--outRawCounts {output.raw_counts} " - "--plotFile {output.plot} " - # Default is 500k; use fewer to speed up testing: - # '--numberOfSamples 50 ' # [enable for test] - "--smartLabels " - "--extendReads=300 " - "--skipZeros " - "&> {log} " - '&& sed -i "s/NA/0.0/g" {output.metrics} ' - ) - - - rule macs: input: ip=lambda wc: expand( @@ -664,16 +609,7 @@ rule multiqc: expand(rules.samtools_idxstats.output, sample=SAMPLES), expand(rules.bigwig.output, label=sampletable.label), expand(rules.merge_techreps.output, label=sampletable.label), - expand( - "data/chipseq_aggregation/fingerprints/{ip_label}/{ip_label}_fingerprint.metrics", - ip_label=sampletable.loc[sampletable.antibody != "input", "label"], - ), - expand( - "data/chipseq_samples/{sample}/{sample}.cutadapt.unique.nodups.bam.collectinsertsizemetrics.metrics", - sample=SAMPLES, - ) - if is_paired - else [], + expand(rules.collectinsertsizemetrics.output.metric, sample=SAMPLES) if is_paired else [], [v["bigbed"] for v in peaks.values()], config="config/multiqc_config.yaml", output: From 383fb8b521e99a66417398a88069cd8b3cf297e5 Mon Sep 17 00:00:00 2001 From: Ryan Dale <115406+daler@users.noreply.github.com> Date: Tue, 4 Nov 2025 22:42:34 -0500 Subject: [PATCH 174/196] clean up with prepare_*_sampletable functions --- lib/utils.py | 17 +++++++++++++++++ workflows/chipseq/Snakefile | 11 ++++------- workflows/rnaseq/Snakefile | 5 +---- 3 files changed, 22 insertions(+), 11 deletions(-) diff --git a/lib/utils.py b/lib/utils.py index 202cddd3..c4225f74 100644 --- a/lib/utils.py +++ b/lib/utils.py @@ -1299,4 +1299,21 @@ def unlist_dict(d): writer.writerow(unlist_dict(row)) +def prepare_chipseq_sampletable(config): + chipseq_preflight(config) + sampletable_fn = config.get("sampletable", "config/sampletable.tsv") + sampletable = pd.read_table(sampletable_fn, sep="\t", comment="#") + sampletable 
= sampletable.set_index(sampletable.columns[0], drop=False) + sampletable["label"] = sampletable["label"].fillna(sampletable.iloc[:, 0]) + return sampletable + + +def prepare_rnaseq_sampletable(config): + rnaseq_preflight(config) + sampletable_fn = config.get("sampletable", "config/sampletable.tsv") + sampletable = pd.read_table(sampletable_fn, sep="\t", comment="#") + sampletable = sampletable.set_index(sampletable.columns[0], drop=False) + return sampletable + + # vim: ft=python diff --git a/workflows/chipseq/Snakefile b/workflows/chipseq/Snakefile index d3568b58..e6b01bb8 100644 --- a/workflows/chipseq/Snakefile +++ b/workflows/chipseq/Snakefile @@ -11,13 +11,10 @@ from lib import chipseq configfile: "config/config.yaml" -utils.chipseq_preflight(config) -sampletable = pd.read_table(config["sampletable"], sep="\t", comment="#") -sampletable = sampletable.set_index(sampletable.columns[0], drop=False) -sampletable["label"] = sampletable["label"].fillna(sampletable.iloc[:, 0]) +sampletable = utils.prepare_chipseq_sampletable(config) is_paired = utils.detect_layout(sampletable) == "PE" n = ["1", "2"] if is_paired else ["1"] -SAMPLES = sampletable.iloc[:, 0].values +SAMPLES = sampletable.index.values LABELS = sampletable.label.values REFERENCES = config.get("references", "references") peaks = chipseq.add_bams_to_peak_calling(config) @@ -607,8 +604,8 @@ rule multiqc: expand(rules.samtools_stats.output, sample=SAMPLES), expand(rules.samtools_flagstat.output, sample=SAMPLES), expand(rules.samtools_idxstats.output, sample=SAMPLES), - expand(rules.bigwig.output, label=sampletable.label), - expand(rules.merge_techreps.output, label=sampletable.label), + expand(rules.bigwig.output, label=LABELS), + expand(rules.merge_techreps.output, label=LABELS), expand(rules.collectinsertsizemetrics.output.metric, sample=SAMPLES) if is_paired else [], [v["bigbed"] for v in peaks.values()], config="config/multiqc_config.yaml", diff --git a/workflows/rnaseq/Snakefile 
b/workflows/rnaseq/Snakefile index 6c776dde..5c22fda8 100644 --- a/workflows/rnaseq/Snakefile +++ b/workflows/rnaseq/Snakefile @@ -9,10 +9,7 @@ from lib import utils configfile: "config/config.yaml" - -utils.rnaseq_preflight(config) -sampletable = pd.read_table(config["sampletable"], sep="\t", comment="#") -sampletable = sampletable.set_index(sampletable.columns[0], drop=False) +sampletable = utils.prepare_rnaseq_sampletable(config) is_paired = utils.detect_layout(sampletable) == "PE" n = ["1", "2"] if is_paired else ["1"] SAMPLES = sampletable.index From 350f338d1015980921f6a5664578551d70bc5485 Mon Sep 17 00:00:00 2001 From: Ryan Dale <115406+daler@users.noreply.github.com> Date: Tue, 4 Nov 2025 22:42:56 -0500 Subject: [PATCH 175/196] substantial cleanup in utils --- lib/utils.py | 502 +-------------------------------------------------- 1 file changed, 6 insertions(+), 496 deletions(-) diff --git a/lib/utils.py b/lib/utils.py index c4225f74..17614537 100644 --- a/lib/utils.py +++ b/lib/utils.py @@ -1,34 +1,16 @@ import binascii -import collections -import contextlib import csv import gzip import os -import re import subprocess -import sys import warnings from collections.abc import Iterable -from itertools import product import gffutils -import pandas import pandas as pd -import yaml from Bio import SeqIO -from snakemake.io import expand, regex_from_filepattern from snakemake.shell import shell -# Small helper functions - - -def render_r1_r2(pattern): - return expand(pattern, sample="{sample}", n=c.n) - - -def render_r1_only(pattern): - return expand(pattern, sample="{sample}", n=1) - def resolve_name(name): """ @@ -54,22 +36,6 @@ def resolve_name(name): return obj -@contextlib.contextmanager -def temp_env(env): - """ - Context manager to temporarily set os.environ. 
- """ - env = dict(env) - orig = os.environ.copy() - _env = {k: str(v) for k, v in env.items()} - os.environ.update(_env) - try: - yield - finally: - os.environ.clear() - os.environ.update(orig) - - def flatten(iter, unlist=False): """ Flatten an arbitrarily nested iterable whose innermost items are strings @@ -121,110 +87,6 @@ def test_flatten(): assert flatten(["a"]) == ["a"] -def updatecopy(orig, update_with, keys=None, override=False): - """ - Update a copy of a dictionary, with a bit more control than the built-in - dict.update. - - Parameters - ----------- - - orig : dict - Dict to update - - update_with : dict - Dict with new values - - keys : list or None - If not None, then only consider these keys in `update_with`. Otherwise - consider all. - - override : bool - If True, then this is similar to `dict.update`, except only those keys - in `keys` will be considered. If False (default), then if a key exists - in both `orig` and `update_with`, no updating will occur so `orig` will - retain its original value. - """ - d = orig.copy() - if keys is None: - keys = update_with.keys() - for k in keys: - if k in update_with: - if k in d and not override: - continue - d[k] = update_with[k] - return d - - -def update_recursive(orig, update_with): - """ - Recursively update one dict with another. 
- - From https://stackoverflow.com/a/3233356 - - >>> orig = {'a': {'b': 1, 'c': 2, 'd': [7, 8, 9]}} - >>> update_with = {'a': {'b': 5}} - >>> expected = {'a': {'b': 5, 'c': 2, 'd': [7, 8, 9]}} - >>> result = update_recursive(orig, update_with) - >>> assert result == expected, result - - >>> update_with = {'a': {'d': 1}} - >>> result = update_recursive(orig, update_with) - >>> expected = {'a': {'b': 5, 'c': 2, 'd': 1}} - >>> result = update_recursive(orig, update_with) - >>> assert result == expected, result - """ - for k, v in update_with.items(): - if isinstance(v, collections.abc.Mapping): - orig[k] = update_recursive(orig.get(k, {}), v) - else: - orig[k] = v - return orig - - -def boolean_labels(names, idx, mapping={True: "AND", False: "NOT"}, strip="AND_"): - """ - Creates labels for boolean lists. - - For example: - - >>> names = ['exp1', 'exp2', 'exp3'] - >>> idx = [True, True, False] - >>> boolean_labels(names, idx) - 'exp1_AND_exp2_NOT_exp3' - - Parameters - ---------- - - names : list - List of names to include in output - - idx : list - List of booleans, same size as `names` - - mapping : dict - Linking words to use for True and False - - strip : str - Strip this text off the beginning of labels. - - given a list of names and a same-size boolean, return strings like - - a_NOT_b_AND_c - - or - - a_AND_b_AND_c_NOT_d_AND_e - """ - s = [] - for n, x in zip(names, idx): - s.append(mapping[x] + "_" + n) - s = "_".join(s) - if s.startswith(strip): - s = s.replace(strip, "", 1) - return s - - def make_relative_symlink(target, linkname): """ Helper function to create a relative symlink. @@ -240,33 +102,6 @@ def make_relative_symlink(target, linkname): shell(f"cd {linkdir}; ln -sf {relative_target} {linkbase}") -def extract_wildcards(pattern, target): - """ - Return a dictionary of wildcards and values identified from `target`. - - Returns None if the regex match failed. - - Parameters - ---------- - pattern : str - Snakemake-style filename pattern, e.g. 
``{output}/{sample}.bam``. - - target : str - Filename from which to extract wildcards, e.g., ``data/a.bam``. - - Examples - -------- - >>> pattern = '{output}/{sample}.bam' - >>> target = 'data/a.bam' - >>> expected = {'output': 'data', 'sample': 'a'} - >>> assert extract_wildcards(pattern, target) == expected - >>> assert extract_wildcards(pattern, 'asdf') is None - """ - m = re.compile(regex_from_filepattern(pattern)).match(target) - if m: - return m.groupdict() - - def is_gzipped(fn): """ Filename-independent method of checking if a file is gzipped or not. Uses @@ -299,16 +134,6 @@ def gzipped(tmpfiles, outfile): fout.write(line) -def cat(tmpfiles, outfile): - """ - Simple concatenation of files. - - Note that gzipped files can be concatenated as-is without un- and re- - compressing. - """ - shell(f"cat {tmpfiles} > {outfile}") - - def is_paired_end(sampletable, sample): """ Inspects the sampletable to see if the sample is paired-end or not @@ -316,9 +141,12 @@ def is_paired_end(sampletable, sample): Parameters ---------- sampletable : pandas.DataFrame - Contains a "layout" or "LibraryLayout" column (but not both). If the - lowercase value is "pe" or "paired", consider the sample paired-end. - Otherwise consider single-end. + If SRA sampletable, contains a "layout" or "LibraryLayout" column (but + not both). If the lowercase value is "pe" or "paired", consider the + sample paired-end. Otherwise consider single-end. + + Otherwise, if there's an "orig_filename_R2" column consider it + paired-end, otherwise single-end. sample : str Assumed to be found in the first column of `sampletable` @@ -358,46 +186,6 @@ def is_paired_end(sampletable, sample): return False -def fill_r1_r2(sampletable, pattern, r1_only=False): - """ - Returns a function intended to be used as a rule's input function. - - The returned function, when provided with wildcards, will return one or two - rendered versions of a pattern depending on SE or PE respectively. 
- Specifically, given a pattern (which is expected to contain a placeholder - for "{sample}" and "{n}"), look up in the sampletable whether or not it is - paired-end. - - Parameters - ---------- - - sampletable : pandas.DataFrame - Contains a "layout" column with either "SE" or "PE", or "LibraryLayout" - column with "SINGLE" or "PAIRED". If column does not exist, assume SE. - - pattern : str - Must contain at least a "{sample}" placeholder. - - r1_only : bool - If True, then only return the file for R1 even if PE is configured. - """ - - def func(wc): - try: - wc.sample - except AttributeError: - raise ValueError( - 'Need "{{sample}}" in pattern ' '"{pattern}"'.format(pattern=pattern) - ) - n = [1] - if is_paired_end(sampletable, wc.sample) and not r1_only: - n = [1, 2] - res = expand(pattern, sample=wc.sample, n=n) - return res - - return func - - def pluck(obj, kv): """ For a given dict or list that somewhere contains keys `kv`, return the @@ -418,136 +206,6 @@ def pluck(obj, kv): yield x -# Functions for conveniently working with resources - - -def autobump(*args, **kwargs): - """ - Used to automatically bump resources depending on how many times the job - was attempted. This will return a function that is appropriate to use for - an entry in Snakemake's `resources:` directive:: - - rule example: - input: "a.txt" - resources: - mem_mb=autobump(gb=10), - runtime=autobump(hours=2, increment_hours=10) - - Values can be specified in multiple ways. - - A single number will be provided as the resource, and will be used to - increment each time. For example, this is the equivalent of 10 GB for the - first attempt, and 20 GB for the second: - - >>> f = autobump(1024 * 10) - >>> f(None, 1) - 10240 - - Adding a second unnamed argument will use it as a value to increment by for - each subsequent attempt. This will use 10 GB for the first attempt, and 110 - GB for the second attempt. 
- - >>> f = autobump(1024 * 10, 1024 * 100) - >>> f(None, 1) - 10240 - - >>> f(None, 2) - 112640 - - Instead of bare numbers, keyword arguments can be used for more convenient - specification of units. The above two examples can also take this form: - - >>> f = autobump(gb=10) - >>> f(None, 1) - 10240 - - >>> f = autobump(gb=10, increment_gb=100) - >>> f(None, 2) - 112640 - - - Units can be minutes, hours, days, mb, gb, or tb. For example: - - >>> f = autobump(hours=2, increment_hours=5) - >>> f(None, 2) - 420 - - """ - multiplier = { - "mb": 1, - "minutes": 1, - "gb": 1024, - "hours": 60, - "days": 1440, - "tb": 1024 * 1024, - } - units = list(multiplier.keys()) - - if args and kwargs: - raise ValueError( - "Mixture of unnamed and keyword arguments not supported with autobump()" - ) - - if len(kwargs) > 2: - raise ValueError("Only 2 kwargs allowed for autobump()") - - elif len(args) == 1 and not kwargs: - baseline_converted = args[0] - increment_converted = baseline_converted - - elif len(args) == 2 and not kwargs: - baseline_converted, increment_converted = args - - elif len(kwargs) <= 2: - baseline_kwargs = [k for k in kwargs.keys() if k in units] - if len(baseline_kwargs) != 1: - raise ValueError( - "Multiple baseline kwargs found. Do you need to change one to have an 'increment_' prefix?" 
- ) - - baseline_kwarg = baseline_kwargs[0] - baseline_value = kwargs[baseline_kwarg] - baseline_unit = baseline_kwarg - - increment_kwargs = [k for k in kwargs if k.startswith("increment_")] - if increment_kwargs: - assert len(increment_kwargs) == 1 - increment_kwarg = increment_kwargs[0] - increment_value = kwargs[increment_kwarg] - increment_unit = increment_kwarg.split("_")[-1] - else: - increment_value = baseline_value - increment_unit = baseline_unit - - if baseline_unit not in multiplier: - raise ValueError( - f"Baseline unit {baseline_unit} not in valid units {units}" - ) - if increment_unit not in multiplier: - raise ValueError( - f"Increment unit {increment_unit} not in valid units {units}" - ) - - baseline_converted = baseline_value * multiplier[baseline_unit] - increment_converted = increment_value * multiplier[increment_unit] - - else: - raise ValueError(f"Unhandled args and kwargs: {args}, {kwargs}") - - def f(wildcards, attempt): - return baseline_converted + (attempt - 1) * increment_converted - - return f - - -def gb(size_in_gb): - return 1024 * size_in_gb - - -def hours(time_in_hours): - return time_in_hours * 60 - - # Config parsing and handling @@ -576,96 +234,6 @@ def detect_layout(sampletable): raise ValueError(f"Only a single layout (SE or PE) is supported. {report_}") -def fill_patterns(patterns, fill, combination=product): - """ - Fills in a dictionary of patterns with the dictionary `fill`. - - >>> patterns = dict(a='{sample}_R{N}.fastq') - >>> fill = dict(sample=['one', 'two', 'three'], N=[1, 2]) - >>> sorted(fill_patterns(patterns, fill)['a']) - ['one_R1.fastq', 'one_R2.fastq', 'three_R1.fastq', 'three_R2.fastq', 'two_R1.fastq', 'two_R2.fastq'] - - If using `zip` as a combination, checks to ensure all values in `fill` are - the same length to avoid truncated output. 
- - This fails: - - >>> patterns = dict(a='{sample}_R{N}.fastq') - >>> fill = dict(sample=['one', 'two', 'three'], N=[1, 2]) - >>> sorted(fill_patterns(patterns, fill, zip)['a']) # doctest: +IGNORE_EXCEPTION_DETAIL - Traceback (most recent call last): - ... - ValueError: {'sample': ['one', 'two', 'three'], 'N': [1, 2]} does not have the same number of entries for each key - - But this works: - - >>> patterns = dict(a='{sample}_R{N}.fastq') - >>> fill = dict(sample=['one', 'one', 'two', 'two', 'three', 'three'], N=[1, 2, 1, 2, 1, 2]) - >>> sorted(fill_patterns(patterns, fill, zip)['a']) - ['one_R1.fastq', 'one_R2.fastq', 'three_R1.fastq', 'three_R2.fastq', 'two_R1.fastq', 'two_R2.fastq'] - - """ - # In recent Snakemake versions (e.g., this happens in 5.4.5) file patterns - # with no wildcards in them are removed from expand when `zip` is used as - # the combination function. - # - # For example, in 5.4.5: - # - # expand('x', zip, d=[1,2,3]) == [] - # - # But in 4.4.0: - # - # expand('x', zip, d=[1,2,3]) == ['x', 'x', 'x'] - - if combination == zip: - lengths = set([len(v) for v in fill.values()]) - if len(lengths) != 1: - raise ValueError( - f"{fill} does not have the same number of entries for each key" - ) - - def update(d, u, c): - for k, v in u.items(): - if isinstance(v, collections.abc.Mapping): - r = update(d.get(k, {}), v, c) - d[k] = r - else: # not a dictionary, so we're at a leaf - if isinstance(fill, pd.DataFrame): - d[k] = list(set(expand(u[k], zip, **fill.to_dict("list")))) - else: - d[k] = list(set(expand(u[k], c, **fill))) - if not d[k]: - d[k] = [u[k]] - return d - - d = {} - return update(d, patterns, combination) - - -def rscript(string, scriptname, log=None): - """ - Saves the string as `scriptname` and then runs it - - Parameters - ---------- - string : str - Filled-in template to be written as R script - - scriptname : str - File to save script to - - log : str - File to redirect stdout and stderr to. If None, no redirection occurs. 
- """ - with open(scriptname, "w") as fout: - fout.write(string) - if log: - _log = "> {0} 2>&1".format(log) - else: - _log = "" - shell("Rscript {scriptname} {_log}") - - def check_unique_fn(df): """ Raises an error if the fastq filenames are not unique @@ -722,23 +290,6 @@ def chipseq_preflight(config): config["peaks"] = [] -def strand_arg_lookup(config, lookup): - """ - Given a config object and lookup dictionary, confirm that the config has - correctly specified strandedness and then return the value for that key. - """ - if not config.stranded: - raise ConfigurationError( - "Starting in v1.8, 'stranded' is required in the config file. " - "Values can be 'unstranded', 'fr-firststrand' (R1 aligns antisense to original transcript), " - "or 'fr-secondstrand' (R1 aligns sense to original transcript)." - ) - if config.stranded not in lookup: - keys = list(lookup.keys()) - raise KeyError(f"'{config.stranded}' not one of {keys}") - return lookup[config.stranded] - - def filter_rrna_fastas(tmpfiles, outfile, pattern): """ Extract records from fasta file(s) given a search pattern. @@ -1012,24 +563,6 @@ def _default(origfn, newfn): raise ValueError(f"{outfile} does not appear to be gzipped.") -def get_sampletable(config): - """ - Return samples and pandas.DataFrame of parsed sampletable. - - Returns the sample IDs and the parsed sampletable from the file specified - in the config. - - The sample IDs are assumed to be the first column of the sampletable. - - Parameters - ---------- - config : dict - """ - sampletable = pandas.read_csv(config["sampletable"], comment="#", sep="\t") - samples = sampletable.iloc[:, 0] - return samples, sampletable - - def get_techreps(sampletable, label): """ Return all sample IDs for which the "label" column is `label`. @@ -1176,25 +709,6 @@ def check_urls(config, verbose=False): ) -def check_all_urls_found(verbose=True): - """ - Recursively loads all references that can be included and checks them. - Reports out if there are any failures. 
- """ - check_urls( - { - "include_references": [ - "include/reference_configs", - "test/test_configs", - "workflows/rnaseq/config", - "workflows/chipseq/config", - "workflows/references/config", - ] - }, - verbose=verbose, - ) - - def gff2gtf(gff, gtf): """ Converts a gff file to a gtf format using the gffread function from Cufflinks @@ -1205,10 +719,6 @@ def gff2gtf(gff, gtf): shell("gffread {gff} -T -o- | gzip -c > {gtf}") -def wrapper_for(path): - return "file:" + os.path.join("../..", "wrappers", "wrappers", path) - - def detect_sra(sampletable): return "Run" in sampletable.columns and any( sampletable["Run"].str.startswith("SRR") From b2370d2e823025d1908df436726336258a04be46 Mon Sep 17 00:00:00 2001 From: Ryan Dale <115406+daler@users.noreply.github.com> Date: Tue, 4 Nov 2025 22:45:17 -0500 Subject: [PATCH 176/196] update decision log: - clearer chipseq config - remove autobump - cleanup utils - rm plotfingerprint - techreps - pep consideration --- docs/decisions.rst | 118 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 118 insertions(+) diff --git a/docs/decisions.rst b/docs/decisions.rst index 4de9189f..86aa71f7 100644 --- a/docs/decisions.rst +++ b/docs/decisions.rst @@ -641,4 +641,122 @@ are available but the top-level is actually primary (rat, Ensembl); A GTF might not be available (pombe, Ensembl); or only a toplevel assembly is available and we need to remove the haplotypes and alt loci out (hg19, Ensembl). +.. _decisions-sample-specific-params: + +Lack of sample-specific parameters +---------------------------------- + +Currently if we have samples with different library preps that need different +arguments for cutadapt, then they need to be split into two separate workflow +directories. Supporting sample-specific parameters would certainly be possible, +but the addtional complexity this would impose would go against the goal of +reducing complexity. 
For example, we'd need a location to store multiple sets +of parameters (probably in the config file) and a mechanism to retrieve them +based on sample names. This could be an additional column in the sampletable +indicating "parameter sets", which could be used as a lookup in a ``params:`` +directive lookup function. + +Again, this would be possible, but it is a deliberate design choice to opt for +a simpler approach, which is to use multiple workflow directories and edit the +respective Snakefiles appropriately. In cases where samples across the split +workflows need to be compared or considered together, an additional workflow +can be introduced to aggregate their output. + +PEP support +----------- +Support for `Portable Encapsulated Projects +`__ is built into Snakemake. Using +a combination of PEP config files, sample tables, and subsample tables, it is +possible to set up the workflows to use PEP in such a way that it can be +backwards-compatible with prior lcdb-wf versions. Specifically, by providing +TSV sampletables, forcing a sample column name, and populating the table with +subsamples. It would be convenient to offload the complexity of handling +technical replicate configuration to a third-party package. + +However, getting technical replicates to work correctly proved to be tricky, +due to the way they come in as lists in the resulting dataframe with PEP. While +it would be possible to fix this, some initial experimentation with this +suggested that it would actually be more complex to do that, so deferring to +another package did not result in a net gain in convenience or in complexity +reduction. + +PEP configs are not ruled out completely, but we might need a rewiring and +possible rewriting of the ChIP-seq (and possibly RNA-seq) workflows to fully +support PEP subsamples. I don't consider that effort to be worth it right now, +especially because the current config system already supports technical +replicates. 
+ +Technical replicates +-------------------- +In practice, it's not uncommon for something to go wrong in library prep or +sequencing such that it makes sense to re-do a library. Typically, if it's just +resequencing the same library (perhaps after rebalancing the multiplexing), we +consider that a technical replicate. + +The conventional method for handling technical replicates in RNA-seq is to sum +the counts. That is, we take the Salmon or featureCounts files, where technical +replicates are quantified separately, and sum them after import into R. This +allows us to check QC on individual tech reps e.g. to see if they worked. If we +merged at an early stage (like cocatting the FASTQs), then we would not be able +to check QC separately. + +For ChIP-seq, the conventional method is to merge BAM files. However, we still +want to keep observability of individual technical replicates where possible, +which includes inspecting duplicates. However, when we merge BAMs of technical +replicates that each had duplicates removed, it's possible that we're +introducing additional duplicates. So we do another round of duplicate removal +after merging. + +The end result of all of this is that we get MultiQC output for all of the +technical replicates separately. For ChIP-seq, the post-merging files are +bigWigs and merged-and-deduped BAMs. Currently these do not have separate +entries in MultiQC. + +Removing built-in support for plotFingerprint +--------------------------------------------- + +deepTools' `plotFingerprint +`__ +needs matched input to each antibody. Previously, we configured this in the +sampletable with the combination of "biological_material" and "antibody" +columns. Samples with exactly "input" as the antibody were the matched control +for the non-input samples with the same biological material. + +This ended up being a little complicated because "biological material" is easily +confused with "biological replicate". 
And now with common CUT&RUN and CUT&Tag +assays that use IgG as control, "IgG" and "control" should probably be aliases +for "input". + +It turns out the "biological_material" column was only ever used for the +plotFingerprint rule. It introduced complexity (in code, configuration, +documentation, and user support) for a single rule. In addition, in practice we +ended up visualizing the bigWigs rather than relying exclusively on the +plotFingerprint metrics. So to reduce complexity, plotFingerprint support is +being removed. + +Clearer ChIP-seq config +----------------------- + +For "label", it was not clear that it was the merged name. And even if there +were no technical replicates in an experiment, it still needed to be filled out +with copies of the sample name. + +Now, ``merged_label`` is an alias for ``label``. If the column is missing +entirely, or if the value is empty for a row, then the samplename will be used +automatically. + +Removal of autobump +------------------- +For several versions, resources were wrapped with the ``autobump()`` function, +which would automatically retry jobs with more resources if they failed. Turns +out this wasn't as helpful as expected, because errors (like syntax errors or +other mistakes) ended up being a lot more frequent than exceeding resources. +This resulted in escalating resource allocations and longer run time with no +need. So the autobump was removed. + +Cleanup of lib/utils.py +----------------------- +We had accumulated a lot of useful functions over time, but things have changed +enough that they haven't been used. To avoid clutter and additional maintenance +burden in supporting otherwise unused code, these functions were removed. 
From 04d5d695f6a5e470c042671a2adb5a1cba2a67c5 Mon Sep 17 00:00:00 2001 From: Ryan Dale <115406+daler@users.noreply.github.com> Date: Tue, 4 Nov 2025 22:50:04 -0500 Subject: [PATCH 177/196] trackhub scripts use new utils functions --- workflows/chipseq/chipseq_trackhub.py | 4 ++-- workflows/rnaseq/rnaseq_trackhub.py | 6 +++++- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/workflows/chipseq/chipseq_trackhub.py b/workflows/chipseq/chipseq_trackhub.py index 4e520be2..9d7ba3eb 100644 --- a/workflows/chipseq/chipseq_trackhub.py +++ b/workflows/chipseq/chipseq_trackhub.py @@ -24,7 +24,7 @@ from trackhub.helpers import filter_composite_from_subgroups, dimensions_from_subgroups, hex2rgb from trackhub.upload import upload_hub, stage_hub -from lib import chipseq +from lib import chipseq, utils ap = argparse.ArgumentParser() ap.add_argument('config', help='Main config.yaml file') @@ -55,7 +55,7 @@ ) # Set up subgroups based on unique values from columns specified in the config -df = pandas.read_csv(config['sampletable'], comment='#', sep='\t') +df = utils.prepare_chipseq_sampletable(config) cols = hub_config['subgroups']['columns'] subgroups = [] for col in cols: diff --git a/workflows/rnaseq/rnaseq_trackhub.py b/workflows/rnaseq/rnaseq_trackhub.py index 6fe17f80..d6bb8cf4 100644 --- a/workflows/rnaseq/rnaseq_trackhub.py +++ b/workflows/rnaseq/rnaseq_trackhub.py @@ -9,6 +9,7 @@ """ import os +import sys import re from pprint import pprint import pandas @@ -20,6 +21,9 @@ from trackhub.upload import upload_hub, stage_hub import argparse +sys.path.insert(0, os.path.dirname(__file__) + "/../..") +from lib import utils + ap = argparse.ArgumentParser() ap.add_argument('config', help='Main config.yaml file') ap.add_argument('hub_config', help='Track hub config YAML file') @@ -47,7 +51,7 @@ ) # Set up subgroups based on the configured columns -df = pandas.read_csv(config['sampletable'], comment='#', sep='\t') +df = utils.prepare_rnaseq_sampletable(config) 
cols = hub_config['subgroups']['columns'] subgroups = [] for col in cols: From 8951b570dd970a7417ced6ce8ec6fe9d87c435fc Mon Sep 17 00:00:00 2001 From: Ryan Dale <115406+daler@users.noreply.github.com> Date: Wed, 5 Nov 2025 13:05:41 +0000 Subject: [PATCH 178/196] rm url check --- .circleci/config.yml | 3 --- 1 file changed, 3 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 3899fa03..60fa6a54 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -168,9 +168,6 @@ variables: # Ensure that the chunks in rnaseq.Rmd have matching documentation (cd ci && ./ensure_docs.py) - # find all URLs in reference configs and make sure they exist - python -c "import sys; sys.path.insert(0, '$DEST'); from lib.utils import check_all_urls_found; check_all_urls_found()" - # run R package unit tests using the R env conda activate $LCDBWF_ENV_R Rscript -e "devtools::test('lib/lcdbwf', reporter=c('summary', 'fail'), export_all=TRUE)" From b8fb369198aa8a58a68306e6a0613c4d315a0eb6 Mon Sep 17 00:00:00 2001 From: Ryan Dale <115406+daler@users.noreply.github.com> Date: Wed, 5 Nov 2025 13:15:25 +0000 Subject: [PATCH 179/196] typo --- lib/utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/utils.py b/lib/utils.py index 17614537..ca07a9fb 100644 --- a/lib/utils.py +++ b/lib/utils.py @@ -813,7 +813,7 @@ def prepare_chipseq_sampletable(config): chipseq_preflight(config) sampletable_fn = config.get("sampletable", "config/sampletable.tsv") sampletable = pd.read_table(sampletable_fn, sep="\t", comment="#") - sampletable = sampletable.set_inde(sampletable.columns[0], drop=False) + sampletable = sampletable.set_index(sampletable.columns[0], drop=False) sampletable["label"] = sampletable["label"].fillna(sampletable.iloc[:, 0]) return sampletable @@ -822,7 +822,7 @@ def prepare_rnaseq_sampletable(config): rnaseq_preflight(config) sampletable_fn = config.get("sampletable", "config/sampletable.tsv") sampletable = pd.read_table(sampletable_fn, 
sep="\t", comment="#") - sampletable = sampletable.set_inde(sampletable.columns[0], drop=False) + sampletable = sampletable.set_index(sampletable.columns[0], drop=False) return sampletable From cb97ce1c67bed1ca054b316f4030e91cf2b9a4b1 Mon Sep 17 00:00:00 2001 From: Ryan Dale <115406+daler@users.noreply.github.com> Date: Wed, 5 Nov 2025 14:15:53 +0000 Subject: [PATCH 180/196] improved sampletable handling --- lib/utils.py | 147 +++++++++++--------- workflows/rnaseq/config/sra_sampletable.csv | 20 +++ workflows/rnaseq/config/sra_sampletable.tsv | 7 - 3 files changed, 98 insertions(+), 76 deletions(-) create mode 100644 workflows/rnaseq/config/sra_sampletable.csv delete mode 100644 workflows/rnaseq/config/sra_sampletable.tsv diff --git a/lib/utils.py b/lib/utils.py index ca07a9fb..b3182567 100644 --- a/lib/utils.py +++ b/lib/utils.py @@ -156,14 +156,13 @@ def is_paired_end(sampletable, sample): # # So detect first detect if SRA sampletable based on presence of "Run" # column and all values of that column starting with "SRR", and then raise - # an error if the Layout column does not exist. + # an error if the Layout or LibraryLayout column does not exist. - if "Run" in sampletable.columns: + sra_layout_columns = ["layout", "librarylayout"] + sampletable_columns = [i.lower() for i in sampletable.columns] + if "run" in sampletable_columns: if all(sampletable["Run"].str.startswith("SRR")): - if ( - "Layout" not in sampletable.columns - and "layout" not in sampletable.columns - ): + if len(set(sra_layout_columns).intersection(sampletable_columns)) == 0: raise ValueError( "Sampletable appears to be SRA, but no 'Layout' column " "found. This is required to specify single- or paired-end " @@ -234,62 +233,6 @@ def detect_layout(sampletable): raise ValueError(f"Only a single layout (SE or PE) is supported. 
{report_}") -def check_unique_fn(df): - """ - Raises an error if the fastq filenames are not unique - """ - fns = df["orig_filename"] - if "orig_filename_R2" in df.columns: - fns = pd.concat([fns, df["orig_filename_R2"]]) - if len(fns.unique()) < len(fns): - raise ValueError("Fastq filenames non unique, check the sampletable\n") - - -def check_unique_samplename(df): - """ - Raises an error if the samplenames are not unique - """ - ns = df.index - if len(ns.unique()) < len(ns): - raise ConfigurationError("Samplenames non unique, check the sampletable\n") - - -def preflight(config): - """ - Performs verifications on config and sampletable files - - Parameters - ---------- - config: yaml config object - """ - sampletable = pd.read_table(config["sampletable"], index_col=0, comment="#") - check_unique_samplename(sampletable) - if "orig_filename" in sampletable.columns: - check_unique_fn(sampletable) - if "genome" not in config: - raise ConfigurationError("Config is missing 'genome' key") - if "url" not in config["genome"]: - raise ConfigurationError("Config is missing 'url' key for 'genome'") - - -def rnaseq_preflight(config): - preflight(config) - if "annotation" not in config: - raise ConfigurationError("Config is missing 'annotation' key") - if "url" not in config["annotation"]: - raise ConfigurationError("Config is missing 'url' key for 'annotation'") - if "stranded" not in config: - raise ConfigurationError("Config is missing 'stranded' key") - if "organism" not in config: - raise ConfigurationError("Config is missing 'organism' key") - - -def chipseq_preflight(config): - preflight(config) - if "peaks" not in config: - config["peaks"] = [] - - def filter_rrna_fastas(tmpfiles, outfile, pattern): """ Extract records from fasta file(s) given a search pattern. 
@@ -809,20 +752,86 @@ def unlist_dict(d): writer.writerow(unlist_dict(row)) -def prepare_chipseq_sampletable(config): - chipseq_preflight(config) +def preflight(config, sampletable): + """ + Performs verifications on config and sampletable files + + Parameters + ---------- + config: yaml config object + """ + + if len(sampletable) != len(sampletable.iloc[:, 0].unique()): + raise ConfigurationError("Samplenames non unique, check the sampletable") + + # For non-SRA sampletables + if "orig_filename" in sampletable.columns: + fns = sampletable["orig_filename"] + if "orig_filename_R2" in sampletable.columns: + fns = pd.concat([fns, sampletable["orig_filename_R2"]]) + if len(fns.unique()) < len(fns): + raise ValueError("Fastq filenames non unique, check the sampletable\n") + + if "genome" not in config: + raise ConfigurationError("Config is missing 'genome' key") + if "url" not in config["genome"]: + raise ConfigurationError("Config is missing 'url' key for 'genome'") + + +def rnaseq_preflight(config, sampletable): + preflight(config, sampletable) + if "annotation" not in config: + raise ConfigurationError("Config is missing 'annotation' key") + if "url" not in config["annotation"]: + raise ConfigurationError("Config is missing 'url' key for 'annotation'") + if "stranded" not in config: + raise ConfigurationError("Config is missing 'stranded' key") + if "organism" not in config: + raise ConfigurationError("Config is missing 'organism' key") + + +def chipseq_preflight(config, sampletable): + preflight(config, sampletable) + if "peaks" not in config: + config["peaks"] = [] + + +def read_sampletable(config): + """ + Given a config object, return the sampletable with the first column used as the index. + + Autodetect tsv/csv. 
+ """ sampletable_fn = config.get("sampletable", "config/sampletable.tsv") - sampletable = pd.read_table(sampletable_fn, sep="\t", comment="#") + if sampletable_fn.endswith(".tsv"): + sep = "\t" + elif sampletable_fn.endswith(".csv"): + sep = "," + else: + raise ConfigurationError( + f"Sampletable should end in .csv or .tsv to indicate format, got {sampletable_fn}" + ) + sampletable = pd.read_table(sampletable_fn, sep=sep, comment="#") sampletable = sampletable.set_index(sampletable.columns[0], drop=False) + return sampletable + + +def prepare_chipseq_sampletable(config): + """ + Given a config, return the validated and prepared ChIP-seq table. + """ + sampletable = read_sampletable(config) sampletable["label"] = sampletable["label"].fillna(sampletable.iloc[:, 0]) + chipseq_preflight(config, sampletable) return sampletable def prepare_rnaseq_sampletable(config): - rnaseq_preflight(config) - sampletable_fn = config.get("sampletable", "config/sampletable.tsv") - sampletable = pd.read_table(sampletable_fn, sep="\t", comment="#") - sampletable = sampletable.set_index(sampletable.columns[0], drop=False) + """ + Given a config, return the validated and prepared RNA-seq table. 
+ """ + sampletable = read_sampletable(config) + rnaseq_preflight(config, sampletable) return sampletable diff --git a/workflows/rnaseq/config/sra_sampletable.csv b/workflows/rnaseq/config/sra_sampletable.csv new file mode 100644 index 00000000..1ecdc3cf --- /dev/null +++ b/workflows/rnaseq/config/sra_sampletable.csv @@ -0,0 +1,20 @@ +Run,Assay Type,AvgSpotLen,Bases,BioProject,BioSample,Bytes,Center Name,Consent,DATASTORE filetype,DATASTORE provider,DATASTORE region,Experiment,GEO_Accession (exp),Instrument,LibraryLayout,LibrarySelection,LibrarySource,Organism,Platform,ReleaseDate,create_date,version,Sample Name,source_name,SRA Study,treatment,cell_type,Developmental_stage,cell_line +SRR5182696,RNA-Seq,50,720229800,PRJNA362227,SAMN06236711,456024643,GEO,public,"fastq,run.zq,sra","s3,gs,ncbi","gs.us-east1,ncbi.public,s3.us-east-1",SRX2498797,GSM2461336,Illumina HiSeq 2500,SINGLE,cDNA,TRANSCRIPTOMIC,Drosophila melanogaster,ILLUMINA,2017-11-22T00:00:00Z,2017-01-17T18:11:00Z,1,GSM2461336,"sorted live neurons\, P14 pupal\, dicer-gfp_shep-rnai",SRP096911,dicer-gfp_shep-rnai,neurons,P14 pupal, +SRR5182697,RNA-Seq,50,651467650,PRJNA362227,SAMN06236734,413453724,GEO,public,"fastq,run.zq,sra","gs,ncbi,s3","s3.us-east-1,ncbi.public,gs.us-east1",SRX2498798,GSM2461337,Illumina HiSeq 2500,SINGLE,cDNA,TRANSCRIPTOMIC,Drosophila melanogaster,ILLUMINA,2017-11-22T00:00:00Z,2017-01-17T18:11:00Z,1,GSM2461337,"sorted live neurons\, P14 pupal\, dicer-gfp_shep-rnai",SRP096911,dicer-gfp_shep-rnai,neurons,P14 pupal, +SRR5182698,RNA-Seq,50,501312400,PRJNA362227,SAMN06236733,318819526,GEO,public,"run.zq,sra,fastq","gs,ncbi,s3","gs.us-east1,ncbi.public,s3.us-east-1",SRX2498799,GSM2461338,Illumina HiSeq 2500,SINGLE,cDNA,TRANSCRIPTOMIC,Drosophila melanogaster,ILLUMINA,2017-11-22T00:00:00Z,2017-01-17T18:10:00Z,1,GSM2461338,"sorted live neurons\, P14 pupal\, dicer-gfp_shep-rnai",SRP096911,dicer-gfp_shep-rnai,neurons,P14 pupal, 
+SRR5182699,RNA-Seq,50,744291500,PRJNA362227,SAMN06236732,473503018,GEO,public,"fastq,sra,run.zq","ncbi,gs,s3","gs.us-east1,s3.us-east-1,ncbi.public",SRX2498800,GSM2461339,Illumina HiSeq 2500,SINGLE,cDNA,TRANSCRIPTOMIC,Drosophila melanogaster,ILLUMINA,2017-11-22T00:00:00Z,2017-01-17T18:11:00Z,1,GSM2461339,"sorted live neurons\, third instar larval\, dicer-gfp_shep-rnai",SRP096911,dicer-gfp_shep-rnai,neurons,third instar larval, +SRR5182700,RNA-Seq,50,607856150,PRJNA362227,SAMN06236731,386029421,GEO,public,"run.zq,fastq,sra","ncbi,gs,s3","gs.us-east1,ncbi.public,s3.us-east-1",SRX2498801,GSM2461340,Illumina HiSeq 2500,SINGLE,cDNA,TRANSCRIPTOMIC,Drosophila melanogaster,ILLUMINA,2017-11-22T00:00:00Z,2017-01-17T18:12:00Z,1,GSM2461340,"sorted live neurons\, third instar larval\, dicer-gfp_shep-rnai",SRP096911,dicer-gfp_shep-rnai,neurons,third instar larval, +SRR5182701,RNA-Seq,50,641763000,PRJNA362227,SAMN06236730,407428219,GEO,public,"run.zq,sra,fastq","ncbi,s3,gs","ncbi.public,gs.us-east1,s3.us-east-1",SRX2498802,GSM2461341,Illumina HiSeq 2500,SINGLE,cDNA,TRANSCRIPTOMIC,Drosophila melanogaster,ILLUMINA,2017-11-22T00:00:00Z,2017-01-17T18:11:00Z,1,GSM2461341,"sorted live neurons\, third instar larval\, dicer-gfp_shep-rnai",SRP096911,dicer-gfp_shep-rnai,neurons,third instar larval, +SRR5182702,RNA-Seq,50,602992350,PRJNA362227,SAMN06236729,383110310,GEO,public,"fastq,sra,run.zq","gs,s3,ncbi","s3.us-east-1,ncbi.public,gs.us-east1",SRX2498803,GSM2461342,Illumina HiSeq 2500,SINGLE,cDNA,TRANSCRIPTOMIC,Drosophila melanogaster,ILLUMINA,2017-11-22T00:00:00Z,2017-01-17T18:11:00Z,1,GSM2461342,"sorted live neurons\, P14 pupal\, dicer-gfp_x",SRP096911,dicer-gfp_x,neurons,P14 pupal, +SRR5182703,RNA-Seq,50,639787300,PRJNA362227,SAMN06236728,406192647,GEO,public,"fastq,run.zq,sra","gs,s3,ncbi","gs.us-east1,s3.us-east-1,ncbi.public",SRX2498804,GSM2461343,Illumina HiSeq 2500,SINGLE,cDNA,TRANSCRIPTOMIC,Drosophila 
melanogaster,ILLUMINA,2017-11-22T00:00:00Z,2017-01-17T18:11:00Z,1,GSM2461343,"sorted live neurons\, P14 pupal\, dicer-gfp_x",SRP096911,dicer-gfp_x,neurons,P14 pupal, +SRR5182704,RNA-Seq,50,645383100,PRJNA362227,SAMN06236727,409821107,GEO,public,"fastq,sra,run.zq","s3,gs,ncbi","gs.us-east1,s3.us-east-1,ncbi.public",SRX2498805,GSM2461344,Illumina HiSeq 2500,SINGLE,cDNA,TRANSCRIPTOMIC,Drosophila melanogaster,ILLUMINA,2017-11-22T00:00:00Z,2017-01-17T18:13:00Z,1,GSM2461344,"sorted live neurons\, P14 pupal\, dicer-gfp_x",SRP096911,dicer-gfp_x,neurons,P14 pupal, +SRR5182705,RNA-Seq,50,867006750,PRJNA362227,SAMN06236726,550448623,GEO,public,"sra,fastq,run.zq","ncbi,s3,gs","gs.us-east1,s3.us-east-1,ncbi.public",SRX2498806,GSM2461345,Illumina HiSeq 2500,SINGLE,cDNA,TRANSCRIPTOMIC,Drosophila melanogaster,ILLUMINA,2017-11-22T00:00:00Z,2017-01-17T18:12:00Z,1,GSM2461345,"sorted live neurons\, third instar larval\, dicer-gfp_x",SRP096911,dicer-gfp_x,neurons,third instar larval, +SRR5182706,RNA-Seq,50,664061850,PRJNA362227,SAMN06236725,421272040,GEO,public,"sra,run.zq,fastq","s3,gs,ncbi","s3.us-east-1,gs.us-east1,ncbi.public",SRX2498807,GSM2461346,Illumina HiSeq 2500,SINGLE,cDNA,TRANSCRIPTOMIC,Drosophila melanogaster,ILLUMINA,2017-11-22T00:00:00Z,2017-01-17T18:12:00Z,1,GSM2461346,"sorted live neurons\, third instar larval\, dicer-gfp_x",SRP096911,dicer-gfp_x,neurons,third instar larval, +SRR5182707,RNA-Seq,50,718867500,PRJNA362227,SAMN06236724,455538089,GEO,public,"fastq,sra,run.zq","s3,ncbi,gs","ncbi.public,gs.us-east1,s3.us-east-1",SRX2498808,GSM2461347,Illumina HiSeq 2500,SINGLE,cDNA,TRANSCRIPTOMIC,Drosophila melanogaster,ILLUMINA,2017-11-22T00:00:00Z,2017-01-17T18:12:00Z,1,GSM2461347,"sorted live neurons\, third instar larval\, dicer-gfp_x",SRP096911,dicer-gfp_x,neurons,third instar larval, +SRR5182708,RNA-Seq,51,313585740,PRJNA362227,SAMN06236723,192621062,GEO,public,"sra,fastq,run.zq","ncbi,s3,gs","ncbi.public,gs.us-east1,s3.us-east-1",SRX2498809,GSM2461348,Illumina HiSeq 
2500,SINGLE,cDNA,TRANSCRIPTOMIC,Drosophila melanogaster,ILLUMINA,2017-11-22T00:00:00Z,2017-01-17T18:11:00Z,1,GSM2461348,"cell culture from nervous system\, ds-GFP",SRP096911,dsRNA targeting GFP,,,BG3 +SRR5182709,RNA-Seq,51,354164145,PRJNA362227,SAMN06236722,217275323,GEO,public,"sra,run.zq,fastq","s3,gs,ncbi","ncbi.public,s3.us-east-1,gs.us-east1",SRX2498810,GSM2461349,Illumina HiSeq 2500,SINGLE,cDNA,TRANSCRIPTOMIC,Drosophila melanogaster,ILLUMINA,2017-11-22T00:00:00Z,2017-01-17T18:11:00Z,1,GSM2461349,"cell culture from nervous system\, ds-GFP",SRP096911,dsRNA targeting GFP,,,BG3 +SRR5182710,RNA-Seq,51,331996689,PRJNA362227,SAMN06236721,204104825,GEO,public,"run.zq,fastq,sra","gs,ncbi,s3","gs.us-east1,ncbi.public,s3.us-east-1",SRX2498811,GSM2461350,Illumina HiSeq 2500,SINGLE,cDNA,TRANSCRIPTOMIC,Drosophila melanogaster,ILLUMINA,2017-11-22T00:00:00Z,2017-01-17T18:11:00Z,1,GSM2461350,"cell culture from nervous system\, ds-GFP",SRP096911,dsRNA targeting GFP,,,BG3 +SRR5182711,RNA-Seq,51,484674828,PRJNA362227,SAMN06236720,298600165,GEO,public,"run.zq,fastq,sra","s3,ncbi,gs","s3.us-east-1,ncbi.public,gs.us-east1",SRX2498812,GSM2461351,Illumina HiSeq 2500,SINGLE,cDNA,TRANSCRIPTOMIC,Drosophila melanogaster,ILLUMINA,2017-11-22T00:00:00Z,2017-01-17T18:11:00Z,1,GSM2461351,"cell culture from nervous system\, ds-shep",SRP096911,dsRNA targeting shep,,,BG3 +SRR5182712,RNA-Seq,51,379084887,PRJNA362227,SAMN06236719,233433872,GEO,public,"run.zq,fastq,sra","s3,gs,ncbi","ncbi.public,gs.us-east1,s3.us-east-1",SRX2498813,GSM2461352,Illumina HiSeq 2500,SINGLE,cDNA,TRANSCRIPTOMIC,Drosophila melanogaster,ILLUMINA,2017-11-22T00:00:00Z,2017-01-17T18:11:00Z,1,GSM2461352,"cell culture from nervous system\, ds-shep",SRP096911,dsRNA targeting shep,,,BG3 +SRR5182713,RNA-Seq,51,410430405,PRJNA362227,SAMN06236718,252947684,GEO,public,"fastq,sra,run.zq","gs,s3,ncbi","s3.us-east-1,ncbi.public,gs.us-east1",SRX2498814,GSM2461353,Illumina HiSeq 2500,SINGLE,cDNA,TRANSCRIPTOMIC,Drosophila 
melanogaster,ILLUMINA,2017-11-22T00:00:00Z,2017-01-17T18:11:00Z,1,GSM2461353,"cell culture from nervous system\, ds-shep",SRP096911,dsRNA targeting shep,,,BG3 + diff --git a/workflows/rnaseq/config/sra_sampletable.tsv b/workflows/rnaseq/config/sra_sampletable.tsv deleted file mode 100644 index 3ed904c6..00000000 --- a/workflows/rnaseq/config/sra_sampletable.tsv +++ /dev/null @@ -1,7 +0,0 @@ -samplename AvgSpotLen BioSample Experiment MBases MBytes Run SRA_Sample Sample_Name developmental_stage source_name treatment Assay_Type BioProject Center_Name Consent DATASTORE_filetype DATASTORE_provider InsertSize Instrument LibraryLayout LibrarySelection LibrarySource LoadDate Organism Platform ReleaseDate SRA_Study cell_line cell_type -gfp1 51 SAMN06236723 SRX2498809 299 183 SRR5182708 SRS1925642 GSM2461348 cell culture from nervous system, ds-GFP dsRNA targeting GFP RNA-Seq PRJNA362227 GEO public sra ncbi 0 Illumina HiSeq 2500 SINGLE cDNA TRANSCRIPTOMIC 2017-01-17 Drosophila melanogaster ILLUMINA 2017-11-22 SRP096911 BG3 -gfp2 51 SAMN06236722 SRX2498810 337 207 SRR5182709 SRS1925643 GSM2461349 cell culture from nervous system, ds-GFP dsRNA targeting GFP RNA-Seq PRJNA362227 GEO public sra ncbi 0 Illumina HiSeq 2500 SINGLE cDNA TRANSCRIPTOMIC 2017-01-17 Drosophila melanogaster ILLUMINA 2017-11-22 SRP096911 BG3 -gfp3 51 SAMN06236721 SRX2498811 316 194 SRR5182710 SRS1925644 GSM2461350 cell culture from nervous system, ds-GFP dsRNA targeting GFP RNA-Seq PRJNA362227 GEO public sra ncbi 0 Illumina HiSeq 2500 SINGLE cDNA TRANSCRIPTOMIC 2017-01-17 Drosophila melanogaster ILLUMINA 2017-11-22 SRP096911 BG3 -shep1 51 SAMN06236720 SRX2498812 462 284 SRR5182711 SRS1925645 GSM2461351 cell culture from nervous system, ds-shep dsRNA targeting shep RNA-Seq PRJNA362227 GEO public sra ncbi 0 Illumina HiSeq 2500 SINGLE cDNA TRANSCRIPTOMIC 2017-01-17 Drosophila melanogaster ILLUMINA 2017-11-22 SRP096911 BG3 -shep2 51 SAMN06236719 SRX2498813 361 222 SRR5182712 SRS1925646 GSM2461352 cell culture 
from nervous system, ds-shep dsRNA targeting shep RNA-Seq PRJNA362227 GEO public sra ncbi 0 Illumina HiSeq 2500 SINGLE cDNA TRANSCRIPTOMIC 2017-01-17 Drosophila melanogaster ILLUMINA 2017-11-22 SRP096911 BG3 -shep3 51 SAMN06236718 SRX2498814 391 241 SRR5182713 SRS1925647 GSM2461353 cell culture from nervous system, ds-shep dsRNA targeting shep RNA-Seq PRJNA362227 GEO public sra ncbi 0 Illumina HiSeq 2500 SINGLE cDNA TRANSCRIPTOMIC 2017-01-17 Drosophila melanogaster ILLUMINA 2017-11-22 SRP096911 BG3 From d9c3f78acb78d7796d672150c24a6023230479ac Mon Sep 17 00:00:00 2001 From: Ryan Dale <115406+daler@users.noreply.github.com> Date: Wed, 5 Nov 2025 16:35:28 +0000 Subject: [PATCH 181/196] fix trackhub imports --- workflows/chipseq/chipseq_trackhub.py | 2 +- workflows/rnaseq/rnaseq_trackhub.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/workflows/chipseq/chipseq_trackhub.py b/workflows/chipseq/chipseq_trackhub.py index 9d7ba3eb..80637a28 100644 --- a/workflows/chipseq/chipseq_trackhub.py +++ b/workflows/chipseq/chipseq_trackhub.py @@ -11,7 +11,6 @@ import os import sys -sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', '..')) import re import argparse from pprint import pprint @@ -24,6 +23,7 @@ from trackhub.helpers import filter_composite_from_subgroups, dimensions_from_subgroups, hex2rgb from trackhub.upload import upload_hub, stage_hub +sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', '..')) from lib import chipseq, utils ap = argparse.ArgumentParser() diff --git a/workflows/rnaseq/rnaseq_trackhub.py b/workflows/rnaseq/rnaseq_trackhub.py index d6bb8cf4..92199cad 100644 --- a/workflows/rnaseq/rnaseq_trackhub.py +++ b/workflows/rnaseq/rnaseq_trackhub.py @@ -21,7 +21,7 @@ from trackhub.upload import upload_hub, stage_hub import argparse -sys.path.insert(0, os.path.dirname(workflow.snakefile) + "/../..") +sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', '..')) from lib import utils ap = 
argparse.ArgumentParser() From 8f06eac28fa86fb4c535fad822ad8fe32e53f410 Mon Sep 17 00:00:00 2001 From: Ryan Dale <115406+daler@users.noreply.github.com> Date: Wed, 5 Nov 2025 16:38:51 +0000 Subject: [PATCH 182/196] test SRA csv --- .circleci/config.yml | 2 +- test/test_configs/test_sra_sampletable.csv | 3 +++ test/test_configs/test_sra_sampletable.tsv | 3 --- 3 files changed, 4 insertions(+), 4 deletions(-) create mode 100644 test/test_configs/test_sra_sampletable.csv delete mode 100644 test/test_configs/test_sra_sampletable.tsv diff --git a/.circleci/config.yml b/.circleci/config.yml index 60fa6a54..51d81530 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -263,7 +263,7 @@ variables: # SRA test ./run_test.sh -k -p -j2 --use-conda \ --configfile $ORIG/test/test_configs/test_rnaseq_config.yaml \ - --config sampletable=$ORIG/test/test_configs/test_sra_sampletable.tsv + --config sampletable=$ORIG/test/test_configs/test_sra_sampletable.csv # SRA SE only ./run_test.sh -k -p -j2 --use-conda \ diff --git a/test/test_configs/test_sra_sampletable.csv b/test/test_configs/test_sra_sampletable.csv new file mode 100644 index 00000000..34f57090 --- /dev/null +++ b/test/test_configs/test_sra_sampletable.csv @@ -0,0 +1,3 @@ +Run,LibraryLayout +SRR948304,PAIRED +SRR948305,PAIRED diff --git a/test/test_configs/test_sra_sampletable.tsv b/test/test_configs/test_sra_sampletable.tsv deleted file mode 100644 index 0f55c436..00000000 --- a/test/test_configs/test_sra_sampletable.tsv +++ /dev/null @@ -1,3 +0,0 @@ -samplename Run layout -sra2 SRR948304 PAIRED -sra3 SRR948305 PAIRED From a2cf9d88ceca278f5944ea8874b57380c6d055b6 Mon Sep 17 00:00:00 2001 From: Ryan Dale <115406+daler@users.noreply.github.com> Date: Wed, 5 Nov 2025 16:47:39 +0000 Subject: [PATCH 183/196] df -> sampletable --- lib/utils.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/lib/utils.py b/lib/utils.py index b3182567..2535286f 100644 --- a/lib/utils.py +++ b/lib/utils.py @@ 
-766,9 +766,9 @@ def preflight(config, sampletable): # For non-SRA sampletables if "orig_filename" in sampletable.columns: - fns = df["orig_filename"] - if "orig_filename_R2" in df.columns: - fns = pd.concat([fns, df["orig_filename_R2"]]) + fns = sampletable["orig_filename"] + if "orig_filename_R2" in sampletable.columns: + fns = pd.concat([fns, sampletable["orig_filename_R2"]]) if len(fns.unique()) < len(fns): raise ValueError("Fastq filenames non unique, check the sampletable\n") From fe999e27eaeee3acc07a99a7e52a204f3a6f042a Mon Sep 17 00:00:00 2001 From: Ryan Dale <115406+daler@users.noreply.github.com> Date: Wed, 5 Nov 2025 18:01:02 +0000 Subject: [PATCH 184/196] use additional config for rnaseq trackhub --- .circleci/config.yml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 51d81530..9014546f 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -235,7 +235,9 @@ variables: ./run_test.sh --use-conda -j2 -k -p \ --configfile $ORIG/test/test_configs/test_rnaseq_config.yaml - python rnaseq_trackhub.py config/config.yaml config/hub_config.yaml + python rnaseq_trackhub.py \ + config/config.yaml config/hub_config.yaml \ + --additional-configs $ORIG/test/test_configs/test_rnaseq_config.yaml conda activate $LCDBWF_ENV_R From 23164b1fa1bc3468ed1012868eaac7b3c6a074c9 Mon Sep 17 00:00:00 2001 From: Ryan Dale <115406+daler@users.noreply.github.com> Date: Thu, 6 Nov 2025 10:24:51 -0500 Subject: [PATCH 185/196] update reference configs --- include/reference_config_templates/Homo_sapiens/GENCODE.yaml | 2 -- .../reference_config_templates/Mus_musculus/GENCODE_M25.yaml | 2 -- .../Saccharomyces_cerevisiae/S288C.yaml | 2 ++ 3 files changed, 2 insertions(+), 4 deletions(-) diff --git a/include/reference_config_templates/Homo_sapiens/GENCODE.yaml b/include/reference_config_templates/Homo_sapiens/GENCODE.yaml index 507877bb..97a48e93 100644 --- 
a/include/reference_config_templates/Homo_sapiens/GENCODE.yaml +++ b/include/reference_config_templates/Homo_sapiens/GENCODE.yaml @@ -7,8 +7,6 @@ organism: "Homo sapiens" genome: url: "https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_49/GRCh38.primary_assembly.genome.fa.gz" - postprocess: lib.postprocess.default annotation: url: "https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_49/gencode.v49.primary_assembly.annotation.gtf.gz" - postprocess: lib.postprocess.default diff --git a/include/reference_config_templates/Mus_musculus/GENCODE_M25.yaml b/include/reference_config_templates/Mus_musculus/GENCODE_M25.yaml index 99120cbf..7899df9d 100644 --- a/include/reference_config_templates/Mus_musculus/GENCODE_M25.yaml +++ b/include/reference_config_templates/Mus_musculus/GENCODE_M25.yaml @@ -7,9 +7,7 @@ organism: "Mus musculus" genome: url: "https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_mouse/release_M25/GRCm38.primary_assembly.genome.fa.gz" - postprocess: lib.postprocess.default annotation: url: "https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_mouse/release_M25/gencode.vM25.primary_assembly.annotation.gtf.gz" - postprocess: lib.postprocess.default diff --git a/include/reference_config_templates/Saccharomyces_cerevisiae/S288C.yaml b/include/reference_config_templates/Saccharomyces_cerevisiae/S288C.yaml index 4e0204d0..62e68ea5 100644 --- a/include/reference_config_templates/Saccharomyces_cerevisiae/S288C.yaml +++ b/include/reference_config_templates/Saccharomyces_cerevisiae/S288C.yaml @@ -2,6 +2,8 @@ # From Ensembl. According to README in this FTP dir, if there's no primary # assembly then the toplevel is assumed to be the primary assembly. 
+ +organism: "Saccharomyces cerevisiae" genome: url: "https://ftp.ensembl.org/pub/release-115/fasta/saccharomyces_cerevisiae/dna/Saccharomyces_cerevisiae.R64-1-1.dna_sm.toplevel.fa.gz" From 5bc2b4bc6574a1151eb4a250021b0c417c26fbcf Mon Sep 17 00:00:00 2001 From: Ryan Dale <115406+daler@users.noreply.github.com> Date: Thu, 6 Nov 2025 10:26:06 -0500 Subject: [PATCH 186/196] initial overhaul of docs --- docs/README.md | 30 - docs/_static/balloon.min.css | 1 - docs/_static/custom.css | 30 - docs/autodoc.rst | 9 - docs/changelog.rst | 912 ----------------------------- docs/chipseq.png | Bin 25746 -> 0 bytes docs/chipseq.rst | 33 -- docs/conda.rst | 209 ------- docs/conf.py | 1 - docs/config-yaml.rst | 587 ------------------- docs/config.rst | 389 ++++++++++-- docs/decisions.rst | 39 +- docs/developers.rst | 116 ---- docs/downstream-rnaseq.rst | 82 --- docs/external.png | Bin 11515 -> 0 bytes docs/faqs.rst | 189 ------ docs/functional-enrichment-rmd.rst | 122 ---- docs/gene-patterns-rmd.rst | 78 --- docs/generate_guide.py | 185 ------ docs/getting-started.rst | 200 ++----- docs/guide-to-files.txt | 122 ---- docs/guide.rst | 257 -------- docs/index.rst | 54 +- docs/integrative.rst | 82 --- docs/lib.chipseq.rst | 23 - docs/lib.common.rst | 35 -- docs/lib.patterns_targets.rst | 22 - docs/patterns-targets.rst | 140 ----- docs/references-config.rst | 603 ------------------- docs/references.png | Bin 10622 -> 0 bytes docs/references.rst | 82 --- docs/rnaseq-rmd.rst | 587 ------------------- docs/rnaseq.png | Bin 34326 -> 0 bytes docs/rnaseq.rst | 53 -- docs/sampletable.rst | 272 --------- docs/tests.rst | 183 ------ docs/toc.rst | 12 - docs/workflows.rst | 284 +++++---- 38 files changed, 600 insertions(+), 5423 deletions(-) delete mode 100644 docs/README.md delete mode 100644 docs/_static/balloon.min.css delete mode 100644 docs/_static/custom.css delete mode 100644 docs/autodoc.rst delete mode 100644 docs/changelog.rst delete mode 100644 docs/chipseq.png delete mode 100644 
docs/chipseq.rst delete mode 100644 docs/conda.rst delete mode 100644 docs/config-yaml.rst delete mode 100644 docs/developers.rst delete mode 100644 docs/downstream-rnaseq.rst delete mode 100644 docs/external.png delete mode 100644 docs/faqs.rst delete mode 100644 docs/functional-enrichment-rmd.rst delete mode 100644 docs/gene-patterns-rmd.rst delete mode 100644 docs/generate_guide.py delete mode 100644 docs/guide-to-files.txt delete mode 100644 docs/guide.rst delete mode 100644 docs/integrative.rst delete mode 100644 docs/lib.chipseq.rst delete mode 100644 docs/lib.common.rst delete mode 100644 docs/lib.patterns_targets.rst delete mode 100644 docs/patterns-targets.rst delete mode 100644 docs/references-config.rst delete mode 100644 docs/references.png delete mode 100644 docs/references.rst delete mode 100644 docs/rnaseq-rmd.rst delete mode 100644 docs/rnaseq.png delete mode 100644 docs/rnaseq.rst delete mode 100644 docs/sampletable.rst delete mode 100644 docs/tests.rst diff --git a/docs/README.md b/docs/README.md deleted file mode 100644 index 45ed3871..00000000 --- a/docs/README.md +++ /dev/null @@ -1,30 +0,0 @@ -This documentation uses [sphinx](http://www.sphinx-doc.org) to buid the documentation. - -The built documentation from the master branch can be found at -https://lcdb.github.io/lcdb-wf. If you want to build a local copy of the -documentation: - -- create an environment from the `docs/docs-requirements.txt` file -- activate it -- run the Makefile in `docs` - - -That is: - -```bash -# Create env -conda create -n lcdb-wf-docs \ - --file docs/docs-requirements.txt \ - --channel bioconda \ - --channel conda-forge \ - --channel lcdb - -# activate it -source activate lcdb-wf-docs - -# build the docs -cd docs -make html -``` - -The locally-built docs will be in `docs/_build/html/toc.html`. 
diff --git a/docs/_static/balloon.min.css b/docs/_static/balloon.min.css deleted file mode 100644 index 268c8a8e..00000000 --- a/docs/_static/balloon.min.css +++ /dev/null @@ -1 +0,0 @@ -[data-balloon]{position:relative}[data-balloon]:after,[data-balloon]:before{-ms-filter:"progid:DXImageTransform.Microsoft.Alpha(Opacity=0)";filter:alpha(opacity=0);-khtml-opacity:0;-moz-opacity:0;opacity:0;pointer-events:none;-webkit-transition:all .18s ease-out .18s;transition:all .18s ease-out .18s;bottom:100%;left:50%;position:absolute;z-index:10;-webkit-transform:translate(-50%,10px);-ms-transform:translate(-50%,10px);transform:translate(-50%,10px);-webkit-transform-origin:top;-ms-transform-origin:top;transform-origin:top}[data-balloon]:after{background:rgba(17,17,17,.9);border-radius:4px;color:#fff;content:attr(data-balloon);font-size:12px;padding:.5em 1em;white-space:nowrap;margin-bottom:11px}[data-balloon]:before{background:url('data:image/svg+xml;utf8,') no-repeat;background-size:100% auto;height:6px;width:18px;content:"";margin-bottom:5px}[data-balloon]:hover:after,[data-balloon]:hover:before{-ms-filter:"progid:DXImageTransform.Microsoft.Alpha(Opacity=100)";filter:alpha(opacity=100);-khtml-opacity:1;-moz-opacity:1;opacity:1;pointer-events:auto;-webkit-transform:translate(-50%,0);-ms-transform:translate(-50%,0);transform:translate(-50%,0)}[data-balloon][data-balloon-break]:after{white-space:normal}[data-balloon-pos=down]:after,[data-balloon-pos=down]:before{bottom:auto;left:50%;top:100%;-webkit-transform:translate(-50%,-10px);-ms-transform:translate(-50%,-10px);transform:translate(-50%,-10px)}[data-balloon-pos=down]:after{margin-top:11px}[data-balloon-pos=down]:before{background:url('data:image/svg+xml;utf8,') no-repeat;background-size:100% 
auto;height:6px;width:18px;margin-top:5px;margin-bottom:0}[data-balloon-pos=down]:hover:after,[data-balloon-pos=down]:hover:before{-webkit-transform:translate(-50%,0);-ms-transform:translate(-50%,0);transform:translate(-50%,0)}[data-balloon-pos=left]:after,[data-balloon-pos=left]:before{bottom:auto;left:auto;right:100%;top:50%;-webkit-transform:translate(10px,-50%);-ms-transform:translate(10px,-50%);transform:translate(10px,-50%)}[data-balloon-pos=left]:after{margin-right:11px}[data-balloon-pos=left]:before{background:url('data:image/svg+xml;utf8,') no-repeat;background-size:100% auto;height:18px;width:6px;margin-right:5px;margin-bottom:0}[data-balloon-pos=left]:hover:after,[data-balloon-pos=left]:hover:before{-webkit-transform:translate(0,-50%);-ms-transform:translate(0,-50%);transform:translate(0,-50%)}[data-balloon-pos=right]:after,[data-balloon-pos=right]:before{bottom:auto;left:100%;top:50%;-webkit-transform:translate(-10px,-50%);-ms-transform:translate(-10px,-50%);transform:translate(-10px,-50%)}[data-balloon-pos=right]:after{margin-left:11px}[data-balloon-pos=right]:before{background:url('data:image/svg+xml;utf8,') no-repeat;background-size:100% auto;height:18px;width:6px;margin-bottom:0;margin-left:5px}[data-balloon-pos=right]:hover:after,[data-balloon-pos=right]:hover:before{-webkit-transform:translate(0,-50%);-ms-transform:translate(0,-50%);transform:translate(0,-50%)}[data-balloon-length]:after{white-space:normal}[data-balloon-length=small]:after{width:80px}[data-balloon-length=medium]:after{width:150px}[data-balloon-length=large]:after{width:260px}[data-balloon-length=xlarge]:after{width:90vw}@media screen and (min-width:768px){[data-balloon-length=xlarge]:after{width:380px}}[data-balloon-length=fit]:after{width:100%} \ No newline at end of file diff --git a/docs/_static/custom.css b/docs/_static/custom.css deleted file mode 100644 index b83f5902..00000000 --- a/docs/_static/custom.css +++ /dev/null @@ -1,30 +0,0 @@ -pre { - font-size: 0.7em; -} - - -h3 
{ - font-style: italic; -} - -h2 { - /* text-decoration: underline; */ -} - -code { - background-color: #fff; - font-size: 0.8em; - color: #444; -} - -code.file { - font-style: italic; -} - -/* make fixed sidebar scrollable - from: https://stackoverflow.com/questions/57031848/sphinx-alabaster-theme-scroll-inside-of-fixed-sidebar -*/ -div.sphinxsidebar { - max-height: 90%; - overflow-y: auto; -} diff --git a/docs/autodoc.rst b/docs/autodoc.rst deleted file mode 100644 index 7217f828..00000000 --- a/docs/autodoc.rst +++ /dev/null @@ -1,9 +0,0 @@ -Module documentation -==================== - -.. toctree:: - :maxdepth: 2 - - lib.common - lib.chipseq - lib.patterns_targets diff --git a/docs/changelog.rst b/docs/changelog.rst deleted file mode 100644 index 22039944..00000000 --- a/docs/changelog.rst +++ /dev/null @@ -1,912 +0,0 @@ -Changelog -========= - -v1.10.3 -------- - -- improve the deploy script (thanks @aliciaaevans) -- support the epic2 peak-caller for the ChIP-seq workflow (thanks @Mira0507) -- for later versions of featureCounts, add ``--countReadPairs`` argument to RNA-seq workflow (@therealgenna) - -v1.10.2 -------- - -Minor bugfix release. - -- Fix multiqc configs so that they coorectly ignore any cutadapt fastqc zips when building the raw fastq section -- Fix multiqc config for chipseq so it correctly cleans the ``_R2`` extension to better support PE ChIP-seq-like workflows -- Fix functional enrichment label truncation to ensure that truncated labels are unique - -v1.10.1 -------- -This is a bugfix and minor patch release. - -- Bugfix: the references workflow was missing the ``resources:`` directives; - they have now been added. - -- Bugfix: kallisto strandedness was set incorrectly for libraries using - ligation prep (fr-secondstrand) - -- The new ``utils.autobump`` function can be used to easily specify default and - incremented resources, and the ``utils.gb`` and ``utils.hours`` make it - a little easier to specify when autobump is not required. 
- - In the following example, memory will be set to 8 * 1024 MB and will - increment by that much each retry. The runtime will be set to 2 * 60 minutes, - and will increment by 10 * 60 minutes each retry. The disk will be set to 100 - * 1024 MB, and will not increase each retry. - - .. code-block:: python - - resources: - mem_mb=autobump(gb=8), - runtime=autobump(hours=2, increment_hours=10), - disk_mb=gb(100) - -- WRAPPER_SLURM no longer has the ``--latency-wait=300``, - ``--max-jobs-per-second=1``, and ``--max-status-checks-per-second=0.01`` - which would override any profile settings. - -- In RNA-seq and ChIP-seq, the cutadpt rule now defaults to using - ``--nextseq-trim 20`` instead of ``-q 20``, to better handle the majority of - sequencing data we have recently been working with (NovaSeq). See `this - section of the cutadapt docs - `_ for - details. - -- Updated requirements to use a recent version of salmon to avoid segfaults - -- rnaseq.Rmd, when saving the Rds file at the end, now disables compression. - This can have a dramatic improvement on downstream performance for - a reasonable disk space cost. - -- functional-enrichment.Rmd, now supports KEGG pathways & parallel operation. - -- functional-enrichment.Rmd, gene-patterns.Rmd, now saves Rds file at the - end (without compression) adding the respective object lists. - -- added ``--overlap 6`` to cutadapt to avoid greedy trimming - - -v1.10 ------ -The major change here is refactoring the Snakefiles to use the ``resources:`` -directive in each rule, and removing the ``--clusterconfig`` mechanism which -has long been deprecated. - -For running on a cluster, this requires a `profile -`_. -E.g., on `NIH's Biowulf `_, use the `NIH-HPC -snakemake_profile `_. 
- -General -~~~~~~~ -- No longer using clusterconfig, instead using resources to configure cluster resources -- Migrated to a unified testing script that simplifies local and CI testing -- If sampletable is from SRA, raise an error if a Layout column can't be found - (to prevent incorrect interpretation of samples as single-end) -- Ensure bam indexes are made for the markdups bams, even if bigwigs are not created -- Remove libsizes table, which was largely redundant with fastqc results - -RNA-seq -~~~~~~~ -- Fix R tests -- All ``lcdbwf`` R functions use the ``:::`` namespace lookup syntax -- Fix library loads in rnaseq.Rmd to ensure they come before parallelization configuration -- New function ``lcdbwf:::lfc_scatter`` for comparing multiple DESeq2 contrasts -- Updates and fixes to ``gene-patterns.Rmd`` - - -v1.9 ----- - -This version has substantial changes in the ``rnaseq.Rmd`` file to streamline -its use in a production environment. This involves moving most of the code -complexity into the ``lcdbwf`` R package and using a new config file as much as -possible. See details below. - -General -~~~~~~~ -- environments have been updated with recent versions of all tools -- WRAPPER_SLURM arguments updated with arguments better suited for cluster submission -- PhiX reference configs have been removed -- compatibility with Python 3.10 -- fastq-dump rules have been converted to scripts. This is because sra-tools in - versions earlier than 3.0 have issue with SSL certs, however sra-tools=3 - cannot be installed alongside recent versions of salmon (due to conflicting - pinnings with the ``icu`` package). Therefore, fastq-dump is now run as - a script in its own conda environment. -- new idxstats rule for chipseq and rnaseq - -RNA-seq -~~~~~~~ - -**This version has major changes to** ``rnaseq.Rmd``. Briefly: - -1. This file has been overhauled to be driven by a config file. 
This - dramatically reduces the need to scroll through the RMarkdown file and make - all the customizations for a particular experiment. Now, editing the config - file sets up most of the project-specific components. Note that contrasts - still need to be customized in the Rmd file. -2. The narrative and explanatory text has been moved to ``text.yaml`` and is - included at render time. This reduces the need to scroll through lots of - boilerplate text in the RMarkdown while still retaining the ability to - easily edit it. -3. Most of the complexity has been offloaded to the ``lcdbwf`` R package. -4. Caches are much improved. See the :ref:`downstream-detailed` section for - more information. -5. Functional enrichment is moved into a separate RMarkdown file. - -Downstream RNA-seq config -,,,,,,,,,,,,,,,,,,,,,,,,, - -The file, `workflows/rnaseq/downstream/config.yaml` is heavily commented to -describe the various settings. The sections of the config are designed such -that they can be used as additional chunk options to chunks in which they are -used. This additional chunk option is used by RMarkdown to compute the hash of -the chunk. The result is that making a change in the config file is sufficient -to invalidate the cache of any chunks that specify that section as a chunk -option. - -Complexity moved to ``lib/lcdbwf/R`` -,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,, - -Another major change is that most of the complexity in the ``rnaseq.Rmd`` file -has been factored out into the ``lcdbwf`` R package that is stored inn -``lib/lcdbwf``. While this means that all code is no longer included in the -final rendered HTML file, it does make the Rmd much more streamlined to work -with. It also has the side effect of making it easier to write unit tests on -separate functions. - -Many helper functions have been added to the ``lcdbwf`` R package, including -ones to streamline the creation of dds and results objects, composing and saving -them, and generating many of the outputs. 
- -Improved caching of results chunks -,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,, - -A somewhat major change is a new strategy for allowing ``results()`` calls to be -split across multiple, independently-cached chunks that are then properly merged -together into a single ``res.list`` object while handling dependencies and -parallelization (thanks to `@njohnso6 `_). This -dramatically speeds up the process of incrementally adding contrasts to complex -experimental designs. - -Other changes -,,,,,,,,,,,,, - -In addition to these major changes, there are also many other improvements -to ``rnaseq.Rmd``: - - - AnnotationHub databases are only retrieved from cache when they are - needed. This dramatically speeds up rendering of the HTML, since before - the OrgDb would always load no matter what. - - Toggle Kallisto or Salmon quantification with a simple true/false; this - automatically sums to gene level using automatically retrieved TxDb. This - also now supports creating dds objects from featureCounts, Salmon, or - Kallisto in such a way that they can be easily compared with each other. - - ``lcdbwf::compose_results()`` to combine res_list and dds_list objects - together by inspecting the global namespace for specially-named objects - - Helper functions for retrieving global config and data structures (e.g., - ``lcdbwf::get_config()``, ``lcdbwf::get_dds()``) - - Helper function ``lcdbwf::match_from_dots`` for working with `...` - arguments and splitting them up to only go to the functions they are - intended for - - Much faster to attach info (e.g., adding SYMBOL to all results) since the - AnnotationDbi calls are only done once instead of for each results - object. - - Refactored functional enrichment to be much more generalized, currently - using Gene Ontology and MSigDB. MSigDb, via the ``msigdbr`` package, is - available for multiple species and so this incorporates Reactome and - KEGG. 
But the generalized method can be applied to any arbitrary gene - sets, allowing for much more customization. - - Fixes to clusterProfiler::emapplot calls in particular corner cases - - Functional enrichment is now a completely separate file, using the - ``combined.Rds`` file as an intermediate between ``rnaseq.Rmd`` and - ``functional_enrichment.Rmd``. - - All-in-one enrichment function that runs either overrepresentation or - GSEA. Makes it much easier to do *ad hoc* tests. - - Helper function ``lcdbwf::enrich_list_lapply()`` to apply arbitrary - functions to the highly-nested `enrich_list` data structure - - Helper function ``lcdbwf::collect_objects`` to help compile discovered - results objects - - ``lcdbwf::get_sig()`` has more options for what to return - - Plotting wrappers for clusterProfiler plot functions, allowing plots to be - configured via the config file. - - New dds diagnostics and results diagnostics functions and sections of the - Rmd, useful for troubleshooting - - Refactored the results tabs: MA plots come first; ensure 10 genes are always plotted in MA - plots, added volcano plots with labeled genes, removed top 3 and bottom - 3 gene plots - - PCA plots using plotly no longer need "unrolled" for-loops; multiple PCA - coloring and clustered heatmap row side colors are now configured in the - YAML config file - - Moved size factor plots and gene version removal to lcdbwf package - - Use datatable to show initial sampletable for cleaner output - - Make original dds_initial object the same way as later dds objects and - always using a design of ``~1`` to be used in PCA and heatmaps - - "Differential expression" header moved so that code is no longer hidden - under the size factors plot - - Option for filling in NA in symbol with Ensembl IDs - - collapseReplicates2 uses ``collapse_by`` rather than ``combine.by`` - - Updated the code style throughout to use the tidyverse/google style guide - - RNA-seq differential expression output is additionally 
included in an - Excel file with one sheet per contrast. - -Tests -~~~~~ - -- ``lcdbwf`` R package now has its own tests via ``devtools`` and ``testthat`` -- recent versions of Snakemake are broken when ``--until`` is used in certain - circumstances; a ChIP-seq test has been disabled temporarily. -- after a successful test, the environment is written as an artifact on circleci - -References -~~~~~~~~~~ - -- Fixed a longstanding issue with *S. cerevisiae*, now the GFF file is properly converted to GTF. - -v1.8 ----- - -General -~~~~~~~ - -- Complete shift to using pinned ``env.yaml`` files to specify conda - environments, and using ``mamba`` for building environments (consistent with - recent versions of Snakemake). This is now reflected in documentation and - the updated-and-improved ``deploy.py``. - -- Reorganization/cleanup of the ``include`` directory - -- Added conda troubleshooting notes to the documentation (see - :ref:`conda-troubleshooting`). - -- The ``lib.helpers.preflight`` function no requires the first column of the - sampletable to be named `samplename` when checking configs. - -- Improvements to the deployment script ``deploy.py``: - - - now requires Python >3.6 - - proper logs (so you can easily see how long it takes to build an env) - - supports downloading and running the script directly, which will clone - a temporary copy and deploy from there - - using Control-C to stop the deployment will also stop mamba/conda - - colored output - - mamba is used by default, but ``--conda-frontend`` will use conda instead - -- fastq-dump log is sent to file rather than printed to stdout - -- Threads: cutadapt single-end now uses specified threads (it was using - 1 thread by default); use 6 threads for fastqc - -- Added new preflight checks for RNA-seq and ChIP-seq specific configs. 
- -- Added a ``run_complex_test.sh`` driver script for testing the workflows on - full-scale publicly available data - -RNA-seq -~~~~~~~ - -- **Configuration change:** The ``stranded:`` field is now required for RNA-seq. - This is used to choose the correct parameters for various rules, and avoids - one of the main reasons to edit the Snakefile. See :ref:`cfg-stranded` for - more details on its use. - -- added ``stranded:`` field to all configs used in testing - -- The ``strand_check`` rule now runs MultiQC for a convenient way of evaluating - strandedness of a library. - -- Kallisto is now supported in both the RNA-seq Snakefile, references - Snakefile, included reference configs, and downstream ``rnaseq.Rmd`` - - -References -~~~~~~~~~~ - -- When checking URLs in reference configs, don't use ``curl`` to check - ``file://`` URIs. - -- There is a new feature for reference configs that allows chaining - post-processing functions together, see :ref:`advanced-postprocessing`. This - means that it is possible, for example, to add ERCC spike-ins (which need - post-processing) onto references that themselves need post-processing. - -- ``lib/postprocess/ercc.py`` has new helper functions for adding ERCC - spike-ins to fasta files and GTF files. - -- added ``'kallisto'`` to included reference configs - -ChIP-seq -~~~~~~~~ - -- symlinks rule is now local -- added collectinsertsizes pattern to support PE ChIP-seq experiments -- merging bigwigs log no longer goes to stdout - - -v1.7 ----- - -Setup -~~~~~ - -Use mamba for installation of environments, consistent with Snakemake recommendations - -Testing -~~~~~~~ - -- We now recommend using `mamba `_ to - create conda environments. This is dramatically faster and solves some - dependency issues. Our automated tests now use this. - -- We have moved from requirements.txt files to env.yaml files. 
We also now - encourage the use of the strictly-pinned environments for a more stable - experience to hopefully avoid transient issues in the packaging ecosystem. - -- ``tbb=2020.2`` as a dependency to fix a recent packaging issue with conda-forge. - -- many documentation improvements - -- symlinks rule is only set to localrule when it exists (it does not exist when - running an analysis exclusively from SRA) - -References -~~~~~~~~~~ - -- updated URLs for those that have changes (e.g., Sanger -> EBI; using https - instead of ftp for UCSC-hosted genomes) - -- new ``gff2gtf`` post-process tool for when an annotation is only available as - GFF. *S. pombe* needs this, for example, and the - `Schizosaccharomyces_pombe.yaml`` reference config has been updated - accordingly. - - -- The references workflow no longer reads the config file in its directory. - This fixes some subtle overwriting issues when providing config files or - items from the command line during as is used during certain test runs. If - running the references workflow alone, it must be called with - ``--configfile`` - -RNA-seq -~~~~~~~ - -- featureCounts now uses BAM files with duplicates marked. Previously if you - wanted to run featureCounts in a mode where it excluded duplicates you would - need to reconfigure rules. - -- improved comments in RNA-seq downstream RMarkdown files - -Testing -~~~~~~~ - -- new test that checks all URLs identified in config files to ensure that the - included reference files remain valid - -- there is now a separate ``run_downstream_test`` script` - -- simplified the CircleCI DAG to optimize testing resources - -v1.6 ----- - -References -~~~~~~~~~~ -- overhaul the way transcriptome fastas are created. Instead of requiring - separate download, they are now created out of the provided GTF and fasta - files. The reference config section now uses keys ``genome:``, - ``transcriptome:``, and ``annotation:`` rather than the ``fasta:`` and - ``gtf:`` keys. 
-- **backwards-incompatible change:** reference config files have been updated - to reflect the changes in the references workflow -- Update PhiX genome fasta to use NCBI rather than Illumina iGenomes - -ChIP-seq workflow -~~~~~~~~~~~~~~~~~ -- ChIP-seq workflow now properly supports paired-end reads -- A ChIP-seq workflow can now be run when the ``chipseq:`` and/or - ``peak_calling:`` sections are omitted. -- added a missing bowtie2 config entry in ``clusterconfig.yaml`` which could - result in out-of-memory errors when submitting to a cluster using that file - - -RNA-seq workflow -~~~~~~~~~~~~~~~~ -- if colData is a tibble this no longer causes issues for importing counts -- dupRadar removed from RNA-seq workflow. We ended up never using it, and it - depends on R which we've since removed from the main environment. -- new ``strand_test`` rule, which can be run explicitly with ``snakemake -j2 - strand_check``. This generates ``strandedness.tsv`` in the current directory, - which is the summarize output of RSeQC's ``infer_experiment.py`` across all - samples. -- implement STAR two-pass alignment. Default is still single-pass. -- Clean up hard-coded STAR indexing Log.out file -- Include ``ashr`` and ``ihw`` Bioconductor packages in the R requirements, for - use with recent versions of DESeq2. - - -RNA-seq downstream -~~~~~~~~~~~~~~~~~~ - -- Functional enrichment and gene patterns are now separate child documents. - This makes it easier to turn them on/off by only needing to adjust the chunk - options of the child chunk -- Created a new documentation method for rnaseq.Rmd. Now there is a separate, - dedicated documentation page with sections that exactly correspond to each - named chunk in the Rmd, as well as a tool for ensuring that chunks and docs - stay synchronized. See :ref:`rnaseqrmd` for the new docs. 
-- New ``counts.df`` and ``counts.plot`` functions to make it much easier to - make custom dotplots of top counts by melting and joining the counts table - with the metadata in colData. -- DEGpatterns cluster IDs are now added as additional columns in the output - TSVs for each contrast -- Many functions in the rnaseq.Rmd now expect a list of :term:`dds` objects. - See :ref:`dds_list` for more info on this. -- Created a new R package, ``lcdbwf`` stored in :file:`lib/lcdbwf`. This can be - edited in place, and it is loaded from disk within ``rnaseq.Rmd``. -- Modified some output keys to support recent versions of Snakemake, for which - ``count`` is a reserved keyword - - -General -~~~~~~~ -- Conda environments are now split into R and non-R. See :ref:`conda-envs` for - details. Updated ``deploy.py`` accordingly -- symlinks rules are now set to be localrules -- updated workflows to work on recent Snakemake versions -- split environments into non-R and R. This, along with a loose pinning of - versions (``>=``), dramatically speeds up environment creation. 
-- updates to support latest Snakemake versions -- improvements to testing: - - environment YAML files, rendered HTML, and docs are stored as artifacts on CircleCI - - consolidations of some RNA-seq tests to reduce total time - - additional comments in the test config yaml to help new users understand the system -- new "preflight check" function is run to hopefully catch errors before running workflows -- updates to support recent Picard versions -- added wildcard constraints to help Snakemake solve DAG - - -v1.5.3 ------- - -General -~~~~~~~ -- default 12-hr wall time in WRAPPER_SLURM -- update .gitignore (`#223 `_) -- remove the FastQC status checks section from the MultiQC report (which shows - up in recent MultiQC versions) (`#246 `_ - -Bugs -~~~~ - -- add bed12 conversion for all species with default reference configs -- presence of an orig_filename_R2 in sampletable is sufficient to consider the - experiment PE -- ensure DEGpattern output only contains unique genes -- bring back featurecounts in multiqc report -- "attach" chunk in rnaseq.Rmd was not properly set to depend on the "results" chunk - -RNA-seq -~~~~~~~ - -- dds objects can now be created from a full featureCounts input file and - a subsetted colData table, if subset.counts=TRUE -- improve the dependencies between rnaseq.Rmd chunks so that cache=TRUE behaves - as expected: (`#232 `_) -- add plots for rnaseq.Rmd size factors (`#222 `_) -- run rseqc instead of CollectRnaSeqMetrics (the multiqc output is nicer for - it, and it's pretty much doing the same thing) (`#218 `_) -- when converting Ensembl to symbol, if there is no symbol then fall back to - the Ensembl ID to avoid NA (`#246 - `_) -- in rnaseq.Rmd, all caches will be invalidated if the sampletable or the - featurecounts table have changed. - -Tests -~~~~~ -- using continuumio/miniconda3 container; finally got en_US.utf8 locale - installed and working correctly in that container so that multiqc works. 
- - -v1.5.2 ------- - -Bug fixes -~~~~~~~~~ - -- When some samples were substrings of other samples (e.g., `WT_1_1` and - `WT_1_10`), DESeqDataSetFromCombinedFeatureCounts was assigning the wrong - names. This has now been fixed in `helpers.Rmd`. - -v1.5.1 ------- - -Bug fixes -~~~~~~~~~ - -- DESeqDataSetFromCombinedFeatureCounts (added in v1.5) was incorrectly - assigning labels to samples when the order of the sampletable did not match - the order of the samples in the featureCounts table columns. This has been - fixed. - -General -~~~~~~~ - -- `deploy.py` deployment script now only pays attention to files checked in to - version control and optionally can create a conda environment in the target - directory. - -- tests now work out of a newly-deployed instance to better reflect real-world - usage - - -ChIP-seq and RNA-seq -~~~~~~~~~~~~~~~~~~~~ -- reorder cutadapt commands to avoid a MultQC parsing bug in the cutadapt - module (see https://github.com/ewels/MultiQC/issues/949) - -RNA-seq -~~~~~~~ -The majority of these changes affect ``rnaseq.Rmd``: - -- modifications to MultiQC config to get back featureCounts output -- `plotMA.label` function (in ``helpers.Rmd``) now defaults to FDR < 0.1 - (instead of 0.01), and additionally supports labeling using different columns - of the results object (e.g., "symbol"). -- remove some now-redundant featureCounts code -- add a comment showing where to collapse replicates -- convert colData's first column to rownames -- implement lower limit for DEGpatterns clustering (default is 0, but can - easily set to higher if you're getting issues) -- expose arbitrary additional function arguments to ``top.plots``. This allows - different `intgroup` arguments to be passed to the `my.counts` function, - enabling different ways of plotting the gene dotplots. 
- - -v1.5 (Sept 2019) ----------------- - -Major change: **it is no longer possible to mix single-end and paired-end -samples within the same run of the workflow.** See `#208 -`_ and the corresponding issue -description at `#175 `_. - -This version also has many improvements to the ``rnaseq.Rmd`` file for RNA-seq, -as described below. - -RNA-seq -~~~~~~~ - -Many changes and improvements to ``rnaseq.Rmd``, including: - -- Differential analysis summaries now include labeled MA plots (`#192 `_) -- PCA plots now use plotly for improved insepction of samples (`#192 `_ -- don't use knitrBootstrap any more (`#192 `_ -- heatmaps use heatmaply package for better interaction (`#192 `_ -- allow ``sel.list`` to be used for UpSet plots and fix some typos `#205 `_ -- workaround for degPatterns for corner cases where there are few clusters because of the ``minc`` parameter (`#205 `_) -- alpha and lfc.thresh are now pulled out into a separate chunk (`#206 `_) -- Support AnnotationHub http proxy handling in new version of AnnotationHub (`#207 `_). - -As well as the following changes to other parts of the RNA-seq workflow, such as: - -- better bigWig file nomenclature (`#194 `_), uses "pos" and "neg". -- featureCounts only runs once on all BAMs rather than individual samples (`#195 `_) -- support `rseqc infer_experiment`, which replaces running featureCounts in multiple stranded modes (`#199 `_, `#203 `_) -- use ``--validateMappings`` for salmon (`#203 `_) - -References -~~~~~~~~~~ -- fix typo in *S. 
pombe* name - -All workflows -~~~~~~~~~~~~~ - -- Documentation now recommends creating an environment for each directory using the `-p` argument (`#195 `_) - - -v1.4.2 (Jul 2019) ------------------ - -Bugfixes -~~~~~~~~ - -- Don't require ChIP-seq configs to have at least one block for each supported - peak-caller - -v1.4.1 (Jul 2019) ------------------ - -RNA-seq -~~~~~~~ - -- KEGG results were not being added to the ``all.enrich`` list in ``rnaseq.Rmd`` -- symlinking bigWigs is now a local rule -- default cutadapt options have changed to reflect current recommendations from - the author, and the cutadapt rule is now explicity using arguments rather - than requiring a separate ``adapters.fa`` file. -- featureCounts now auto-detects whether it should be run with the ``-p`` - argument in paired-end mode (previously it was up to the user to make sure - this was added). The rule does have an override if this behavior is not wanted. - -References -~~~~~~~~~~ - -- The reference config for *Drosophila* is now fixed. Previously it depended on - `chrom_convert`. That script was a fly-specific script in lcdblib, but - lcdblib is no longer a dependency since v1.3. This fix uses the - `convert_fastq_chroms` and `convert_gtf_chroms` used in reference configs for - other species. 
- -v1.4 (May 2019) ---------------- -RNA-seq -~~~~~~~ -Much-improved ``rnaseq.Rmd``: - -- tabbed PCA plot -- improved DEGpatterns chunk -- dramatically improved functional enrichment section, with tabbed clusterprofiler plots and exported data in two flavors (combined and split) -- improved upset plots, with exported files showing sets of genes -- improved comments to highlight where to make changes -- add new helper functions to ``helpers.R``: - - ``fromList.with.names``, for getting UpSet plot output - - ``rownames.first.col``, to make tidier dataframes - - ``nested.lapply``, for convenient 2-level nested list apply - - clusterprofiler helper functions - - -v1.3 (May 2019) ---------------- -Bugfixes -~~~~~~~~ -- Fix broken paired-end support for RNA-seq. Previously, when using data from - elsewhere on disk and using the symlink rules, R2 would be symlinked to the - same file as R1. -- Support for Snakemake 5.4.0 which changes behavior of the ``expand()`` - function. - -Infrastructure -~~~~~~~~~~~~~~ -- new deploy script to copy over only the files necessary for an analysis, - avoiding the clutter of testing infrastructure. -- lcdblib, an external package, is no longer a dependency. In the interest of - better transparency and to make the code here easier to follow, the relevant - code from lcdblib was copied over to the ``lib`` directory in this - repository. - -ChIP-seq and RNA-seq -~~~~~~~~~~~~~~~~~~~~ - -- Bowtie2, HISAT2, and rRNA rules no longer use wrappers. This makes it easier - to track down what parameters are being used in each rule. -- RSeQC is now available in Python 3 so wrappers have been removed. 
-- NextGenMap support removed - -v1.2 (Mar 2019) ---------------- - -RNA-seq -~~~~~~~ -- First-class paired-end support, including mixing PE and SE samples in the - same sampletable - -- Support for STAR aligner - -References -~~~~~~~~~~ -- FASTA files are always symlinked into the directories of indexes that were - created from it - -- Reference configs: - - - updated existing - - added more species - - new post-process for fasta or gtf: you can now use - NICHD-BSPC/chrom-name-mappings to convert chromosome names between UCSC - and Ensembl (see reference configs for examples of use) - -ChIP-seq and RNA-seq -~~~~~~~~~~~~~~~~~~~~ -- Updates to dependencies and MultiQC config - -Infrastructure -~~~~~~~~~~~~~~ - -- Updated requirements in ``requirements.txt`` and in wrappers - -- Changed all ``pd.read_table()`` to ``pd.read_csv(sep="\t")`` to prevent warnings - -- Changed all ``yaml.load()`` to ``yaml.load(Loader=yaml.FullLoader)`` to - prevent warnings - -- Using DeprecationWarning rather than UserWarning in the deprecation handler - so there's less spam in the logs - -- Improved tests: - - - using data from pybedtools repo because modENCODE seems to be down - - append rather than prepend base conda to PATH on circleci - - separate isolated tests for STAR, ngm, and SRA - - updated conda - -- Docs additions: - - - TMPDIR handling - - clusterconfig - - WRAPPER_SLURM - - docs for developers - - symlinking fastqs - - using SRA sampletables - - paired-end data - -Colocalization -~~~~~~~~~~~~~~ -- From colocalization, removed the GAT "fractions" heatmap due to unresolved - pandas index errors - -v1.1 (Aug 2018) ---------------- - -Infrastructure -~~~~~~~~~~~~~~ - -- The default settings in Snakefiles are for real-world use, rather than for - testing. This reduces the amount of editing necessary before running actual - data. See :ref:`test-settings` for the extra step to take when testing - locally. 
- -- new ``run_test.sh`` script in each workflow directory to automatically run - the preprocessor when running test data - -- added extensive comments to Snakefiles with ``NOTE:`` string to make it - obvious where and how to make changes. - -- Documentation overhaul to bring everything up to v1.1. This includes Sphinx - autodocs on the ``lib`` module. - -- pytest test suite is run on the ``lib`` module - -References -~~~~~~~~~~ - -- new `metadata` section in references config, which can be used to store - additional information like mappable bases and genome size. - -- References can now be included from other YAML files into the main config - file. This dramatically simplifies individual configfiles, and allows - multiple workflows to use identical references without having to do - error-prone and hard-to-maintain copy/pastes between workflow configs. See - :ref:`references-config` for details. - -- New GTF conversion, ``mappings``. This is intended to replace the - ``annotation_hub`` conversion, which was problematic because 1) a particular - annotation hub accession is not guaranteed to be found in new versions of - AnnotationHub, resulting in lack of reproducibility, and 2) it was difficult - to synchronize the results with a particular GTF annotation. The - ``annotation_hub`` conversion is still supported, but if it's used then - a DeprecationWarning will be emitted, recommending ``mappings`` instead. - - -Both RNA-seq and ChIP-seq -~~~~~~~~~~~~~~~~~~~~~~~~~ - -- `fastq_screen` is now configured via ``config.yaml``. This reduces the need - to edit the Snakefile and coordinate between the config and the fastq_screen - rule. Now everything is done within the config file. - -- `fastq_screen` wrapper now handles additional output files created when using - the ``--tag`` and ``--filter`` arguments to ``fastq_screen``. - -- In the config file, ``assembly`` has been changed to the more-descriptive - ``organism``. 
The change is backwards compatible, but a DeprecationWarning is - raised if ``assembly:`` is still used, and changed to ``organism`` (though - only in memory, not on disk). - -- Patterns no longer use ``{sample_dir}``, ``{agg_dir}``, etc placeholders that - need to be configured in the config YAML. Instead, these directories are - hard-coded directly into the patterns. This simplifies the config files, - simplifies the patterns, and removes one layer of disconnect between the - filenames and how they are determined. - -- removed 4C workflow since it used 4c-ker - -ChIP-seq -~~~~~~~~ -- macs2 and sicer can accept mappable genome size overrides - -RNA-seq -~~~~~~~ - -- RNA-seq downstream: - - - ``downstream/help_docs.Rmd`` can be included for first-time users to - describe the sections of the RNA-seq analysis - - - ``rnaseq.Rmd`` now uses the same ``NOTE:`` syntax as the Snakefiles for - indicating where/what to change - - - Easy swapping of which strand to use from the three featureCounts runs - performed by the workflow - - - Be explicit about using DESeq2::lfcShrink as is now the default in recent - DESeq2 versions - - - improved the mechanism for keeping together results objects, dds objects, and - labels (list of lists, rather than individual list object; refactored - functions to use this new structure - -v1.0.1 (Jun 2018) ------------------ -Bugfixes, last release before references changes. - -Infrastructure -~~~~~~~~~~~~~~ - -- Transition to CircleCI for testing -- Use production settings by default; see :ref:`test-settings` for - more. -- lots o' docs -- new ``include/references_configs`` to help organize references. These are - currently not used by the workflows directly. -- bugfix: use additional options when uncompressing downloaded reference files - (``--no-same-owner`` for ``tar``, ``-f`` for ``gunzip``) -- additional dependencies in the top-level environment to support the - additional features in rnaseq.Rmd and track hubs. 
-- colocalization workflow, external workflow, figures workflow to demonstrate - vertical integration - -RNA-seq -~~~~~~~ -- remove kallisto indexing, use salmon -- improvements to how chipseq sampletables are parsed (with more informative - error messages) -- run preseq for RNA-seq library complexity QC -- support for merging bigwigs -- featureCounts is now run in all three strandedness modes, and results - incorporated into MultiQC as separate modules. -- RNA-seq now symlinks "pos" and "neg" bigWigs, which describe how reads map to - the *reference*, to "sense" and "antisense" bigWigs, which describe the - *originating RNA*. This makes it easy to swap strands depending on protocol. -- new ``downstream/helpers.Rmd`` which factors out a lot of the work previously - done in ``rnaseq.Rmd`` into separate functions. -- track hub building respects new sense/antisense bigwig symlinks - -``downstream/rnaseq.Rmd`` -~~~~~~~~~~~~~~~~~~~~~~~~~ -- AnnotationHub uses cache dir that will not clobber default home directory cache -- use varianceStabilizingTransform instead of rlog -- print a size factors table -- use multiple cores for computationally expensive DESeq2 operations -- using separate lists for results, dds objects, and nice labels for automated - plots for each contrast -- UpSet plots for comparing gene lists across contrasts -- DEGpattern plots for showing clusters of expression patterns (from the - DEGreport package) -- attach normalized counts per sample and per factor (parsed from the model - used for the contrast) as well as TPM estimates to the results tables -- trim the labels in GO enrichment plots when too long - -ChIP-seq -~~~~~~~~ -- sicer for chipseq domain calling -- pin snakemake <4.5.0 so that subworkflows behave correctly -- chipseq peak-calling rules (and therefore wrappers) now expect a chromsizes - file as input -- bigbed files for narrowPeak and broadPeak files are created correctly - depending on their format -- run multiBigWigSummary and 
plotCorrelation from deepTools for ChIP-seq QC -- ChIP-seq track hub generation script - -Both RNA-seq and ChIP-seq -~~~~~~~~~~~~~~~~~~~~~~~~~ -- update deeptools calls to reflect >v3.0 syntax -- support for SRA run tables so it's trivial to re-run experiments - in SRA -- multiple FastQC runs are shown separately in MultiQC output - -v1.0 (May 2018) ---------------- -First official full release. diff --git a/docs/chipseq.png b/docs/chipseq.png deleted file mode 100644 index 051e0df1210824231d7ecfd9d78cfa3e318a5e3b..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 25746 zcmdS>_ghoX7dDL25rjx6QVhLUsj@+&cOfVpMNnzd2~Bzz5=!Vjf=H7hpdf;D34|gb zMg{4F5&;pB4g&Ay^L?J{{o$PR2b}8?$eyxh%{}*8vu5^+s5@{ydRh)z5)u-61AT2% z5)ujo2?_blOXOh5vFX19@Q>>qBXgaz^Km@(mu32ZuU8Kkwq|atA+=`J2McNJs)RM*#4N#z)`AkAwt*z4#%`7KdCT zA>ko0(AF>y!fZY#=4^B$ygHTz?&s7A4$ISIeCC@w=e1(JXUj33-I-$2YCg%Lb7SiPageJqD^Y zbYW&j-#i?(;FMrBU$yHstE_bkQ1UYiCZBb~{Q|HwrrP7mMD`{NSW0Q}<7%D6?ByY( z18vpYtCb(Z>V^5yqGjthd;)0fjX`1h-#j0>`_kq@am=RSC+|Cx>jf_w9PfLFv)kXk zXt1|Ae=OUXlMGa(txtFDIfY5GBaC=C_qR^{u5dJYA8hA62qU?pi+OvY1vr(#i@`vR zGqORMAPNHF7XnAlI^GxK+NgU^_k!Ag&z_72jGZ~{tbM>Ws!jMaAv%d|`#ZrZR)LMH zOG)Ei-9GJ}o%8K3-m@Fq!2$S*=E&2CG;eYe}1 zf8}>|_V2NeoHJJR{{QJbbBq3F*uSa0j=r65O$U04f&9Hoh1U@|J6mV_Yj~-^q|&Yb z1gOlZR(d;o<0lsPUUfFQPA@udob=_tsZzOslQI;ux*f6~KHy{DBs5wtzIH`*TI;eL zw|(-zTcgtxOGY7)-|f-tg+mv!H~pyuH%U6{^9#eY35{+knam?qrg|L;Ru0Do4qz@G zfe~Yf4@XMyo%I4@xcbOJlCXmr0wjheZMfw(tp%A~A@Zv?u=!(Z+{J-Eo%V|@MsTeB zTP_y8QQurFQYSp-lI`Pn=CA)wHtqx0kydp0EIcTpf?9wd{^LRl_~Ur7)&i}OTlh+& z`k%r4qo|7l^ZB#a*O%Nc`c!qE-nZ|c0GI#&7p=Z++MjI5JR81f{NeQegO67KP0dqM z%k#+21jP1Ew&&u9w;eA*;=MS#!{S^0KX!whXZ=5%?=!#RyFR%40aWN^wG;O>AF(uW zJPXb--}QmNf5B41vDQ&rYYek-EGVSq>`%f+MgO&<{Hq}1Jf|gLD@A-v+MvPRX^m#Q z`+H5M2V`%wAPsVQQ)o@<;A`FsP{i9;!tmeBMoM6nQ#zFR@}g(}>30_tVdIExeFX}Z zkR{lfg9?LUrNlt4CnW={AosI$bTLVwEe!GTkLrN%{QqTl!@6n1RzDfR9B@AO6!}M= zffFguff9#6988Cieoi0`gHGM84ImE7AP&|b4k7G5br&|@kqk^+;9#eV*|@-ghkVrO 
z0*5px@h#Y07KG6T?5^b4-CCUt2nC$(+~r5WaDJ5`k4~@Y97^lLGAm$rY|@r)<=$Ky zMp0bj3jmwx1eY)hpk;(#2X$-6;*btsm^gUiF?28Y#LUh3(FV88{n>m?6KStilY>tN zbUti;87O*K?G^AzPr{AOm(hR>0Iq|w6Y~b7mj%)KF=TW_RDOf|p<|*TKFTj7`#nC$|{@AD4^DWI$m6>7b0#{*C=Je7b*wA`vtJ!rzx)88-Kkp2!jMu!1_Y{OF3fX!3w3SQ*qX((HnkjRXH*{w`o$ z=cgx9-}~i(_!+R7{;J7e$HlRrfsFtVxuJ|baQF}y9%SIfj)1pW=Ibk6`*1>>yH&L5 za@cu9=hpAWg?>l>Db>L6zIr=%FJn9RY)F1l>R8SF-_@(-*HHN8#uQu$XlI!%G1#@P zVATqTvB2upZR+jnA!mO)v*F9rYezkmn#K*cHCMjexHA;)I#5xHm+gwTV=Fr`=>X zNUi@FlA$tmkAD5}lJ8HDj{;Jxq9HRXbK(S*&Bp3aQ)GmrL!&Ly)WCm=nwW@xr!HVK z&=m1O3te=ieXpsZ|>L6Ang|>8H=yIUFN-$NaDUuqUn4*td!@ zu1se@P{_(02^zyb0~f?2J4L+%)-ybOyF34*Y4D@&@mwC??xKsQQ#n?p?*xI*T_|4a z-{AOPpjX(bF!GF=kdG-)T$uos{3K@vjTp`Hi+kX{4qAuTzeZBwlF6r*=kYrau`*St zICSKVg)){5Hn@9SE7+wnvO?@BA@@tyn`bCztc>eWkXh{6=De38VDHt|(*hcJW}S-p z+VJ-vvqY~@Bn3?)XG+S2S>LsP*J>smuD~PEz>$zy`wnD0=qb&T`MME|37tXzbC8C2 zv$%0P;AfD>StZu7_+X@#-L3yQO`S_5unAMpBfJ|#n#J&RZ!;u(2>V|*S_V2;oreE1 z+kaknQ0flYF$ZK|^=)0uObi+BHf)NF<9~G$(mpD;L47O)`QdiFA-84+p9f|+b@-6ZsGK!|>`Tu-B9iKe#AgBs7aYIL} z!F3Lw$5@S!|1+Ge_Cj3lh5lN)n6G^2%}3it2=@*#ALkz#9SfyO1wt^FG^`=<7E4Nje$$41kT@W|IL|0r?&VVep|< zmhjW9$f9bEl98GX+_ViP`O(##crnE`9<~3f7oHe~1(sl=pMmS1F^nt~IU0NxeBrFJ zN%c(r`dt=+h?M=e8Nd`on^M*7!A@^v2~P}`s#GvQ8UlQzV)x0)N(Z|C_Zi{uKWqf+ zRtM(_x=Cfz1$u&>Cpg*1PTk{aRNg*VR4NE2cK5nZN*kGs>Opz=_pXex9d)kXpF!;e zDO9d5zwq?6cjOpN((i9-cu*4*hrJs@g2@3g@&@!UKcDcUQ6+SoIGaM)c&GMP z_5z@15sW{ylmgF{Q2hUX334MI_oe_s@yUQG<5`&Kb%f>;`1s-N%p&G*v*9t5r$OZ8 zJO=cmO<68m#aTd~vH*kdL(q=Xv#X{7hs;t9m}u(Hc?ifI|15XnHAF^883WGy5!k>9 z3&mFwd* z&1=6&L38~MGQcP{5F6ZF%q1}U|Hl3iF7m~{_m0A*T(#P{iUa2)dy)0=ezB(x4O?C9 zI7tQi&MP9J@8()6Kj&OIeRd$vC0tsJ+}aISO?7!+P;}I-vW1sU)k7n(+ial2aNI=V zx6(*uJVU$QZ?;%Xaa|qTKlwe4UT8_B*6oP6HL8#4=}fqDvc8!hsc^+h_26LXcSxqx|)e?m4HBRq~y7)^AIPDHmAm-x~ch7COVv?=?3|Jmom(F~Y2QEwnxO z&YT~D8TW$WwVV|Wj}GO}KphLN?A*df0}s@m5AS}WUJX_xqYC=@|WoG?x zO_Oz~os(KOjv1p=-*$va$tie(Fo8%g$-w8O*|lyr!k&XuP^I4qT1Fne{{-^b^~aV4 
z3e`hBHl_hCT4g3w&xk019v;15Ovla^e#By7g43UnQ^~}Lb8xxf>@9R2e80(03qW*XGSAZiFD=lZB>2%;#~rR%nS8#tU0nPn(`8JjdwJQ5<6n>tFNe=G zxvbvn5RWbgKe`PzmO2sdhRs?ka~pDuiF2pwtRRO`c#qLHEV1|-yoM7KwE7w z?2C@9f&zD@l8>!aB`Iuwfg2c0g@DLm?r~J?%aIq6r&dDJE&Fg8RHGc|W)6>Y&48sPWg{kMKS7yapZ%caRrRbGqxbDRIhN98peYts~iXSLX&ReJ2bxL zuK3o0VPZ<@L0)1GvtqQnFoPbglE=jsXj-z-$v`Np zFlH^NCP#eX1rHz9w|^!|z8D--FO&*eY?J$3EARyrTg_PVvyqROlW^k!9Uwr^Z&G4} zL~wx=cDhS@gVp!%`06#z5ZWTnbHhV9&@~G(S3Q*TIBaiP`#51F4B@^&)0ppj=+uMa zYd8ELC7C@F{$o6T@q2TzV0$e|RDlleg7fMA`;qX7(ZGKj= zD?b4Dg93u8`sZ$qM@?%atYYf0kU~&^Md`2Nj27D*okEE0ZAL6Fzg|4hcyXY*`{^;u zs%G%WNLi3FoKU8T#(IE;EsC9+^7RnOzQOsEkCXq^AD&YZKs93`0aQ^q;`I>8%ZD@S z(fnF4H+OC1@(u_CGA zZPV!yZY3i;(moihK@{vwD;PI03$`sni`-rsb^wCwSfvMM^^0G-fgkoTY+(#7P3o!k`f1*Z4{Uw$K(c`3ZC= zA*Q@&yxHrfwY2WLJP(qv7Q+gb+6fPK$q< z&($gapoLdV!gI^ge3B9UU=W$vB>%#PYs!DprE{+5SCu+L0_)EWmwzBc2Db|?SH3h( zI7<`NhDDpCJsfz{zXsB>`i$+H^~;^h&P~&Hd=qIFlQn;?u;!u9&$faepHEl*`JwZ9 z7z`V_6MLO`NB1wu%tgyrszdfUnvmqqw3mRpzd+ttlSc30$bCdUAr2ITc_1ZycX=T& zu}cJsJdi+(+OgDwLy@c&k(sfbG2KCItv}T@$Yj*;PS$b6)#%E z0WG!aXh0>Oc7WY7u1dbeCDM|Rnw|J43-FR%kqvs z8`= z*7l)FAuy9naBif6l$d&cE>GQWNSNEG4B^G|b#l-%5cF&N)>h}E9*#SkKN~1t$tlkW5_N0M})E`h?%<=g^2=i zr-4VLdIdm}_T;0B5^a>iAgwp_vEcd-aJzXBgN5!{TtVc$Pp#bv1<6@n^N{d0 z8um?Z`5OZf+{1gCO0Y|k#H1R-wBF8O$LI6z2{9AI=rJ1Ziu-~fGoC%;gI`moqdX1-v=uKt1U+P=0a7_z14+UDrFscp$lnAF#olXnsC#4zJDsf~22V*TUbOI%pMy zp&mq0iv)3pv{@twRG`oGHpb;-P2sm;;P+GI*C6b=2zn*?QVG$`JHyihhhln{G>1)!lxQM z-Rq50y#I)w9nPw&Q;k43eCqWf_(vp`edtk;$r~=5K*fVpxqmxc)t(>!y90LU8NsrT z1l`nODgX#cK07W`K?-Gp8`7}3NWeSQ0@i;EZvo;Mir+e+}0Ut2Fj z_Pi6vtnhQ(v!?#1Y9lV@a>+qz99`GhS&U$$96~&!ZKrdL3Z5s&bxSRs2hRY!@0k5b z`2(a}EqL}8sD7A=SFsyATlivq{nqF!Vsq~}AG~BBeZZ&~I56q*+A;ofLcWFI0NsR6 z2@wcy-Me12WY!lLHU1A)$&Q9am1m7Xn+SVZ=d|5+>(u~U#~PpW%gfA05o2ErH+HK zr_YDi7Nm$oEwpu#A0uG~ByO&kR5lz#md>9A#U}9~&gT8?e;xKX`sdl_fOl9t!fySt zqI;1w);03W`YV~)01V_2*O7r-4HbcTUqp{CN}a$H1C|9d3y(;ooClcO>L@7r4o1Pg zsbdPkJH?pNRdE}Fe(O{X&+?@TUm?P$c@BgWpS-K{Hw{co8b2+JlCXTZA)|}Aj4~{+ 
z$!4g`(^IOtk74BIq~=;;LyU?`xpiioig(Kv*@~47rW}pyAKgQ&QhwH*M*TRJ z^={FHyJ+d1sJR9WCI*+MSfn4LYo>ELi6kS5aA+p~6__O92_NnUfeo>4Df{&kBQIKC zNMl6@?eG8UroXktGJx@BlqY;G|N81E@kByz-R6#wO*%j4{cON+0P*1l!MsYM%%*xJ z73SQKj2O~~ZByRmM|1d{O>a&h&GP@gSY-`4TuqHEdmQU!aJN%PPgOPskYB!}g(L1b zW25-V;_CNWK!Z&F7NBpoX#62R=8kAP4m4Kr9SLVh%V-_ zIh}jNpohI~f5LN=(^ZT3U_2T61|kQ5hFV5F59NoGiqR<4#0AznT!lu;=4)g{oSBn- zeEv3?L5@ge-6cqP1&hz6#x00`@SP}KBh-T_!0A;S?PNgIp+jUtJW5%Q%^?(2C}*2?H8U;tfiH*>E;40hRgpga%q*@L;O)N`PnSj!%K!<%;0qtY2CIB zPwq#gS*Kf!F>y8JUrNLfN}tj2eP0g)bv9K9=d0W`qyskH>On30GMqV6<=^Ut(ZD!e z$z<$;RkQ}?l$RgK*yXFTkEKEk1h`=b zYmOh`p9ROxoBRxI7)mdVr2I?g@>Ohblyd|bi)Xi%6B)WTdqpmTPY>rFV>0KY=eM5i zxZW&7hzhtmNk?l(y0U)h*4+Y{uQXXap{|5KZaEJ(P%c=~HSLDA$D0akw4%m$I|dBW z+a(Y!PU2RKx|qCC%0j7-n%R0Fm`rgG8{lIeHMKrClaEu zyiq=v_$>kB-wi#E7D?x;V8DfcL^gg?x?gCpefy=o?2%pVjb#BN8|QW-&T9yH|HqU5 zFN)6}+uk^Ju9aFAsG;K}=x59Fa`MCVUA#PXHKdaLRt&cK|MnSJ!DO+}8s|4T;A@lp z@6OCWr>w2Uc;Gt5@Q-BB9|T{PlwBrds3!r_`Hr2kO?i$YEyZ0EKEwg#K|31rKcLjL z&joU$jHy@XpDK>OW}35-As2kjM27If@xSCAM`RqWWnf!%-Q{RQP&2^~p_QLZB4PV9 zw<$S2ym?gWtqDP65759wv@tc2K2lNUmzkut>MJ2svjRi0d2-vsU*b@r=Iap=Y{1)! 
z*qfDA>9O;}?i<-G5~ z&OnT5(%6namthI@W#UWF#_VSI@}Ldh0tAZDqUV5BoN>Yu{Q=Fh0lG>XF2kby>I6;9 z^xu@Qw?4(`e=3DW6lmoF&+I>MY(`gIem^IU$GgDk$=py zmjVSf$w=(@ZDtM3!jT^%@k5iwj_%{iVWZ|Vx4T0GT^UA(6d=)+j8l&Z;6>CS3s>%K zvr0)DYvNybK8_o*Ca(-5JaMKGh6i;IA~S1LW!*dNK4#zkl=U@PSQ|3;l9BLAU?<>; zk*rgv_5aNU;8P(OVOxbfKa_tLnUIpRF>S2t@*N->q=Jbd%zg7`dwr2~O7sY+q-)=s z849E{2wBfC9u_(w8+VKu4Kb5xW(>rSO>5RqGfd@#ZXB$o#j+Qc)II4QG=}NZ?lBRk zS{5I4O=3{Sk%<;qCOswIr;f{iG?7f4W{LJi2NSB8ZKFed8&P5u~I zziz*Q;G282=q@m7@C1Jv6?-F3vbj6`Py-fuz$+fUEKIwc$l-xBrp;Cs0C%nQvnK@+0M*GOw4rm|*{eT8B z8efQFH}Z?`UHp_Ym?piDxXuruONPMvHN13 zqylLde)0)NoYv8&X9MbrC$kCo@ZM{D_^B7l1FIon^#iLU z?4|4+`*Hv7RiBv=7Kafg(n|cVauRMjCEaB{h;zbKBTVBL=R1J}*p*dsbSm)mnF$rf z1H0{c`P(d`5MDyz zKA~tLxqF+UvMy42L8L!%<|D;^CtQ_xzU7%%8sLIUxs#+7xI_-OBOrN&A0=fz1A1Pk zCjplf%b~=7Eo)`>U@wcw?!)YZ#1NBf%U7D?BD zbTXPU=?NDQha3vE{I)1eKemV>0oESl++3iDMk)g+qtD6$+{XVE>*z#A-xuWWa`(WR zBKiu?!nAS%|bL>B2MI=3u@-T|_J)Pt>)6XI{S@lH-^hrjVS z);=_8{a9?;6@`L4ST6k(&j&05w1^54f-Rlm0}S(L<0N79(*h?{Yz10P<%4uHT~N(m zAwk5$`vTjg4j-FGj4>OF6~jN4f`spN;Apf&7>e{T#LI=W@G2zvEI*2ylj-G(;HY45 zW;$?IB~<{t`a3DR5C9=`D;k&*?lgPp5^R|jXQP=@ou@idT~63Lx!+?u&d+YaJXT+y zhJ;Op4w%@U4bFKTiDnX)s}L_A>OMEMk$U(A^EqqR|Ml(W`TF7G!HPJE8EKX=Yei_-!t9F-iHB=9SYmhmi>y$K>3*&@!w`yecapng2s^z|9Ys`V!X@uHI{N*-D3Esn~q8ywc{aJxZej_=rKI z@WY|N!4Mm6Z+Rt2tK?a+HKf+e$10RV%Nhpj8i@U|3#8OY0>>y(`Hcv|jM`}uQ4@3E zllo=@!4a9-Xnh`nsh{Ka4zgNZh*4s2F?fVt|B2|V1YZk?O^@_^#cFJ^^mPP;HEVBY zxTTkL;lNAQkPL*rN(*Jx;S9syG{Xia}(3z-ZuEK{OCIl92`oT&Dq5&VR?>_xH`$ z0$`_S-bHL|mUxqbaA-pHyE?w5hRX$LFmyMV+XjzWPO~cU*9wm~j_WtZek^g1_HL{O zSt(ThCq3L!L0De!^prQsqS-SOyg!)DKmyAtm_#7HF#AQB&w2Odxv^J&mK!7M-N+f5 z2#Y=MVjrq|L^zd}$lFSKc9ro$!o@3j(c8|YwpbdBWD&6N+@FKMplBpZVCX30K`%O2 z!mua_brvkC1w$fm?tKg!+KwGmUt>&oBKrK(xSYey4JMqA-yQV4FH%CI2n( zk#kFp8M%H*pL;JSaseLL+N*fh< zYgI&F!{U6xp@qBY2_y0Q8*XM-C@zS}biFH$+B$It8%&yUWc_t)jdY_nR%x8WK0`o`H*>7l$N=(cDW;9$4d#yz z0Ghg0W>~QAr4x6_9Kb`^7b`ROP^Y8u_-PjJ!@Ci;bMutW*HT!qkUyXTerLD8pKkNz 
zWAFY*kc}t@@!-9bfKExDtoQkSw)OISS4-XYjdjF&RC+1d^Ye_r&B>d!BV~Ucw$hBiNCh?-qB;~ZnC-tRXia3pzFSmRCHX$SuYU9G-Ng%c2rJUyC z+DH!`Bo)%VFWhP&YGWbT!tkQobqygm$*A=)hw9dZg(N>tr&dRSNsz1)^5Yw>&Bc!;f zNGcAlw`|5K>L&M9>ZM<=`GWKlzgSS_$53JrMVphB!xB91a$F*8=1^OQQ$T50a(0rq zEwr4g16wlYNOuW#(Vu#~I8LJXS3&cq|D~{(EAh~zWoVbbdspua2&w>6EU>Y~5<-rY z%a-lAOZ{9kaR}lxnsgm=igB?cEVxf-Uqw_zI?GRC=Pe#WvT0YE&4PI56y(~>ec1_zb%>|m$(yqK*7`>$Qusx(f zE2Y+k;43oGFxNKqNpPAs8U%svgF3=TYLV zH6vXWu~3%ay@Aj+oj)1WfRV$cdW7tjXF0=`k}sD$!`x9Bb$pfefoiyKB;4x@Q^G;N zD?vncZ+f=x!}F*m`i;Cirat1PNUeKBL>*?=VYn<}KtTyX2Pm-=o%A;IU1tf)9_P*A zbCF|V{x#rI1H5~=nv^-ccxhTZ9-zY1#&i^t>*9L5i`mwcaEH6kl;ipz=_y-%tIC!y z4hJHdP@7jZz}0o8Z2pd5g|c3rrg(Ln&_>a(VX6mMeR@X}MvMPw!pbZx=lGF^O(!(p zJATUPn#PnE(~WklZn;ckO2}_bom8pki~&dcr7RU4IsG|De6UVG3Ue6H{fa+NETd_r zn{5C0d&QgyjXMN3AKK4>^f4!bIwu;EG%U*uWF)Wnb6l%iS>7Hv7N&th7@O`rn>>+w z)X`8FbKuH0!nn*Z{fctcRK#retFsV#`DD^g8Wc11+4Icc+R=1$gIYctV+A7NtNT-U zmHx}1g=Ims+b}gRj(6;xi!&u-Ut>Ey1k(G?SQ}gq znV7}+JKlqe&-{%g&sy$#s`dO*ah=xVt8I7XxEP2b>&E<>Clgs=4L5vORUy-vbLet z+wSH2m=p*3z@W8dUbwuI6DYt})O=>!qK7mgjY~{lo}5&1R@O;&6jEmeefV|J8H|!+ z%2%wSKv61*777dDm(GN0|3KYT)e?e z#$pV9J>SKWrhMZEP3GN;mNg!1bSsau^38?3;Sa|Tj9v=4a2_`J@P#IHsXQe7D>ALB zR2VP@nR<_Ro&W3L;hg_0IjU6(0_?x+{0{kwWa=EAz0Jor#<90bZbu;r&eoLHrZReQ zT(eEmt!BErO3SsQp~Q#8{}=tZ7ZNnfwh& z9x}ii&sjlEhA=Ht7`QN5zVRU@T?|rGVI`Zt;<9LDs9E_sWpKx4IqXUXjsL@pCaP9c z@19#?qFpWnA&y@3==ZHR;lDUDRFtLksFfJ%{bU&@7wsqt0QOK=xgrhTu-#XHQ4a#p zdK)&ihc)t~L#H-wunPq+La7T&uOeFD#`l#XY-8W;m11ew4QlH(Q||orwNDNW1v?&`(9#f5 zBl>=&jR!$?^)dEQh|f)(_oQdHvj;ByR=824xcf_(7t$$^h3wsPz2|CoM`PySj3&hE zi%1?|eTO2xhF6&3%DydoJDJ;-;qNR4)mB$_gn7X9?eB66dHh#UpKZB4x_WwCQRKzu zL6Z7V>MZ&pVa3nXhgZsj!Obn0$S_Dp+?Shz)T#f9kEy zI8ACnt(jxyJVCEUJFRsIKQ*KQs~disMRmjLm;OROOtgUosD6k#K+J@1p3yQyHgNgvd=jTlv}&$ zmTI7w`MnOtArdbQgW)QEL<`qtQ`yNDO{F&`Bnl;5!4wxhD0v7o z`>fF71{gu8c*i~8}pQZrDyHJZcO{84Rr-da}0SI`2SvED`i(m=D z)9~nZ=zb*aUd&Ew+`bNr+U4pafy`OQ0=5Z#B23=OqRT%?5>jEO95kCZus0?I(PTcB zX!5T=IG6rZXz_z@zFgc9Zs^yPm_dbJWsQ`>4Fy)-&2nCa8$2%iwUO>?WuyCo+yKC;3#)%imN38kge+lV 
zxoj==)&bMNJ-rMFn6abt_$R}%^~$OsJNWi{!ZN{mTn?G;R< zIK{EhRgib64^?2U9i{$#ft@0PHj7b^Ar9G#j|YtGR|b%!%shy6 ziBJreg~$E6Kf;9T93zETUjx}I=oEm=ly`LLxx_Pi1|I|sDTAS%z`ystC0V)hUz&nn z@aUPgt6tG;P{AtUjkBTDEF^VwIv+cqrVR#X>-J!S&4R!Kxseqm>Ne*4yTKZv(cg(b zvCsj^YZ>h3JZr{`luX1UWr**0Uu-Iv4){J92>oUVi>5C7C*)G7nW3Aii?PzdDt$Bl z=GCTFv`HLS;VwqpvP(4yErxV%ouUqiXE&ACE|SRR-%L0vwI1D*5>Bx4l#_2-Vl|=F z_eAZae2!2}B`*4|W>w}MOe}44)YWlEp3mf#by+P#NUU;jz^^IhCmA=m5(VM*yv0OY zq4`9NWrXPNM6<_CL`yK`J3a{B%wP)YGUHo+68`h5-7VpwP5%b>li1A`hWS$al&#A< zZ<;&i=SLj3!@?w4W4C`2BhKX??Ce%Vea!*P;ufc!TVh65KG8(_!8)5a3xN4Doe<2j z_ieIbbmK_sEz399{7}prqt!&1RfN=TE1WMPFPJibhOoOc7k2DzH|BaR@Kmk0XIlq< z-UvIb4u3lQ<|Hhr*;=Bl3dIKR*iv(_qSfCF!FaX43+Zak-T4L!|EIiqxPhqfE<!-Gt{XJmv zZR|Ghl#gyrskT*k+Hb5dN1*X~LGSUe$B%tG3;Zyh5UckL#6tfRtj%UXMBzv2+3AWW zdq1)H!;W)ox&=QZkf6#7^`w`Y=yTy zZ4?|GTR6!o51#vmYz#5o^3a)$bj^epVs+&>~=v4R2^MyBC4sYM0b|f=f z`Ec3xw{UB}l+ZC`*tH^jg5i=SjB8O%53Zuft)P%va{YNKeet>fO@f3l(In$|qA>N% zidqc^Uq}c>5C^^&U1>xqt*s@_kiHnAfvNgF5eq`r`)f!PIf*ul!*Ld390$`*zN z>2&fx^nP`84*|rNGnR+^Xsi76-juo|jIUZ%Of#=Jar=yBJ19RZuJ$P}p_}>|VrIQw zNZfa_=e!mL%h(d2B#VgG)P;Y#zRKip3b{jkNbpII^?QUx{JdcScP98e5aFwwv`2J@ zVFZ42_gxDofaf-_!9(nijT_>`@NE4j3d_vXgzon$;jao7G?0T%g z_02KDy)kx`E1Q2`%zpbUjkw8`9zi$4tXz8jA~Azw%DLtk#k4T;0^qGEo`o)@{4uKj zv#RLcAkf6Ul=g4fU&wLcCnL3cM!cdfM#NjUFKS6(bg{u@2Cax;+`kl;p(+%kxRD=Q zc8AZ$5$Lm)yhWD@;|}@~YIDXyNP$v+ycTtN3^A#00^d#6f9a)aUH4(4cTF~EVDYuN z4R$*S3jaHWU0f2n(}Y`k6`J;S#~|`5>7d1)8=Iew&gc;4z*$Gp@@beHVZiW3#~Pm+ zBNf+%e_^1m%>mULM~nJr)6SpOzOUA$Pc`X7^*&AXh-+mt>QN~66d*7?6hH1^ga&x?APYD zd^n2sTB}Y2OxtPfn|5GQ9)`>M?e-^MEZS*jRTUihcF}zSC9B_&{DMkzaqi!4!;F^w zKS9?IF-KSFVTK_wTh<(M{J~~U*hhwGis>QSU9+Vzm73_8tI8p!Z85edc5mYZW<-J> zgs~xjjAi56JK-({0|wIXW7y_gYO^B?-b08!{E&(3HY`xw^j>C%&M3c09N(Hroo(`| zynFd|BI}Ali*b&RpO4sKJ`$QBW;2J)RnUgf`ZiTs$$kv1PNIekP53|3?6LT6(bOkH zU=aLtO}*-nn&FFnH|%Jrz|68OfWiOro>2H*U0BmP4?JWcPAD!d!Q;5-5&7jL#(>; zey+X0hOhr4`{bjHk6Yv@ve)0(Y#cjCgBeI>PJkH`{9l^*o1dt>cX3(g3r5-}NA;#$ z=7-pFv};H_@p$-X;I0rF=C>QQw6i(fHXsDvt;rC&v)O@dcHYX0x5V9hm7BzyGtvd! 
zh|q&oiEqmd;g?RPud`1Sk0~0LGrBAqH(xQ}!9L>ATf#qb&DE>_r7%v39**Q$Btw}U z8h;B$WpV>i?4p;I43NDYtzW3n*V4P+FmBWOR0f*`SQ38zp|bfM)w4^%l2-aQX0DC- zo_zSiFU>fnY<<1MjATGbKHK2;EgF7^l1>U0{L6y@(QGA}s789m;kPY9ddC$*Nb=9d zPykGKqE(_n;i23=%S7J_)(6FF8bcLwk%ByKW3{MEY0Ct{$m#=N^ng_^l5Y4PS#s5GH_?{x-a6uB^* zci=7`{Fe%w@>*2vTGx=9uOkUt(BSu0q+Bja)05GbOfpde4k7U5WikSs66L`HYHaw} z7d5AQYzVS~nlieDD3`eWy2)>D9XhTyUU zh0iW5&~+Q^#E%<{h8`(exbTgtOqY{T>tSfJ4|y7Ijr?8E`$#|H;Fv2+4N1)T6{JEP zQ`F>L4cCUTEHo4dwR|%r#ECTC8uusac3+-#h*F}tkD0E{=%$)MN`$r=p3wn{_M4Td-mUbnPS$f8go>011&YN0hX;ZU_dn=qrfQ8nAbcEtRo83&M~I3kfSWcN z=H-)fI+g+zPD!MIth^CrQtDIbSyq}ErS!A&;eDDOl}=dzL$_iId!G%MoQfh#DPXI{ zEc=5pPhGYZD|Q**R(R#3nX%haBrv56fe4nLsf>qZ_TWx+1It>!jv~8{J=c0!{c$%ALZt zL44*iN*np7|7k9n(xbZQ^j^a#F9{-+2FzU2HHhj(Xk5~VDBgTz)A({Yw9Qj6%|asn zL}h?RnEdT)eKOT6v}SgS2unW!yHqkAbko%z7DOSmE~@XeMo~#qPH?LS%!~M-O}2P$ zIR2iDnEUnLA6Eai(<}O<<$5IcO`?2upGb){Ku*d5WSp#xn zJ@p(zUTl6wJ}*FM$?Fxd2UA9F)p{4W5RhcV>o;dw^2zh@?IY!uNe)<>CTcXla;p;b zZ(Lr+Ku5-WcM7cGn&@qMm!gn^_k13d=}dB?Q?`jS=Fw<`g(RZc>=&&R#kF{~KUoV8 znHo#2ikCUoe?`MD-$=hCcmJ9H|IGy$GDeTu@^Z2vVy4K647MBqMeb+O5;B~N;oPr} z*he>kkk8a;;Y$zUa&x0%i?hfbv!=egB#f``6zRLy2&RAafJ0g z7x*2VtDI4dsjqtqAxW^wET15JYr0nWus9O1TkyfeWPcbO&y$FzNCp}M<$ie~{56x9 z5@;}kp~ie**QVi0fg8GTVJK!6m@SnGM3Up9rFEzS;utY*n6gx=zSJ;ULUiSKuvM2 zH#*CkWZXWCOw_3uiKmSh4RK5&o=LHPA$$-y&SF;6Z7?y%3DePsz{T<;Feb@8C;;13 z`hC)xx=OvGsw~VWiN+bf!)}uat07*tN%!tFez=EGMlmV9xC~2;T+5$t<$~}4 zNw^ahVVp{VfUrD;5FFiottHXKPtic{%T3ma6i6`=OJmP<)J*G*Z6F7v2|uRUJM^rnEmKS=kWWY=HC+|66Ml4x+ix1q5L1^%6)jYg>BVNzDmM9 zv80?c2f*wXwJv-7#wxxm$aBL7h~Q2-P&IY_@P^}T?LNAHi~ITZJsg)yu5I!50mM6? 
z^gZQ^;W1_MuWy$H;DD7$Dn2FYrpYKp)}x5kmH(%$^NebudD}QuiVzY)CzQ}a73muU z>4q9Q3Zfu_pcF{}l_LFtgc7ML5IRUzP(e|Ol+cu>zyktOLZqp5M5Mj@{6D{6ww*mY zJ7;J1%-r{N{bu@`nDV}LkX%an=eFiKe7J-J-pYh5bd{V-SS*+Wa0x=bh2NS*F05_s zHB?2Vc5c7KxZxJxwVga$k8mG69zAn}v4d~{Q1AR1t;MYnEgjYM=DLNv{ZitAL0P-d z`cCntP3NhzRbMayZw34ik&4c7yDH18^P6w+}A;4>oI8((QrnZV%1 zFVOC_Kjz;{c>U{tHZaDI#*`ej>sne}Rm;U)L@3MabA}XT?gH_3fV)IxN8FNp zx5vM*ulaVtKJG_TH2m*C4Ih?rhfcdswOb%{5+{$JNVO(Cc3=uJRWqERhrYb_ZLMwvfq4hXQ(M$S9J8#!tU5H z=EyHdUooX!o_kL8rqjZ3bzA1BSoqL=%;@%@ciWx6aUbj^Bb@eJqbfsE2xbC$%ncYl z-Z#J=v{#G8mfBZSP>4>DHV{=@m2g??KDs`x_*n+^wfE}z@a{oZIIgHS6*!cdSCHIX zdba!00!M)z`{e|SJ^PosGV3sIfVf*if(J^>R9wyKQs#_z6vT0Bp&K%igkKVjTvmLH zJ7iFwE3OI$eJzxY?fw4V^b%rI@COd!T$GErE^n@cjnT!H^&51T%p~B5l221@T%Tq@ zSsLG5Lih<$UV1>prz`ZSOxryS$nLnp{LsgIm^2{QS>o`rZOuIe8 z(de&<&{J0?v$iH~{_cz@fo;VJ4Ep0r+TUs8>cwc;yZqZ3>S$YK6FAG21ru%*0bzW8 z*QYhSI%1H~<~9MYlhe>|fVLi`QS#d@92&2A=kRP^j=l!!kOMIB$}DsyBB~IL2vi?TVD-IAK|(QGwvZtecc(vn^k~^^pz>EH2@q%sJ!g;!Y<6IGSOZ^R z$&)<5u&)N%^n*UNK8^9UwGkrXT?t2q2D(ZAI<@l1GInj*;iD2(gM#E3F@XH5;N$e5 zkKV=TN7m)x_exl8+I;6`9%9DE3;YxIQPM9#W>J1l0(~*}P0eyau{LVF6cNm`ujwcW z39g*h_Iq)bEKh(ZzVMD#4>~aqWF_;i<&U3=bJ#;!j?WOIFyr1sZmh@%O8Tjs>^&|a zW)cjwHmPM1Rert`m8-T=>pt>QE!G7YP?Huq;xD&%%+C(1pMAOh&I*`F*ilrc^C)9Cv#14oTgTk(OT;~t~B>n&F;uER`5|b&zMx<djH~ zY^h$0uP{0nEO5ORkvhm{YOq8BUBCWWct)PQt*RN(D79?fg#kUL16HM_0&bYWR%C(r zmBJM#lKf>iJ*CAk5LCMeHr#LA$GM^aGS>=bLiG6FhqjZ*n`EJf6Sxx_;5Ul zs6eGhh|gLBO#7f8eJ@h&POs~k&$rF+SF@9V@SkNa16wUpA(ZGsU*?Cs4D z8=q6V^LZ~JJP=kBinOLY#8YH9`ijlG!)GcO>(Dtnw)!PNvz%BuN2?cNdj2_y2&6FQ zoDF(56T`zmb|FY!#Cw;YtQ%^KZ$5LL^7g^CEu~b%X31a2N@nNXbMt_E+7o9 zYc>Aci34ZukGsX<5hSj$$QEsHmTb?C^I@Y)y|NLeZnU5Sh1kIod8AT(Rq>>IhHB@g z?!$uStItr~mvS0prXCT?Dx^&^CEt84JEO1!!yC3*JrboCc(;x8WPEEmN_8+mly_Va z?#UDFw$x5-WQ;-jy?%9-Tqo>dAd-iOhK0xLiTHrg>*!A=J)iXk*RfWJHD+^bi`eOE zCW>5Zca8`wLc~w%7C9)h;%JgCJ0+#`A3$~e-_AkCmlxH)J;5Lv_w9dL@gj}yOnB1V z)sj+eSg<9073@@5NN;ZCMOiy5kL76HSYJn@;oDe}R>Qr`m 
z>YWwO3b4IMk;jdkS(&m~FLAg>H$fj1-OCF0T)Hhq8>}-pmhd%vQ2Je5^8_Q*14W+^G>UVhJJ+9KDT4%l3;4|k~w&{na zsRJZI>z-0dOfy5TGPNof3$>#9OhXON&#f~#cxJ|h`Pf`e-q{=Q5iYsLF$U8KgNY&J zndMS}DV_FzWIeCjK!VY(HZWg z>q7pjyC*N{!1j**%uKvI4B&+(KK4q2#Bbd~N&K3Y_ouq&cj+qJ73IS-NHSBOE8^$& zF2tE$vXz2-wmg=5xD?L%O9qn(?7aczpoFt~ZR{K~n7C!ltL`!O)4Gm0irID*TjOTD z0|KWy4EFe2&?{NL^gjrGx0FTyM4|Q?0cmc`2YMu5L0Z<`8dD04IAD`I7-vM<=iOwY z2V|4g_Z_IV=91W4Z+1#&X8N{&_BmYIi*9=;D-(N&55A=<4aIO97SI{U$3w)(r2}kh zVnXGe)AFq!pSN}A7!{{4HnJglFQ$EN=3g?sn8b|}AU<2wn-9S}>PWF_nD8C49-#(r z-eVY}Js_dzu;u#>V0QfNcWEox3o2Eb=sOg&NwQjEa__Vjy3IhvzBu@SN~Im4M4v4D z{aLU#*V*DpxuY{Uk3hr3U(ERx$cRat`Mmx#Idoui!~ej0nSw!hs%|bc)3VB;CI!IT zcDr_6OShXyhxDN4ehRMnR_-Sa^vEZBe_Swv#i(<3qV37UT1M~N10iQWhW6VIsCUS&7GE~A(!*F@WKzUa2_n4N zHzT%VV&<1|Cct1tdb+_)C|v}EH2Dthr5(;JJauJ{G>UbG*?yEE^~MNrTnc~0?@z7N z{rxYJBtVK`ik7_p0aN zsUl#ib~%t0p7&Ivzn&l)R_Le_acTcOf`b86rgZt?O_%#h$E$Tvft|i7(va%wKuEU7 z$E=TSqu?Wr$N)`%4wFl{fen?FQ=*EKOFEaOus-R<6kI9`vzMo(x=FVUJKmofpKJ^V zlB^&fRnBuZG7NBnj$i~TzY7TyJm9vnRcVNo<;Jv|@Ssx!nt@`OM8R(1tGz-Nh>jaR z<&ziF5Ji1BaxjF{yXa}rs3dVIhe99a(wzi*&h1Y;R#4{flMmS7iGrocV|lIu);_$xBGmS45)O2M?)U%w?@r@5mDgTR`JslXfL-Ja ziOCFFr#C^NRddw#Qi z#}iFwVgoUdAqi{JQ|GG|>64!TK0oP380a85+AZVR;t<8S*Nom{1whnPC$lkG@3{Y$ zINx$Sf3|Cl zp$@pcVcD;PKy-h>m34W`=Q(qL2hl}JBXkB5iQ@%Lhy|tWT*M8k?D!35r6H%YrVnIF zz)4xX6ekuV1=)MY(MkI2e#Q^QApGL(0Hr_F-ccv;PeMtH83V%PG&zzIlPIajnvkbj zz|>{w6=M|A=i*hTk^rQf*U5(gSEqBMGpjpR`Ka#f(e#K)6>z$F^Y(0TFl|IqMiL@{ zs$>tq3AC&_^}(XzOF0p``@##vt_3R=BS@l_2osvUy%?D|O)Da0p1J;_dx75DVCa_& z0dY-2$}dn`0`~w}3i&S!ACIk}^WIL@_}uh7_Ak@;*Krrbxj-5I@cGRs2xe#5Ci7fR z5>Q_`z{fHl!$l^XRhqX!_^P8>srovSSWqt7EHY~q2xEoSco)Nn;!A6c*s{R@n! 
zpGKSj8jQYus#PMzUZyvr^WK%%qpPrI#)i$N5xPh@g@E5F{smfL{Uvi)ym6a*vVF>v zx?LFsmb)XAbRV1qmMNe*7Rz%Lk#YAxhULnouP)@07)fJ<3eCsru{%6rG#7|o;ch2B zNTu6D>~<#oCuU+r|4-10^qVyPrcafz=PxIODoj!rRv}5 zAi(X{+>j_Z%RPG&izI%{QgewRbm(goRxyWz654Vb8BGa&D5`!cT7D9R2S7{?RC(x# zSo4 z2jvAMpiY(0<%^e{eboSTJG5GRK{0$>CJWClaHGgqqIouYEBwRsb*2X|Hxx4B+DHjr zZ6F)+Fi33UpMv9wB{k7PE|z0O7q#N!h>OD=JJ9bcWzB!W_9lW(d!m#ttF^fZw}f#$ z@4aBSn-S=24#gCkT2zO>)hv4Y7zDC#Y?ybs%pJa8nTT}R4$&ozZ*OaNuFkZ4g`r#$ zBkgANtR{30vsa(oitae#{jqFHt}ERd{2sEwe{fOh;oOnVbULuJP*iA=DC*7B2laZ8 zpmP|u*CaUe$U71&dP<0}3%s%-wJ+P4?>l@s_fMlV_5-EP+KtO7uT^5Gf%YTp=hW}W zD6NYaSu=(bota<=m1*=W%zJ95 zJr5XO<-X-M`aLBj^hL(gHfLd0?!;MIG-;vp+E&5Z$YStK{%@@X0aV3*?(4NQ+7Ts9 z39kxfj~b-0wId8=K2rsM-1-`Zzbi~V+CgJCD%}=v2fgDD+j4)OtUMDry@1$$-DgEc ze%!z4zd0UpNn=j!)40yk+qYl!lYm_%?EB{*N2okkj#xn9L;-OJ5BHX9*%JOB*+gN* zDP(Jbj37LR9y&70ZVwnNjhYLM*tlaHMpo8;ql?vY>#FMo)!VL_k_HGDRpg`Zj;Gge zP}M)`l6WN8d#vt@W6Q>l%6tm;iGWCCcStO+BD z_#FyV--xVphOitdvctW%_97zm^26#f0-+V5rV&o#D!Qj;5*4$iicw8c=qO>1;J1b zVj%X0LXm=0d6MqciY~ox@8A63xM^m~8=#HCG4Og&h{W&LYWo+s6SGX(-K6u7XxwCF zC8t(?9&jS0LZwioA&vb+gnoSbxw+goMlRI8t%P-nRx)MFsTPKkDY);khZrcaWV;|) z2E||glptVdg+ZKCtfpA;i<5Q?|5E@^j(5V%a>-drSf#5Tvu#x;VH}x!jnrp5FjPDu zrsy~Sc0`F5I-PMO^D@(rf4O=T@=;)VD>XY@KbP<6m7S#HNE6w;1bp4{F}d^a^Qnjw8KjSIgj!?$sg|48q%Nq)-o_P{VQZ5G30;9{mbQ5ee#r) zz!b;d(L$ert(=#kac>o`z^H#jn~sznAsYqG#}^#e>3V(tEA7)3xKIte%*w{hX{%GyzD4%sOH@_Cy!oshf2Pg*bIoKY+c|}T(|q( zuVmw@!rLj_aAPY8Oe!!rMQhctzBunU`;!4N#JVH1-6Di+c=FtUpkPQ}Tt(xK)6Cr7 zTB`juSP9^0_iS|%u9H1|WqCn0UK$}TnZHl+&}l(ki|Py=#19ug7N?jwn0z$JsAcl% zw8=#bpCw#=N$hWP{#cZ{)q@hiWi+7k^-&x_bqmw5#9vGzW5*2CU8#0C4y`O}L{P;G zC2`!B?2Lf^GI>mlth^QJUUFtobC#j-qLqz1RVrfNoKb{%f^^=Jt8c4-|8tj}686op z9mKzq*~#+nR4g3j-&^afPvmVM8!9#RTNA6$`+`py(-T07ILpZE>)8}B0xfWDsc?s;2cSPdt@{3XGZ zM13Lg&j_V|N0L9uPf#9gbYYrE~u7CD&x%5GDm02GvM$I=nwTFhA2*aB?UcdVPsh;8Y8=!j));R}R}NNFAm;pY6#WK(j>sFPeU;RQ_d0d=#^@g4g|{{SDZ zNQ>5|o)U1>{>*L1k~LjPV@fSnHn?H8QivY5qF6C4>FZM 
zk^D{Ts#e-f^Qv^$C+`i|aED{^a$uwhEFy-P^`K2#2dSNI#2VG#`+23gcPK|R8P2knDx_4eu6NvA6B8jtSvNV8+zFN+2f$k9lOFvGUHZ&YmpC}z`79qP*ld`}XzGVsy3E6o%wb z!4VZ+Bx%kU#>yWjC8dkv&hWnBzPiTRX8Wo9j3B?rf-3sgZAv?y_FEcYag@v^$1>?Ua7JIP?Jq&kI2 z4s7#Wt!R|z$DI)jp@Vp=8`&CB=*!~=+eW^-e)7q5MibRcJ?<))iE9)*>>F9uG)4qp ztfZX1*4>z*$tuC&+M`k^jK{V(es57}OZedk-a?ZAP%9XW;%;feu)_21}J{c%wzaA=ckB|`VX&muuXF6&*wGGjGqpHa|GP2dKAkd#`#uxPC zSBh?)n^!PJ`9v~bo*+7^*7Ot_rTkJ=Sc>Ygz}(Q-3+|jCaLeT?nn$t@!vPE5;45B*&*6n@T^;z1>Y%{ z1%Oz;iss4KcSUQGLzkK3Sn~P(YDeWDntJ{}4HrJ;&DObRy85NNI*oICe+g}Zi>+;A zn1_1C#NIWTi7@9*;?FKptI2>Hy2IW$rZ|HkWd`C-x9t7XtiGN|X380|v*=J?qpSiG z;6euZ|E&>#pd5VUEI49L>_-8Chnav#qSm(*gm=LAfp3a^C45c0&swo2WV7q*o-g!_ zRSRx3Ke3#*6h%hy9tH!jCY^mDRQ?LFL{>} z8r9N6l&FgAr6@1vIFIHY))@!%-I~dZRY`;K#Y;TzWpR;7uTW;TjGgPl=5*fr*b^m0 z^8=&7oZwLE5tI8Pe14+jy0wEfd4)sx4i+Sp*^B>`9dA>9iv0t6Gp)R-tD5E6FjO@V z>Tu7vR&><&sZ;%qNL+LcBbH# z#z6R}S@``bc;b(7@ta8?DOP&xTg7WD;xMl7GAggNN?Q<@L2O3fX8}`3Q}@-|!uw3g z;%qCS&2?SDpgrT2xgTQ?VR7^0$4_}>{hky#Y}w!u=*#w>0JYxUAji*7ZzKFLv_)@! zfwnk0`@v}Tcz;lve0sk(RaJ&r5ciSa^X4$8<<827P@S;KhPrUB`~sj(RBQO25dJ^q zwyW?fLS_KO(mNeiA@u8B^D=tqP-7Qhyc#yCeyq2sb?tQ6n0z?o`xG!cHxMi>h^zkm znCUbSy8MlFANy@&S-miT!4fn+TN>rQ*?9PG!um2GlL>0X#(h39vHXwGx%>oZ7OgmJ zG}suP+T7Hmt+)|ZL7uupsS@lrB{Wj(i%uH_bP(UZKAN@^dY>EBz2TT+nGdUw@W<6l zVs}>VRumlT1w9(&EOgYxH0e|fS1chP4OY-%^@99VyS*q?4n=eAexjy&APeqQ3Xm;t zP6nmMdeOozz=SNNba MGr=0aFm#XoKUfQ+`Tzg` diff --git a/docs/chipseq.rst b/docs/chipseq.rst deleted file mode 100644 index 5302e973..00000000 --- a/docs/chipseq.rst +++ /dev/null @@ -1,33 +0,0 @@ -.. _chipseq: - -ChIP-seq workflow ------------------ -The ChIP-seq workflow starts with raw FASTQ files and performs various QC steps. It -aligns and prepares BAM and bigWig files, performs peak-calling, and combines -everything together into a track hub for visualization. 
- -Specifically, the workflow does the following: - - - trims reads with cutadapt - - maps reads with Bowtie2 - - runs FastQC on raw, trimmed, and aligned reads - - Removes multimappers (samtools) and duplicates (Picard MarkDuplicates) - - performs fastq_screen on multiple configured genomes to look for evidence of - cross-contamination - - QC aggregation using MultiQC, along with a custom table for library sizes - - merges technical replicates and then re-deduplicates them - - creates bigWigs from unique, no-dups BAM files - - optionally merges bigWigs to create one signal track for all replicates - - runs deepTools plotFingerprint on grouped IP and input for QC and - evaluation of enrichment - - calls peaks using macs, spp, and/or sicer, with support for multiple - peak-calling runs using different parameters to assist with assessing - performance and to help make decisions for downstream analysis - - optionally runs a template diffBind RMarkdown file used for differential binding analysis - - converts BED files into bigBed (or bigNarrowPeak where possible) - - builds and optionally uploads a track hub of bigWigs and bigBeds to - visualize peak-calling in UCSC Genome Browser - -To configure a ChIP-seq experiment, see :ref:`config-yaml`. - -.. image:: chipseq.png diff --git a/docs/conda.rst b/docs/conda.rst deleted file mode 100644 index 1cf44f84..00000000 --- a/docs/conda.rst +++ /dev/null @@ -1,209 +0,0 @@ -.. _conda-envs: - -conda and conda envs in `lcdb-wf` -================================= - -Conda basics ------------- - -If you're not familiar with ``conda``, it is a way of keeping software isolated -on a computer in an "environment" (basically a directory with the executables -for all the software you want to use). When you "activate" the environment, it -places that location at the beginning of your ``$PATH`` variable, so that any -executables there are found first. 
It does not affect any existing installation -of any software on your machine and does not need root privileges. - -If you don't already have conda installed and the Bioconda channel set up, see -the `Bioconda docs `_ for details. - -You'll also probably want `mamba `_. Mamba -is a drop-in replacement for conda that is faster and more robust. In fact, it -is now the default conda front-end for Snakemake. If you don't already have -mamba, you can install it into your base conda environment with: - -.. code-block:: bash - - conda install -n base -c conda-forge mamba - -It's recommended that you install mamba into the base env (just like conda -itself is) so that it behaves like conda. It does *not* need to be installed -into each individual environment. - - -Building the environments -------------------------- - -**It is recommended that you create a separate environment directory for -each project**, rather than a single environment for all projects. That way you -can update packages in each project independently of any others, and yet the -environment will always be close at hand. This is an especially good practice -in shared space as others can easily find and activate the environment specific -to the project. - -.. note:: - - We recommend using mamba rather than conda for the speed increase and - ability to more correctly solve environments. See the `snakemake docs - `_ - for more info. - - -If you use the ``--build-envs`` argument when deploying lcdb-wf to a project -directory (see :ref:`setup-proj`), two conda environments will be built in the -directories: ``env``, which has all of the non-R requirements, and ``env-r`` -which has the R packages used in particular for downstream RNA-seq analysis. -These environments will use the fully-pinned environments in ``env.yml`` and -``env-r.yml``. If you've already deployed but didn't use the ``--build-envs`` -argument, then then the equivalent command to run in the deployed directory is: - -.. 
code-block:: bash - - mamba env create -p ./env --file env.yml - mamba env create -p ./env-r --file env-r.yml - - -.. _conda-troubleshooting: - -Troubleshooting environments ----------------------------- - -Sometimes there is a problem with creating an environment. For example, the -exact package specified in the env yaml might not be available for some reason -(this should not happen, but in practice sometimes it does in corner cases). - -If this happens, you can try a couple things. - -First, some terminology with how packages are specified in the environment -yamls. Here's an example for ``libpng`` version 1.6.37:: - - libpng=1.6.37=hed695b0_2 - |____| |____| |________| - | | | - name | | - version | - build string - -The package name (libpng) and version (1.6.37) are pretty standard and -self-explanatory. The `build` string refers to different built versions of the -*conda package*, but for the same version (1.6.37 in this case) of the package. -For example, if a conda package was built for version 1.1 of a tool, but that -package itself had an error unrelated to the tool, then a fixed build would be -made. The package version would remain the same (1.1) but the build string -would change. - -In this example, the build string contains a hash ``hed695b0`` which is a hash -of all the pinned dependencies for this package at packaging time. The -`conda-forge pinning docs -`_ give more detail -on what this pinning is about, but basically if that pinning changes then this -hash will change. The ``_2`` on the end of the build string hash indicates that -this is the third built package (build numbers start at zero) for this version -of ``libpng`` using the same pinning. In other words, there also likely exists -``libpng=1.6.37=hed695b0_1`` and ``libpng=1.6.37=hed695b0_0``. At the time of -this writing, there is also ``libpng-1.6.37-h21135ba_2`` (notice the different -hash) which is the same libpng version but uses different pinnings. 
- -What does this mean for troubleshooting? - -For any package that seems to be problematic, try editing the respective -environment yaml (e.g., ``env.yml``) to remove the build string (so in the -example above, you would try changing it to just ``libpng=1.6.37``) and try -building the environment again. If that doesn't work, try removing the version -as well (so just ``libpng``). - -Alternatively for very problematic cases or cases where there are multiple -problematic packages, you can try creating an environment with the "loose" -pinning in ``include/requirements.txt`` which effectively does not require any -particular versions with the exception of a few corner cases. Keep in mind that -using that file may cause the environment to take a while to build as conda (or -mamba) solves the dependencies of all the specified packages. - - -Conda envs in lcdb-wf ---------------------- - -Given all of the software used across all of `lcdb-wf`, the environments can -take a lot of time to build because the solver needs to figure out the entire -dependency tree and come up with a solution that works to satisfy the entire -set of specified requirements. - -We chose to split the conda environments in two: the **main** environment and the **R** -environment (see :ref:`conda-design-decisions`). These environments are -described by both "strict" and "loose" files. By default we use the "strict" -version, which pins all versions of all packages exactly. This is preferred -wherever possible. However we also provide a "loose" version that is not -specific about versions. 
The following table describes these files: - -+----------------+--------------------------------+----------------------------------+ -| strict version | loose version | used for | -+================+================================+==================================+ -| ``env.yml`` | ``include/requirements.txt`` | Main Snakefiles | -+----------------+--------------------------------+----------------------------------+ -| ``env-r.yaml`` | ``include/requirements-r.txt`` | Downstream RNA-seq analysis in R | -+----------------+--------------------------------+----------------------------------+ - -When deploying new instances, use the ``--build-envs`` argument which will use -the strict version. Or use the following commands in a deployed directory: - -.. code-block:: bash - - mamba env create -p ./env --file env.yml - mamba env create -p ./env-r --file env-r.yml - -When getting ready to release a new lcdb-wf version, create a new environment -using the loose version to prepare the env and then when tests pass, export it -to yaml. That is: - -.. code-block:: bash - - # use loose version when preparing a new version of lcdb-wf - mamba create -p ./env --file include/requirements.txt - mamba create -p ./env-r --file include/requirements-r.txt - - # then do testing.... - - # when tests pass, export the envs - conda env export -p ./env > env.yml - conda env export -p ./env-r > env-r.yaml - - # commit, push, finalize release - - -.. _conda-design-decisions: - -Design decisions ----------------- - -We made the design decision to split the conda envs into two different -environments -- one for R, one for non-R. We found that by by removing the -entire sub-DAG of R packages from the main environment we can dramatically -reduce the creation time. - -We also made the decision to use large top-level environments rather than -smaller environments created for each rule using the ``conda:`` directive. -There are two reasons for this choice. 
First, it allows us to activate a single -environment to give us access to all the tools used. This streamlines -troubleshooting because we don't have to dig through the ``.snakemake/conda`` -directory to figure out which hash corresponds to which file, but comes with -the up-front cost of creating the environment initially. Second, it simplifies -running the tests on CircleCI, allowing us to cache the env directories as -a whole to be re-used for multiple tests rather than caching the individual -.snakemake directories for each tested workflow. - -Given that the conda and snakemake ecosystem are in flux, this may change in -the future to using small conda environments for each rule separately if it -turns out to be more beneficial to do so. - -.. note:: - - Prior to v1.7, we used requirements.txt files with loose pinning. Moving to - yaml files allows us the option of also installing pip packages if needed. - It also allows us to specify channels directly in the yaml file for - streamlined installation. - - Using strictly-pinned yaml files that are consistently tested will - hopefully result in a more stable experience for users. For example, if you - happen to create an environment around the time of a new R/Bioconductor - release, the environment may not build correctly using a loose pinning. - Other transient issues in the packaging ecosystem can similarly cause - issues. diff --git a/docs/conf.py b/docs/conf.py index a8c11dc9..2f653095 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -34,7 +34,6 @@ # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom # ones. extensions = [ - 'generate_guide', 'sphinx.ext.autodoc', 'sphinx.ext.autosummary', 'sphinx.ext.doctest', diff --git a/docs/config-yaml.rst b/docs/config-yaml.rst deleted file mode 100644 index ad1d3fb3..00000000 --- a/docs/config-yaml.rst +++ /dev/null @@ -1,587 +0,0 @@ -.. 
_config-yaml: - -Config YAML -=========== - -This page details the various configuration options and describes how to -configure a new workflow. - -Note that the ``references:`` section is detailed separately, at -:ref:`references-config`. - -Config files are expected to be in a ``config`` directory next to the -the Snakefile. For example, the RNA-seq workflow at -``workflows/rnaseq/Snakefile`` expects the config file -``workflows/rnaseq/config/config.yaml``. - -While it is possible to use Snakemake mechanisms such as ``--config`` to -override a particular config value and ``--configfile`` to update the config -with a different file, it is easiest to edit the existing -``config/config.yaml`` in place. This has the additional benefit of reproducibity -because all of the config information is stored in one place. - -The following table summarizes the config fields, which ones are use for which -workflow, and under what conditions, if any, they are required. Each option -links to a section below with more details on how to use it. - -================================================================================== =================== ================ ================= ========= -Field Used for References Used for RNA-seq Used for ChIP-seq Required -================================================================================== =================== ================ ================= ========= -:ref:`references ` and/or :ref:`include_references ` yes yes yes yes -:ref:`references_dir ` yes yes yes if `REFERENCES_DIR` env var not set -:ref:`sampletable ` . yes yes always -:ref:`organism ` . yes yes always -:ref:`aligner ` . yes yes always -:ref:`stranded ` . yes no usually (see :ref:`stranded `) -:ref:`fastq_screen ` . yes yes if using `fastq_screen` -:ref:`merged_bigwigs ` . yes yes if you want to merge bigwigs -:ref:`gtf ` . yes . always for RNA-seq -:ref:`rrna ` . yes . if rRNA screening desired -:ref:`salmon ` . yes . 
if Salmon quantification will be run -:ref:`chipseq ` . . yes always for ChIP-seq -================================================================================== =================== ================ ================= ========= - -Example configs ---------------- - -To provide an overview, here are some example config files. More detail is -provided later; this is just to provide some context: - -RNA-seq -~~~~~~~ - -The config file for RNA-seq is expected to be in -``workflows/rnaseq/config/config.yaml``: - -.. code-block:: yaml - - references_dir: "/data/references" - sampletable: "config/sampletable.tsv" - organism: 'human' - aligner: - tag: 'gencode-v25' - index: 'hisat2' - rrna: - tag: 'rRNA' - index: 'bowtie2' - gtf: - tag: 'gencode-v25' - - fastq_screen: - - label: Human - organism: human - tag: gencode-v25 - - label: rRNA - organism: human - tag: rRNA - - # Portions have been omitted from "references" section below for - # simplicity; see references config section for details. - - references: - human: - gencode-v25: - genome: - url: 'ftp://.../genome.fa.gz' - indexes: - - 'hisat2' - - 'bowtie2' - annotation: - url: 'ftp://.../annotation.gtf.gz' - - transcriptome: - indexes: - - 'salmon' - - rRNA: - genome: - url: 'https://...' - indexes: - - 'bowtie2' - -ChIP-seq -~~~~~~~~ - -The config file for ChIP-seq is expected to be in -``workflows/chipseq/config/config.yaml``. - -The major differences between ChIP-seq and RNA-seq configs are: - -- ChIP-seq has no ``annotation`` or ``rrna`` fields -- ChIP-seq has an addition section ``chipseq: peak_calling:`` - -.. 
code-block:: yaml - - sampletable: 'config/sampletable.tsv' - organism: 'dmel' - genome: 'dm6' - - aligner: - index: 'bowtie2' - tag: 'test' - - chipseq: - peak_calling: - - - label: gaf-embryo-1 - algorithm: macs - ip: - - gaf-embryo-1 - control: - - input-embryo-1 - - - label: gaf-embryo-1 - algorithm: spp - ip: - - gaf-embryo-1 - control: - - input-embryo-1 - - - label: gaf-wingdisc-pooled - algorithm: macs - ip: - - gaf-wingdisc-1 - - gaf-wingdisc-2 - control: - - input-wingdisc-1 - - input-wingdisc-2 - - - label: gaf-wingdisc-pooled - algorithm: spp - ip: - - gaf-wingdisc-1 - - gaf-wingdisc-2 - control: - - input-wingdisc-1 - - input-wingdisc-2 - - - label: gaf-wingdisc-pooled-1 - algorithm: epic2 - ip: - - gaf-wingdisc-1 - control: - - input-wingdisc-1 - extra: '' - - - label: gaf-wingdisc-pooled-2 - algorithm: epic2 - ip: - - gaf-wingdisc-2 - control: - - input-wingdisc-2 - extra: '' - - fastq_screen: - - label: Human - organism: human - tag: gencode-v25 - - merged_bigwigs: - input-wingdisc: - - input-wingdisc-1 - - input-wingdisc-2 - gaf-wingdisc: - - gaf-wingdisc-1 - - gaf-wingdisc-2 - gaf-embryo: - - gaf-embryo-1 - - - # Portions have been omitted from "references" section below for - # simplicity; see references config section for details. - - references: - human: - gencode-v25: - genome: - url: 'ftp://.../genome.fa.gz' - indexes: - - 'hisat2' - - 'bowtie2' - annotation: - url: 'ftp://.../annotation.gtf.gz' - - fly: - test: - genome: - url: "https://raw.githubusercontent.com/lcdb/lcdb-test-data/master/data/seq/dm6.small.fa" - postprocess: 'lib.common.gzipped' - indexes: - - 'bowtie2' - - 'hisat2' - - - -Field descriptions ------------------- -Required for references, RNA-seq and ChIP-seq -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -.. _cfg-references: - -``references`` -`````````````` - This section defines labels for references, where to get FASTA and GTF - files and (optionally) post-process them, and which indexes to build. 
- - Briefly, the example above has a single organism configured ("human"). That - organism has two tags ("gencode-v25" and "rRNA"). - - This is the most complex section and is documented elsewhere (see - :ref:`references-config`). - - -.. _cfg-inc-refs: - -``include_references`` -`````````````````````` - - This section can be used to supplement the ``references`` section with - other reference sections stored elsewhere in files. It's a convenient way - of managing a large amount of references without cluttering the config - file. - - See :ref:`references-config` for more. - - -.. _cfg-references-dir: - -``references_dir`` -`````````````````` - Top-level directory in which to create references. - - If not specified, uses the environment variable ``REFERENCES_DIR``. - - If specified and ``REFERENCES_DIR`` also exists, ``REFERENCES_DIR`` takes - precedence. - - This is useful when multiple people in a group share the same references to - avoid duplicating commonly-used references. Simply point references_dir to - an existing references directory to avoid having to rebuild references. - -Required for RNA-seq and ChIP-seq -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -.. _cfg-sampletable: - -``sampletable`` field -````````````````````` - Path to sampletable file which, at minimum, list sample names and paths to - FASTQ files. The path of this filename is relative to the Snakefile. See - :ref:`sampletable` for more info on the expected contents of the file. - - Example: - - .. code-block:: yaml - - sampletable: "config/sampletable.tsv" - -.. _cfg-organism: - -``organism`` field -`````````````````` - This field selects the top-level section of the ``references`` section that - will be used for the analysis. In RNA-seq example above, "human" is the - only organism configured. In the ChIP-seq example, there is "human" as well - as "fly". - - Example: - - .. code-block:: yaml - - organism: "human" - -.. 
_cfg-aligner: - -``aligner`` config section -`````````````````````````` - This field has two sub-fields, and automatically uses the configured - ``organism`` to select the top-level entry in the references section. - ``tag`` selects the tag from the organism to use, and ``index`` selects - which aligner index to use. The relevant option from the example above - would be "gencode-v25", which configures both bowtie2 and hisat2 indexes to - be built. For RNA-seq we would likely choose "hisat2"; for ChIP-seq - "bowtie2". - - Currently-configured options are ``hisat2``, ``bowtie2``, and ``star``. - - Example: - - .. code-block:: yaml - - aligner: - tag: "gencode-v25" - index: "hisat2" - -Required for RNA-seq -~~~~~~~~~~~~~~~~~~~~ - -.. _cfg-stranded: - -``stranded`` field -`````````````````` - This field specifies the strandedness of the library. This is used by - various rule to set the parameters correctly. For example, if this is set to ``fr-firststrand`` then - ``featureCounts`` will use ``-s2``; CollectRnaSeqMetrics will use - ``STRAND=SECOND_READ_TRANSCRIPTION_STRAND``, and deepTools bamCoverage will - use ``-filterRNAstrand reverse``. - - This field can take the following options: - - =================== =========== - value description - =================== =========== - ``unstranded`` The strand that R1 reads align to has no information about the strand of the gene. - ``fr-firststrand`` R1 reads from plus-strand genes align to the *minus* strand. Also called reverse stranded, dUTP-based - ``fr-secondstrand`` R1 reads from plus-strand genes align to the *plus* strand. Also called forward stranded. - =================== =========== - - Example: - - .. code-block:: yaml - - stranded: "fr-firststrand" - - Rules that require information about strand will check the config file at - run time and raise an error if this field doesn't exist. - - -Optional fields -~~~~~~~~~~~~~~~ - -.. 
_cfg-fastq-screen: - -``fastq_screen`` config section -``````````````````````````````` - - This section configures which Bowtie2 indexes should be used with - `fastq_screen`. It takes the form of a list of dictionaries. Each - dictionary has the keys: - - - `label`: how to label the genome in the output - - `organism`: a configured organism. In the example above, there is only a single configured organism, "human". - - `tag`: a configured tag for that organism. - - Each entry in the list must have a Bowtie2 index configured to be built. - - Example: - - .. code-block:: yaml - - fastq_screen: - - label: Human - organism: human - tag: gencode-v25 - - label: rRNA - organism: human - tag: rRNA - - The above example configures two different indexes to use for fastq_screen: - the human gencode-v25 reference, and the human rRNA reference. - -.. _cfg-merged-bigwigs: - -``merged_bigwigs`` config section -````````````````````````````````` - This section controls optional merging of signal files in bigWig format. - Its format differs depending on RNA-seq or ChIP-seq, due to how strands are - handled in those workflows. - - Here is an RNA-seq example: - - .. code-block:: yaml - - merged_bigwigs: - arbitrary_label_to_use: - pos: - - 'sample1' - - 'sample2' - neg: - - 'sample1' - - 'sample2' - - This will result in a single bigWig file called - `arbitrary_label_to_use.bigwig` in the directory - `data/rnaseq_aggregation/merged_bigwigs` (by default; this is configured - using ``config/rnaseq_patterns.yaml``). That file merges together both the - positive and negative signal strands of two samples, `sample1` and `sample2`. The - names "sample1" and "sample2" are sample names defined in the :ref:`sample - table `. - - In other words, if samples 1 and 2 are replicates for a condition, this - gets us a single merged (averaged) track for that condition. - - Here's another RNA-seq example, where we merge the samples again but keep - the strands separate. 
This will result in two output bigwigs. - - .. code-block:: yaml - - merged_bigwigs: - merged_sense: - sense: - - 'sample1' - - 'sample2' - merged_antisense: - antisense: - - 'sample1' - - 'sample2' - - Here is a ChIP-seq example: - - .. code-block:: yaml - - merged_bigwigs: - arbitrary_label_to_use: - - 'label1' - - 'label2' - - This will result in a single bigWig file called - `arbitrary_label_to_use.bigwig` in the directory - `data/chipseq_aggregation/merged_bigwigs` (by default; this is configured - using ``config/chipseq_patterns.yaml``) that merges together the "label1" - and "label2" bigwigs. - - See :ref:`sampletable` for more info on the relationship between a *sample* - and a *label* when working with ChIP-seq. - - -RNA-seq-only fields -~~~~~~~~~~~~~~~~~~~ -.. _cfg-rrna: - -``rrna`` field -``````````````` - - This field selects the reference tag to use for screening rRNA reads. - Similar to the ``aligner`` field, it takes both a ``tag`` and ``index`` - key. The specified index must have been configured to be built for the - specified tag. It uses the already configured ``organism``. - - Example: - - .. code-block:: yaml - - rrna: - tag: 'rRNA' - index: 'bowtie2' - - -.. _cfg-gtf: - -``gtf`` field -````````````` - - This field selects the reference tag to use for counting reads in features. - The tag must have had a ``gtf:`` section specified; see - :ref:`references-config` for details. - - The organism is inherited from the ``organism:`` field. - - Example: - - .. code-block:: yaml - - gtf: - tag: "gencode-v25" - -.. _cfg-salmon: - -``salmon`` field -```````````````` - This field selects the reference tag to use for the Salmon index (if used). - The tag must have had a FASTA configured, and an index for "salmon" must - have been configured to be built for the organism selected with the - ``organism`` config option. - - -ChIP-seq-only fields -~~~~~~~~~~~~~~~~~~~~ - -.. 
_cfg-chipseq: - -``chipseq`` config section -`````````````````````````` - This section configures the peak-calling stage of the ChIP-seq workflow. It - currently expects a single key, ``peak_calling``, which is a list of - peak-calling runs. - - A peak-calling run is a dictionary configuring a single execution of - a peak-caller which results in a single BED file of called peaks. - A peak-calling run is uniquely described by its ``label`` and - ``algorithm``. This way, we can use the same label (e.g., `gaf-embryo-1`) - across multiple peak-callers to help organize the output. - - The currently-supported peak-callers are ``macs``, ``spp``, and ``sicer``. - They each have corresponding wrappers in the ``wrappers`` directory. To add - other peak-callers, see :ref:`new-peak-caller`. - - The track hubs will include all of these called peaks which helps with - assessing the peak-calling performance. - - Here is a minimal example of a peak-calling config section. It defines - a single peak-calling run using the `macs` algorithm. Note that the - ``ip:`` and ``control:`` keys are lists of **labels** from the ChIP-seq - sample table's ``label`` column, **not sample IDs** from the first column. - - .. code-block:: yaml - - chipseq: - peak_calling: - - - label: gaf-embryo-1 - algorithm: macs - ip: - - gaf-embryo-1 - control: - - input-embryo-1 - - The above peak-calling config will result in a file - ``data/chipseq_peaks/macs/gaf-embryo-1/peaks.bed`` (that pattern is - defined in ``chipseq_patterns.yaml`` if you need to change it). - - We can specify additional command-line arguments that are passed verbatim - to `macs` with the ``extra:`` section, for example: - - .. code-block:: yaml - - chipseq: - peak_calling: - - - label: gaf-embryo-1 - algorithm: macs - ip: - - gaf-embryo-1 - control: - - input-embryo-1 - extra: '--nomodel --extsize 147' - - - `macs` supports multiple IP and input files, which internally are merged - by `macs`. 
We can supply multiple IP and input labels for biological - replicates to get a set of peaks called on pooled samples. Note that we - give it a different label so it doesn't overwrite the other peak-calling - run we already have configured. - - .. code-block:: yaml - - chipseq: - peak_calling: - - - label: gaf-embryo-1 - algorithm: macs - ip: - - gaf-embryo-1 - control: - - input-embryo-1 - extra: '--nomodel --extsize 147' - - - - label: gaf-embryo-pooled - algorithm: macs - ip: - - gaf-embryo-1 - - gaf-embryo-2 - control: - - input-embryo-1 - - input-embryo-2 - - - diff --git a/docs/config.rst b/docs/config.rst index 649a3cab..6275af7d 100644 --- a/docs/config.rst +++ b/docs/config.rst @@ -5,75 +5,352 @@ Configuration ============= -General configuration -~~~~~~~~~~~~~~~~~~~~~ +Configuration happens in two places: + +**Config file:** + +- :ref:`rnaseq-config` +- :ref:`chipseq-config` + +**Sampletable:** + +- :ref:`rnaseq-sampletable` +- :ref:`chipseq-sampletable` + + +.. _configfiles: + +Config file +----------- + +Config files, at a minimum, specify which reference FASTA to use (:ref:`reference-config`). + +For RNA-seq (:ref:`rnaseq-config`) the config file also specifies strandedness. + +For ChIP-seq (:ref:`chipseq-config`) the config file specifies peak-calling runs. + +Config files are in YAML format. By default, they are expected to be at +:file:`config/config.yaml`, but you can override from the command line like this:: + + snakemake --configfile="otherdir/myconfig.yaml" ... + +Snakemake will merge the config file(s) given on the command line with the +default config file (:file:`config/config.yaml`). + +.. _reference-config: + +Configuring genome fasta (RNA-seq & ChIP-seq) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Both RNA-seq and ChIP-seq need a reference fasta configured, like this: + +.. code-block:: yaml + + genome: + url: + +The value of ``url`` can be a file, like +``file:///data/references/Homo_sapiens/gencode.fa.gz``, or any FTP or HTTP URL. 
+ + +You could optionally use the included reference configs to fill in the genome +and annotation from the commandline, and Snakemake would be called like this:: + + snakemake --configfile=../../include/reference_config_templates/Homo_sapiens/GENCODE.yaml ... + +Or you could copy the contents of the reference config templates and paste in +your own :file:`config/config.yaml`. + + +- url can be file +- postprocessing +- overrides +- included reference configs + + +RNA-seq config +~~~~~~~~~~~~~~ + +For RNA-seq, in addition to the genome fasta file described above, you also need: + +- ``annotation``, structured similar to ``genome``, which specifies a gzipped + GTF file. A transcriptome fasta is automatically built from the genome fasta + and this GTF. +- ``organism`` which will be used to screen ribosomal RNA. Technically, this is + searching for the string in the SILVA rRNA database's fasta records. +- ``stranded`` of the libraries, which is used for automatically + configuring strand-specific tools. The options are: + - ``fr-firststrand`` for dUTP libraries + - ``fr-secondstrand`` for ligation libraries + - ``unstranded`` for libraries without strand specificity. + +See https://rnabio.org/module-09-appendix/0009/12/01/StrandSettings for more +info on strandedness. If you don't know ahead of time, you can use +``fr-firststrand`` and inspect the results for RSeQC's infer_experiment in the +MultiQC output. Correct the strandedness in the config, and re-run. Only the +jobs affected by strandedness will be re-run. + +Here is an example for human: + +.. 
code-block:: yaml + + organism: "Homo sapiens" + genome: + url: "https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_49/GRCh38.primary_assembly.genome.fa.gz" + annotation: + url: "https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_49/gencode.v49.primary_assembly.annotation.gtf.gz" + stranded: "fr-firststrand" + +In :file:`include/reference_configs` you can find configs for common model +organisms. These have both genome and annotation, so you can point Snakemake to +them on the command line. You would still need to specify strandedness, which +can be the only config entry in :file:`config/config.yaml`. Or it could be +specified directly on the command line, like this: + +.. code-block:: bash + + snakemake \ + --configfile=../../include/reference_configs/Homo_sapiens/GENCODE.yaml \ + --config stranded=fr-firststrand + +(in this case no separate :file:`config/config.yaml` would be needed, as long +as you use the default :file:`config/sampletable.tsv` as your sampletable) + + +ChIP-seq config +~~~~~~~~~~~~~~~ + +For ChIP-seq, in addition to the genome fasta file described above, you also +need a peak-calling section if you want to run peak-calling. + +The idea is that the ``peak_calling:`` entry in the config is a list. Each item +in the list is a dictionary with the following keys: -The majority of the work in setting up a new project is in the configuration -- -which samples to run, where the data files are located, which references are -needed, etc. +- ``label`` for the peak-calling run. This is intentionally free-form since you + may want to run the same samples through multiple algorithms or different + parameters. Output will be in :file:`data/peak_calling//