Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,6 @@ jobs:
- uses: actions/checkout@v4
- uses: prefix-dev/setup-pixi@v0.8.1
with:
pixi-version: v0.37.0
pixi-version: v0.46.0
cache: true
- run: pixi run test
3 changes: 3 additions & 0 deletions config/config_4col.tbl
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
sample bam ref ref_name
test test.cram /mmfs1/gscratch/stergachislab/minkinaa/FIRE/fire-test-data/test.fa.gz hg38
test2 test.cram /mmfs1/gscratch/stergachislab/minkinaa/FIRE/fire-test-data/test.fa.gz hg38
3 changes: 3 additions & 0 deletions config/config_4col.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
#ref:
#ref_name:
manifest: test_2row.tbl
3,434 changes: 969 additions & 2,465 deletions pixi.lock

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions pixi.toml
Original file line number Diff line number Diff line change
Expand Up @@ -65,5 +65,6 @@ snakefmt = "*"
ruff = "*"
awscli = "2.22.*"
taplo = "*"
pysam = "*"

[pypi-dependencies]
70 changes: 60 additions & 10 deletions workflow/Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -27,27 +27,23 @@ VERSION = ".".join(FULL_VERSION.split(".")[:2])
if config.get("full-version", False):
VERSION = FULL_VERSION


# thread options
MAX_THREADS = config.get("max_threads", 4)
SORT_THREADS = config.get("sort_threads", 8)

# sample, haplotype, and chromosome wildcard building
DEFAULT_ENV = config.get("env", "../envs/env.yaml")
MANIFEST = get_manifest()
MIN_FIRE_FDR = config.get("min_fire_fdr", 0.10)

# reference genome and reference regions
REF = get_ref()
FAI = get_fai()
REF_NAME = config["ref_name"]
EXCLUDES = get_excludes()
KEEP_CHRS = config.get("keep_chromosomes", ".*")

# coverage requirements
MIN_COVERAGE = config.get("min_coverage", 4)
COVERAGE_WITHIN_N_SD = config.get("coverage_within_n_sd", 5)

# sample, haplotype, and chromosome wildcard building
FAI_DF = get_fai_df()
DEFAULT_ENV = config.get("env", "../envs/env.yaml")
MANIFEST = get_manifest()
MIN_FIRE_FDR = config.get("min_fire_fdr", 0.10)

# FDR / peak calling thresholds
MAX_PEAK_FDR = config.get("max_peak_fdr", 0.05)
Expand Down Expand Up @@ -109,8 +105,62 @@ if DSA:
include: "rules/levio.smk"


## mod manifest to include reference
if "ref" not in MANIFEST.columns:
if "ref" not in config or str(config.get("ref")) == "None":
raise ValueError("FIRE: ref parameter is missing in config.yaml")
else:
temp_ref = config.get("ref")
MANIFEST["ref"] = temp_ref
if "ref_name" not in config or str(config.get("ref_name")) == "None":
raise ValueError("FIRE: ref_name parameter is missing in config.yaml")
else:
temp_ref_name = config.get("ref_name")
MANIFEST["ref_name"] = temp_ref_name

# make list of all chrs for
chroms_list_joined = ""

for index, row in MANIFEST.iterrows():
first_report_var = True
min_contig_length = config.get("min_contig_length", 0)
fai = row['ref'] + ".fai"
fai_df = pd.read_csv(fai, sep="\t", names=["chr", "length", "x", "y", "z"])

skipped_contigs = fai_df["chr"][fai_df["length"] < min_contig_length]
if len(skipped_contigs) > 0 and first_report_var:
print(
f"WARNING: Skipping contigs with length < {min_contig_length:,}: {skipped_contigs}",
file=sys.stderr,
)

chroms = fai_df["chr"][fai_df["length"] >= min_contig_length]
chroms = sorted([chrom for chrom in chroms if "chrUn_" not in chrom])
chroms = [chrom for chrom in chroms if "_random" not in chrom]
chroms = [chrom for chrom in chroms if re.fullmatch(KEEP_CHRS, chrom)]

if first_report_var:
first_report_var = False
print(f"INFO: Using N chromosomes: {len(chroms)}", file=sys.stderr)

if len(chroms) == 0:
raise ValueError(
f"No chromosomes left after filtering. Check your keep_chromosomes parameter in config.yaml. "
f"Your fai file contains the following chromosomes: {fai_df['chr']}"
)
temp_chroms_list="|".join(chroms)
if chroms_list_joined == "":
chroms_list_joined = temp_chroms_list
else:
chroms_list_joined = chroms_list_joined + "|" + temp_chroms_list

unique_chroms_list_joined = "|".join(set(chroms_list_joined.split("|")))

#print(unique_chroms_list_joined)

wildcard_constraints:
chrom="|".join(get_chroms()),
#chrom="|".join(get_chroms()),
chrom=unique_chroms_list_joined,
call="|".join(["msp", "m6a"]),
sm="|".join(MANIFEST.index),
types="|".join(types),
Expand Down
1 change: 1 addition & 0 deletions workflow/envs/env.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -14,3 +14,4 @@ dependencies:
- csvtk
- mosdepth==0.3.7
- bioconda::bigtools==0.5.5
#- pysam==0.22
2 changes: 1 addition & 1 deletion workflow/envs/python.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,6 @@ dependencies:
- pip
- pip:
- polars-lts-cpu[pyarrow]==1.7.1
#- pysam==0.22
#- tqdm
#- numba::numba==0.60
#- pysam==0.21
2 changes: 1 addition & 1 deletion workflow/envs/runner.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -8,4 +8,4 @@ dependencies:
#- tqdm
#- pip
#- pip:
#- pysam==0.22.1
- pysam==0.22.1
5 changes: 3 additions & 2 deletions workflow/rules/apply-model.smk
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,8 @@
rule fire:
input:
bam=ancient(get_input_bam),
ref=ancient(REF),
ref=ancient(get_ref)
#ref=ancient(REF),
output:
cram="results/{sm}/{sm}-fire-{v}-filtered.cram",
crai="results/{sm}/{sm}-fire-{v}-filtered.cram.crai",
Expand Down Expand Up @@ -75,7 +76,7 @@ rule fire_sites_chrom:
rule fire_sites:
input:
beds=expand(
rules.fire_sites_chrom.output.bed, chrom=get_chroms(), allow_missing=True
rules.fire_sites_chrom.output.bed, chrom=get_chroms, allow_missing=True
),
output:
bed="results/{sm}/additional-outputs-{v}/fire-peaks/{sm}-{v}-fire-elements.bed.gz",
Expand Down
101 changes: 79 additions & 22 deletions workflow/rules/common.smk
Original file line number Diff line number Diff line change
@@ -1,29 +1,23 @@
import re
import logging
import sys
import pysam

FIRST_REPORT = True


def get_ref():
if "ref" not in config:
raise ValueError("FIRE: ref parameter is missing in config.yaml")
ref = config["ref"]
def get_ref(wc):
ref = MANIFEST.loc[wc.sm, "ref"]
if not os.path.exists(ref):
raise ValueError(f"FIRE: reference file {ref} does not exist")
return os.path.abspath(ref)

def get_ref_name(wc):
return MANIFEST.loc[wc.sm, "ref_name"]

def get_fai():
fai = f"{get_ref()}.fai"
if not os.path.exists(fai):
raise ValueError(f"FIRE: reference index file {fai} does not exist")
return fai


def get_excludes():
def get_excludes(wc):
excludes = config.get("excludes", [])
if REF_NAME == "hg38" or REF_NAME == "GRCh38":
ref_name = get_ref_name(wc)
if ref_name == "hg38" or ref_name == "GRCh38":
files = [
"../annotations/hg38.gap.bed.gz",
"../annotations/hg38.blacklist.ENCFF356LFX.bed.gz",
Expand All @@ -32,37 +26,95 @@ def get_excludes():
excludes += [workflow.source_path(file) for file in files]
return excludes

def get_fai(wc):
ref = get_ref(wc)
fai = f"{ref}.fai"
if not os.path.exists(fai):
raise ValueError(f"FIRE: reference index file {fai} does not exist")
return fai

def get_fai_df():
fai = get_fai()
def get_fai_df(wc):
fai = get_fai(wc)
return pd.read_csv(fai, sep="\t", names=["chr", "length", "x", "y", "z"])


def get_chroms():
def get_chroms(wc):
global FIRST_REPORT
min_contig_length = config.get("min_contig_length", 0)
skipped_contigs = FAI_DF["chr"][FAI_DF["length"] < min_contig_length]

chroms=[]
skipped_contigs=[]
input_bam_path=get_input_bam(wc)
input_bam = pysam.AlignmentFile(input_bam_path)
name_length=zip(input_bam.references, input_bam.lengths)

for chrom_name, chrom_length in name_length:
if chrom_length >=min_contig_length:
chroms.append(chrom_name)
else:
skipped_contigs.append(chrom_name)

input_bam.close()

chroms = sorted([chrom for chrom in chroms if "chrUn_" not in chrom])
chroms = [chrom for chrom in chroms if "_random" not in chrom]
chroms = [chrom for chrom in chroms if re.fullmatch(KEEP_CHRS, chrom)]

if len(skipped_contigs) > 0 and FIRST_REPORT:
print(
f"WARNING: Skipping contigs with length < {min_contig_length:,}: {skipped_contigs}",
file=sys.stderr,
)

chroms = FAI_DF["chr"][FAI_DF["length"] >= min_contig_length]
if FIRST_REPORT:
FIRST_REPORT = False
print(f"INFO: Using N chromosomes: {len(chroms)}", file=sys.stderr)

if len(chroms) == 0:
raise ValueError(
f"No chromosomes left after filtering. Check your keep_chromosomes parameter in config.yaml. "
f"Your bam file contains the following chromosomes: {skipped_contigs}"
)
return chroms

def get_chroms_first_element(wc):
global FIRST_REPORT
min_contig_length = config.get("min_contig_length", 0)

chroms=[]
skipped_contigs=[]
input_bam_path=get_input_bam(wc)
input_bam = pysam.AlignmentFile(input_bam_path)
name_length=zip(input_bam.references, input_bam.lengths)

for chrom_name, chrom_length in name_length:
if chrom_length >=min_contig_length:
chroms.append(chrom_name)
else:
skipped_contigs.append(chrom_name)


input_bam.close()

chroms = sorted([chrom for chrom in chroms if "chrUn_" not in chrom])
chroms = [chrom for chrom in chroms if "_random" not in chrom]
chroms = [chrom for chrom in chroms if re.fullmatch(KEEP_CHRS, chrom)]

if len(skipped_contigs) > 0 and FIRST_REPORT:
print(
f"WARNING: Skipping contigs with length < {min_contig_length:,}: {skipped_contigs}",
file=sys.stderr,
)

if FIRST_REPORT:
FIRST_REPORT = False
print(f"INFO: Using N chromosomes: {len(chroms)}", file=sys.stderr)

if len(chroms) == 0:
raise ValueError(
f"No chromosomes left after filtering. Check your keep_chromosomes parameter in config.yaml. "
f"Your fai file contains the following chromosomes: {FAI_DF['chr']}"
f"Your bam file contains the following chromosomes: {skipped_contigs}"
)
return chroms
return chroms[0]


def get_manifest():
Expand All @@ -80,6 +132,11 @@ def get_manifest():
def get_input_bam(wc):
return MANIFEST.loc[wc.sm, "bam"]

def get_input_ref(wc):
return MANIFEST.loc[wc.sm, "ref"]

def get_input_ref_name(wc):
return MANIFEST.loc[wc.sm, "ref_name"]

def get_mem_mb(wildcards, attempt):
if attempt < 3:
Expand Down
16 changes: 8 additions & 8 deletions workflow/rules/coverages.smk
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,8 @@
#
rule genome_bedgraph:
input:
ref=ancient(REF),
fai=ancient(FAI),
ref=ancient(get_ref),
fai=ancient(get_fai),
cram=rules.fire.output.cram,
crai=rules.fire.output.crai,
output:
Expand All @@ -17,7 +17,7 @@ rule genome_bedgraph:
DEFAULT_ENV
shell:
"""
mosdepth -f {input.ref} -t {threads} tmp {input.cram}
mosdepth -F 4 -f {input.ref} -t {threads} tmp {input.cram}
bgzip -cd tmp.per-base.bed.gz \
| LC_ALL=C sort --parallel={threads} -k1,1 -k2,2n -k3,3n -k4,4 \
| bgzip -@ {threads} \
Expand All @@ -43,7 +43,7 @@ rule coverage:
params:
coverage_within_n_sd=COVERAGE_WITHIN_N_SD,
min_coverage=MIN_COVERAGE,
chroms=get_chroms(),
chroms=get_chroms,
script:
"../scripts/cov.py"

Expand Down Expand Up @@ -76,7 +76,7 @@ rule fiber_locations:
input:
fibers=expand(
rules.fiber_locations_chromosome.output.bed,
chrom=get_chroms(),
chrom=get_chroms,
allow_missing=True,
),
bg=rules.genome_bedgraph.output.bg,
Expand Down Expand Up @@ -119,14 +119,14 @@ rule fiber_locations:
rule exclude_from_shuffle:
input:
filtered=rules.fiber_locations.output.filtered,
fai=ancient(FAI),
fai=ancient(get_fai),
output:
bed="results/{sm}/additional-outputs-{v}/coverage/exclude-from-shuffles.bed.gz",
threads: 4
conda:
DEFAULT_ENV
params:
exclude=EXCLUDES,
exclude=get_excludes,
shell:
"""

Expand All @@ -147,7 +147,7 @@ rule unreliable_coverage_regions:
bg=rules.genome_bedgraph.output.bg,
minimum=rules.coverage.output.minimum,
maximum=rules.coverage.output.maximum,
fai=ancient(FAI),
fai=ancient(get_fai),
output:
bed="results/{sm}/additional-outputs-{v}/coverage/unreliable-coverage-regions.bed.gz",
bed_tbi="results/{sm}/additional-outputs-{v}/coverage/unreliable-coverage-regions.bed.gz.tbi",
Expand Down
Loading
Loading