imallona · imallona · Jan 14, 2026 · Jan 15, 2026 · Jan 15, 2026 · Jan 15, 2026
diff --git a/README.md b/README.md
@@ -44,46 +44,78 @@ bash build.sh
 
 ## Usage
 
-`yamet` processes (covered CpG) DNA methylation report(s) (`-cell` argument), a reference file listing all CpG positions in a genome (`--reference`), and a bedfile specifying the genomic regions to calculate scores for (`--intervals`; e.g. promoters, genes, etc). Full CLI args:
+`yamet` processes (covered CpG) DNA methylation report(s), a reference file listing all CpG positions in a genome, and a bedfile specifying the genomic regions to calculate scores for (e.g. promoters, genes, etc). Full CLI args:
 
 ```text
-Usage:
-  yamet (-c <cell>... | <cell>...) \
-        -r <reference> \
-        -i <intervals> \
-        [OPTIONS]
-
-Required inputs:
-  -c --cell <cell>...                 One or more tab-separated methylation files,
-                                      OR provide them directly as positional arguments.
-  <cell>...                           (Positional alternative) Cell files (same format as above).
-  -r --reference <reference>          Reference file, tab-separated and sorted by chromosome/position.
-  -i --intervals <intervals>          BED file of intervals of interest.
-
-Optional output:
-  -d --det-out <file>                 Path to detailed output file.
-  -m --meth-out <file>                Path to average methylation output file.
-  -o --out <file>                     Path to simple output file.
-
-Resource options:
-  --cores <n>                         Number of cores for parallel parsing
-                                      [default: 0, implying program decides].
-  --chunk-size <size>                 Buffer size per file (e.g., 64K, 128M, 2G) [default: 64K].
-
-Optional input control:
-  --skip-header[=<n>]                 Skip <n> lines in all input files [default: 1].
-  --skip-header-cell[=<n>]            Skip <n> lines in cell files (overrides --skip-header).
-  --skip-header-reference[=<n>]       Skip <n> lines in reference file (overrides --skip-header).
-  --skip-header-intervals[=<n>]       Skip <n> lines in intervals file (overrides --skip-header).
-
-Verbose/debugging:
-  --print-intervals                   Print parsed intervals file.
-  --print-reference                   Print parsed reference file.
-  --print-sampens[=<true|false>]      Print computed sample entropies [default: true].
-
-Miscellaneous:
-  -h --help                           Show help message.
-  --version                           Show version information.
+yamet
+
+input:
+  -c [ --cytosine_report ] arg          Per-cell cytosine report file(s) 
+                                        (Bismark-like for covered cytosines). 
+                                        Synonyms: --cytosine_report, --cell, 
+                                        -c.
+                                        Tab-separated, sorted by chromosome and
+                                        position:
+                                          chr   pos   meth_reads   total_reads 
+                                          rate
+  -r [ --cytosine_locations ] arg       Genomic locations of all cytosines 
+                                        (typically CpGs). Synonyms: 
+                                        --cytosine_locations, --reference, -r.
+                                        Required to reconstruct contiguous CpG 
+                                        sequences.
+                                        Columns:
+                                          chr   pos
+  -i [ --regions ] arg                  BED file defining genomic regions where
+                                        entropies will be computed. Synonyms: 
+                                        --regions, --features, --target, 
+                                        --intervals, -i.
+                                        Columns:
+                                          chr   start   end
+  --skip-header-all [=arg(=1)]          Header lines to skip in all input files
+                                        (default: 0). Synonyms: 
+                                        --skip-header-all, --skip-header.
+  --skip-header-cytosine_report [=arg(=1)]
+                                        Header lines to skip in 
+                                        cytosine_report/cell files (default: 
+                                        0).
+  --skip-header-cytosine_locations [=arg(=1)]
+                                        Header lines to skip in 
+                                        cytosine_locations/reference file 
+                                        (default: 0).
+  --skip-header-regions [=arg(=1)]      Header lines to skip in 
+                                        regions/features/target/intervals file 
+                                        (default: 0).
+
+output:
+  -d [ --det-out ] arg                  (optional) path to detailed output file
+  -n [ --norm-det-out ] arg             (optional) path to detailed normalized 
+                                        output file
+  -m [ --meth-out ] arg                 (optional) path to average methylation 
+                                        output file
+  --all-meth [=arg(=true)] (=false)     If true, include all CpGs in 
+                                        methylation summaries, including those 
+                                        not used for template construction 
+                                        (default: false).
+  -o [ --out ] arg                      (optional) path to simple output file
+
+resource utilisation:
+  --cores arg (=0)                      number of cores used for simultaneously
+                                        parsing methylation files
+  --chunk-size arg (=64K)               size of the buffer (per file) used for 
+                                        reading data. Can be specified as a 
+                                        positive integer (bytes) or with a 
+                                        suffix: B, K, M, G.
+
+verbose:
+  --print-intervals                     print parsed intervals file
+  --print-reference                     print parsed reference file
+  --print-sampens [=arg(=true)] (=true) print computed sample entropies
+
+misc:
+  -h [ --help ]                         print help message
+  --version                             current version information
+
+
 ```
 
 ## Repository

diff --git a/workflow/Snakefile b/workflow/Snakefile
@@ -34,7 +34,7 @@ rule all:
         # op.join("annotation", "mm10", "done.flag"),  ## this to download regions for Ecker's
         # op.join("data", "crc", "download.flag"),  # this to download CRC cytosine reports, caution will redownload them
         # op.join("data", "crc", "results",  "crc.html") , ## this requires the `download.flag` being present (and the cytosine reports; dependencies are not well captured)
-        # op.join(op.expanduser("~"), ".local", "yamet", "bin", "yamet")
+        op.join(op.expanduser("~"), ".local", "yamet", "bin", "yamet"),
         # op.join('scnmt_gastrulation_data', 'downloaded_data.tsv'), ## this is Argelaguet's gastrulation
         # expand(
         #     op.join(
@@ -58,12 +58,16 @@ rule all:
 
 rule install_yamet:
     """
-    Binary to goes to ~/.local/yamet; export path accordingly
+    Binary to goes to ~/.local/yamet; exported path accordingly
     """
     conda:
         "envs/yamet.yml"
     output:
         op.join(op.expanduser("~"), ".local", "yamet", "bin", "yamet")
+    priority:
+        100
+    threads:
+        workflow.cores
     log:
         op.join("logs", "build.log")
     shell:

diff --git a/workflow/rules/crc.smk b/workflow/rules/crc.smk
@@ -45,6 +45,8 @@ ANNOTATIONS = {
     "chip": ["H3K27me3", "H3K9me3", "H3K4me3"],
     "lad": ["laminb1"],
     "genes": ["genes"],
+    "lines": ["lines"],
+    "sines": ['sines'],
     "cpgIslandExt": ['cpgIslandExt'],
     "scna": ["crc01_nc_scna", "crc01_gain_scna", "crc01_lost_scna"]
 }
@@ -224,10 +226,12 @@ rule run_yamet_on_separate_features:
     output:
         simple_uncomp = temp(op.join(CRC_OUTPUT, "{subcat}_{cat}_{patient}_{location}.out")),
         det_uncomp = temp(op.join(CRC_OUTPUT, "{subcat}_{cat}_{patient}_{location}.det.out")),
+        norm_det_uncomp=temp(op.join(CRC_OUTPUT, "{subcat}_{cat}_{patient}_{location}.norm.det.out")),
         meth_uncomp = temp(op.join(CRC_OUTPUT, "{subcat}_{cat}_{patient}_{location}.meth.out")),
         simple=op.join(CRC_OUTPUT, "{subcat}_{cat}_{patient}_{location}.out.gz"),
         det=op.join(CRC_OUTPUT, "{subcat}_{cat}_{patient}_{location}.det.out.gz"),
-        meth=op.join(CRC_OUTPUT, "{subcat}_{cat}_{patient}_{location}.meth.out.gz")
+        meth=op.join(CRC_OUTPUT, "{subcat}_{cat}_{patient}_{location}.meth.out.gz"),
+        norm_det=op.join(CRC_OUTPUT, "{subcat}_{cat}_{patient}_{location}.norm.det.out.gz")
     log:
         op.join('logs', 'yamet_{subcat}_{cat}_{patient}_{location}.log')
     group:
@@ -246,7 +250,8 @@ rule run_yamet_on_separate_features:
          --print-sampens F \
          --out {output.simple_uncomp} \
          --det-out {output.det_uncomp} \
-         --meth-out {output.meth_uncomp} &> {log}
+         --meth-out {output.meth_uncomp} \
+         --norm-det-out {output.norm_det_uncomp} &> {log}
 
         gzip --keep -f {params.path}/{wildcards.subcat}_{wildcards.cat}_{wildcards.patient}_{wildcards.location}*out  &>> {log}
         """
@@ -267,6 +272,8 @@ rule render_crc_report:
         list_relevant_yamet_outputs()
     params:
         output_path=CRC_OUTPUT
+    threads:
+        round(workflow.cores/2)
     output:
         op.join(CRC, "results", "crc.html")
     log:
@@ -376,9 +383,11 @@ rule run_yamet_on_windows:
         simple_uncomp=temp(op.join(CRC_WINDOWS_OUTPUT, "{win_size}_{patient}_{location}.out")),
         det_uncomp=temp(op.join(CRC_WINDOWS_OUTPUT, "{win_size}_{patient}_{location}.det.out")),
         meth_uncomp=temp(op.join(CRC_WINDOWS_OUTPUT, "{win_size}_{patient}_{location}.meth.out")),
+        norm_det_uncomp=temp(op.join(CRC_WINDOWS_OUTPUT, "{win_size}_{patient}_{location}.norm.det.out")),        
         simple=op.join(CRC_WINDOWS_OUTPUT, "{win_size}_{patient}_{location}.out.gz"),
         det=op.join(CRC_WINDOWS_OUTPUT, "{win_size}_{patient}_{location}.det.out.gz"),
-        meth=op.join(CRC_WINDOWS_OUTPUT, "{win_size}_{patient}_{location}.meth.out.gz")
+        meth=op.join(CRC_WINDOWS_OUTPUT, "{win_size}_{patient}_{location}.meth.out.gz"),
+        norm_det=op.join(CRC_WINDOWS_OUTPUT, "{win_size}_{patient}_{location}.norm.det.out.gz")
     log:
         op.join('logs', 'yamet_{win_size}_{patient}_{location}.log')
     group:
@@ -397,7 +406,8 @@ rule run_yamet_on_windows:
          --print-sampens F \
          --out {output.simple_uncomp} \
          --det-out {output.det_uncomp} \
-         --meth-out {output.meth_uncomp} &> {log}
+         --meth-out {output.meth_uncomp} \
+         --norm-det-out {output.norm_det_uncomp} &> {log}
 
         gzip --keep -f {params.path}/{wildcards.win_size}_{wildcards.patient}_{wildcards.location}*out  &>> {log}
         """
@@ -416,17 +426,17 @@ rule get_scna_patient1_from_supplementary_data:
     conda:
         op.join("..", "envs", "r.yml")
     input:
-        op.join("src", "scna_hg19.xlsx"),
+        op.join(".", "src", "scna_hg19.xlsx"),
     output:
-        scna_bed = op.join(HG19_BASE, "patient_crc01_scna.scna.bed.gz")
+        scna_bed = op.join(HG19_BASE, "patient_crc01_scna.scna.bed.gz.pre")
     script:
         "src/parse_scna.R"
 
 rule split_patient1_crc_in_kept_lost_gained:
     conda:
         op.join("..", "envs", "r.yml")
     input:
-        scna_bed = op.join(HG19_BASE, "patient_crc01_scna.scna.bed.gz"),
+        scna_bed = op.join(HG19_BASE, "patient_crc01_scna.scna.bed.gz.pre"),
         genome_sizes = op.join(HG19_BASE, "genome.sizes"),
     output:
         uncomp = temp(op.join(HG19_BASE, "crc01_scna.bed")),

diff --git a/workflow/rules/hg19.smk b/workflow/rules/hg19.smk
@@ -43,6 +43,23 @@ rule build_genome_hg19_ref:
         "src/make_ref.sh"
 
 
+rule get_hg19_lines:
+    conda:
+        op.join("..", "envs", "processing.yml")
+    output:
+        op.join(HG19_BASE, "rmsk.lines.bed.gz"),
+    script:
+        "src/download_hg19_lines.sh"
+
+
+rule get_hg19_sines:
+    conda:
+        op.join("..", "envs", "processing.yml")
+    output:
+        op.join(HG19_BASE, "rmsk.sines.bed.gz"),
+    script:
+        "src/download_hg19_sines.sh"
+
 rule get_genes_hg19:
     conda:
         op.join("..", "envs", "processing.yml")
@@ -51,6 +68,16 @@ rule get_genes_hg19:
     script:
         "src/download_hg19_genes.sh"
 
+rule uncompress_repeats:
+    conda:
+        op.join("..", "envs", "processing.yml")
+    input:
+        op.join(HG19_BASE, "rmsk.{type}.bed.gz"),        
+    output:
+        op.join(HG19_BASE, "{type}.{type}.bed"),
+    shell:
+        "gunzip -c {input} > {output}"
+
 
 rule uncompress_hg19_genes:
     conda:

diff --git a/workflow/rules/mm10.smk b/workflow/rules/mm10.smk
@@ -6,7 +6,7 @@ CHRS = [str(i) for i in range(1, 20)] + ["X", "Y"]
 MM10_BASE = "mm10"
 
 
-rule mm10_chr_ref:
+rule mm10_per_chr_ref:
     conda:
         op.join("..", "envs", "processing.yml")
     output:
@@ -18,7 +18,7 @@ rule mm10_chr_ref:
         "src/get_chr_ref.sh"
 
 
-rule mm10_ref:
+rule mm10_aggregate_ref:
     input:
         expand(op.join(MM10_BASE, "{chr}.{{meth_pat}}.ref"), chr=CHRS),
     params:
@@ -46,7 +46,24 @@ rule get_mm10_genes:
     script:
         "src/download_mm10_genes.sh"
 
+rule get_mm10_lines:
+    conda:
+        op.join("..", "envs", "processing.yml")
+    output:
+        op.join(MM10_BASE, "lines.bed.gz"),
+    script:
+        "src/download_hg19_lines.sh"
+
+
+rule get_mm10_sines:
+    conda:
+        op.join("..", "envs", "processing.yml")
+    output:
+        op.join(MM10_BASE, "sines.bed.gz"),
+    script:
+        "src/download_mm10_sines.sh"
 
+
 ENCODE_MAP = {
     "h3k4me3": "ENCFF160SCR",
     "h3k9me3": "ENCFF658QTP",