From fce1271b66b997f9d5d20d8b4c31c971b42e12b2 Mon Sep 17 00:00:00 2001
From: Izaskun Mallona <izaskun.mallona@gmail.com>
Date: Wed, 14 Jan 2026 12:49:45 +0100
Subject: [PATCH 1/9] Fix metadata path

---
 workflow/rules/crc.smk | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/workflow/rules/crc.smk b/workflow/rules/crc.smk
index 6b2d8a2..e8b3d20 100644
--- a/workflow/rules/crc.smk
+++ b/workflow/rules/crc.smk
@@ -416,9 +416,9 @@ rule get_scna_patient1_from_supplementary_data:
     conda:
         op.join("..", "envs", "r.yml")
     input:
-        op.join("src", "scna_hg19.xlsx"),
+        op.join(".", "src", "scna_hg19.xlsx"),
     output:
-        scna_bed = op.join(HG19_BASE, "patient_crc01_scna.scna.bed.gz")
+        scna_bed = op.join(HG19_BASE, "patient_crc01_scna.scna.bed.gz.pre")
     script:
         "src/parse_scna.R"
 
@@ -426,7 +426,7 @@ rule split_patient1_crc_in_kept_lost_gained:
     conda:
         op.join("..", "envs", "r.yml")
     input:
-        scna_bed = op.join(HG19_BASE, "patient_crc01_scna.scna.bed.gz"),
+        scna_bed = op.join(HG19_BASE, "patient_crc01_scna.scna.bed.gz.pre"),
         genome_sizes = op.join(HG19_BASE, "genome.sizes"),
     output:
         uncomp = temp(op.join(HG19_BASE, "crc01_scna.bed")),

From cfcb6a5c662ea188027c554e3e9320cf1eab6960 Mon Sep 17 00:00:00 2001
From: Izaskun Mallona <izaskun.mallona@gmail.com>
Date: Thu, 15 Jan 2026 14:17:01 +0100
Subject: [PATCH 2/9] Update CLI documentation and default names; shorthands
 are now a bit confusing

---
 README.md            | 112 +++++++++++++-------
 method/src/boost.cpp | 239 +++++++++++++++++++++----------------------
 2 files changed, 191 insertions(+), 160 deletions(-)

diff --git a/README.md b/README.md
index 5da46e4..0b31a70 100644
--- a/README.md
+++ b/README.md
@@ -44,46 +44,82 @@ bash build.sh
 
 ## Usage
 
-`yamet` processes (covered CpG) DNA methylation report(s) (`-cell` argument), a reference file listing all CpG positions in a genome (`--reference`), and a bedfile specifying the genomic regions to calculate scores for (`--intervals`; e.g. promoters, genes, etc). Full CLI args:
+`yamet` processes (covered CpG) DNA methylation report(s), a reference file listing all CpG positions in a genome, and a bedfile specifying the genomic regions to calculate scores for (e.g. promoters, genes, etc). Full CLI args:
 
 ```text
-Usage:
-  yamet (-c <cell>... | <cell>...) \
-        -r <reference> \
-        -i <intervals> \
-        [OPTIONS]
-
-Required inputs:
-  -c --cell <cell>...                 One or more tab-separated methylation files,
-                                      OR provide them directly as positional arguments.
-  <cell>...                           (Positional alternative) Cell files (same format as above).
-  -r --reference <reference>          Reference file, tab-separated and sorted by chromosome/position.
-  -i --intervals <intervals>          BED file of intervals of interest.
-
-Optional output:
-  -d --det-out <file>                 Path to detailed output file.
-  -m --meth-out <file>                Path to average methylation output file.
-  -o --out <file>                     Path to simple output file.
-
-Resource options:
-  --cores <n>                         Number of cores for parallel parsing
-                                      [default: 0, implying program decides].
-  --chunk-size <size>                 Buffer size per file (e.g., 64K, 128M, 2G) [default: 64K].
-
-Optional input control:
-  --skip-header[=<n>]                 Skip <n> lines in all input files [default: 1].
-  --skip-header-cell[=<n>]            Skip <n> lines in cell files (overrides --skip-header).
-  --skip-header-reference[=<n>]       Skip <n> lines in reference file (overrides --skip-header).
-  --skip-header-intervals[=<n>]       Skip <n> lines in intervals file (overrides --skip-header).
-
-Verbose/debugging:
-  --print-intervals                   Print parsed intervals file.
-  --print-reference                   Print parsed reference file.
-  --print-sampens[=<true|false>]      Print computed sample entropies [default: true].
-
-Miscellaneous:
-  -h --help                           Show help message.
-  --version                           Show version information.
+yamet
+
+input:
+  -c [ --cytosine_report ] arg          per-cell cytosine report file(s) 
+                                        (resembling Bismark's for covered 
+                                        cytosines). Synonyms: --cell, -c.
+                                        These tab-separated files describe 
+                                        covered CpGs sorted by chromosome and 
+                                        position, with columns:
+                                          chr   pos   meth_reads   total_reads 
+                                          rate
+  -r [ --cytosine_locations ] arg       genomic locations of all cytosines 
+                                        (typically CpGs). Used to account for 
+                                        missing values in cytosine 
+                                        reports.Synonyms: --cytosine_locations,
+                                        --reference, -r.
+                                        Required to reconstruct contiguous CpG 
+                                        sequences. Columns:
+                                          chr   pos
+  -i [ --regions ] arg                  BED file defining genomic regions where
+                                        entropies will be computed (both per 
+                                        feature and globally)Synonyms: 
+                                        --regions, --features, --target, 
+                                        --intervals, -i.
+                                        Columns:
+                                          chr   start   end
+  --skip-header-all [=arg(=1)]          Number of header lines to skip in all 
+                                        input files (default: 1). Synonyms: 
+                                        --skip-header-all, --skip-header.
+  --skip-header-cytosine_report [=arg(=1)]
+                                        Number of header lines to skip in 
+                                        cytosine_report/cell files (default: 
+                                        1).
+  --skip-header-cytosine_locations [=arg(=1)]
+                                        Number of header lines to skip in 
+                                        cytosine_locations/reference file 
+                                        (default: 1).
+  --skip-header-regions [=arg(=1)]      Number of header lines to skip in 
+                                        regions/features/target/intervals file 
+                                        (default: 1).
+
+output:
+  -d [ --det-out ] arg                  (optional) path to detailed output file
+  -n [ --norm-det-out ] arg             (optional) path to detailed normalized 
+                                        output file
+  -m [ --meth-out ] arg                 (optional) path to average methylation 
+                                        output file
+  --all-meth [=arg(=true)] (=false)     if true, include all CpGs in 
+                                        methylation summaries, including those 
+                                        not used for template construction 
+                                        (default: false).
+  --all-meth [=arg(=true)] (=false)
+  -o [ --out ] arg                      (optional) path to simple output file
+
+resource utilisation:
+  --cores arg (=0)                      number of cores used for simultaneously
+                                        parsing methylation files
+  --chunk-size arg (=64K)               size of the buffer (per file) used for 
+                                        reading data. Can be specified as a 
+                                        positive integer (bytes) or with a 
+                                        suffix: B (bytes), K (kilobytes), M 
+                                        (megabytes), G (gigabytes). Example: 
+                                        4096, 64K, 128M, 2G
+
+verbose:
+  --print-intervals                     print parsed intervals file
+  --print-reference                     print parsed reference file
+  --print-sampens [=arg(=true)] (=true) print computed sample entropies
+
+misc:
+  -h [ --help ]                         print help message
+  --version                             current version information
+
 ```
 
 ## Repository
diff --git a/method/src/boost.cpp b/method/src/boost.cpp
index 9c69807..a7055a0 100644
--- a/method/src/boost.cpp
+++ b/method/src/boost.cpp
@@ -10,55 +10,53 @@
 #include "boost.h"
 #include "version.h"
 
+namespace po = boost::program_options;
+
 po::variables_map parseCommandLine(int argc, char **argv) {
   po::variables_map vm;
 
   po::options_description inp("input");
   // clang-format off
   inp.add_options()
-    ("cell,c", po::value<std::vector<std::string>>()->composing(), 
-        "tab separated files, sorted by chromosome and position, for "
-        "different cells in the following format\n"
-        "\n"
-        " chr1    5    0    2    0\n"
-        " chr1    9    1    1    1\n"
-        " chr2    2    3    4    1\n"
-        "\n"
-        "where the columns are the chromosome, position, number of methylated reads, "
-        "total number of reads and the rate respectively")
-    ("reference,r", po::value<std::string>(),
-        "tab separated file, sorted by chromosome and position, for "
-        "reference sites in the following format\n"
-        "\n"
-        " chr1    5\n"
-        " chr1    7\n"
-        " chr1    9\n"
-        " chr1    11\n"
-        " chr2    2\n"
-        " chr2    4\n"
-        "\n"
-        "where the columns are the chromosome and start position respectively")
-    ("intervals,i", po::value<std::string>(),
-        "bed file, sorted by chromosome and start position, for "
-        "intervals of interest in the following format\n"
-        "\n"
-        " chr1    5     7\n"
-        " chr1    10    30\n"
-        " chr2    1     6\n"
-        "\n"
-        "where the columns are the chromosome, start position and the end position respectively")
-    ("skip-header", po::value<unsigned int>()->implicit_value(1),
-        "integer value indicating number of lines to skip in all file inputs"
-        "(this is a default value which can be overriden by other 'skip-header-*' options)")
-    ("skip-header-cell", po::value<unsigned int>()->implicit_value(1),
-        "integer value indicating number of lines to skip in the cell files"
-        "(overrides 'skip-header' if provided)")
-    ("skip-header-reference", po::value<unsigned int>()->implicit_value(1),
-        "integer value indicating number of lines to skip in the reference file"
-        "(overrides 'skip-header' if provided)")
-    ("skip-header-intervals", po::value<unsigned int>()->implicit_value(1),
-        "integer value indicating number of lines to skip in the intervals file"
-        "(overrides 'skip-header' if provided)");
+
+    ("cytosine_report,cell,c",
+      po::value<std::vector<std::string>>()->composing(),
+      "Per-cell cytosine report file(s) (Bismark-like for covered cytosines). "
+      "Synonyms: --cytosine_report, --cell, -c.\n"
+      "Tab-separated, sorted by chromosome and position:\n"
+      "  chr   pos   meth_reads   total_reads   rate")
+
+    ("cytosine_locations,reference,r",
+      po::value<std::string>(),
+      "Genomic locations of all cytosines (typically CpGs). "
+      "Synonyms: --cytosine_locations, --reference, -r.\n"
+      "Required to reconstruct contiguous CpG sequences.\n"
+      "Columns:\n"
+      "  chr   pos")
+
+    ("regions,features,target,intervals,i",
+      po::value<std::string>(),
+      "BED file defining genomic regions where entropies will be computed. "
+      "Synonyms: --regions, --features, --target, --intervals, -i.\n"
+      "Columns:\n"
+      "  chr   start   end")
+
+    ("skip-header-all,skip-header",
+      po::value<unsigned int>()->implicit_value(1),
+      "Header lines to skip in all input files (default: 1). "
+      "Synonyms: --skip-header-all, --skip-header.")
+
+    ("skip-header-cytosine_report,skip-header-cell",
+      po::value<unsigned int>()->implicit_value(1),
+      "Header lines to skip in cytosine_report/cell files (default: 1).")
+
+    ("skip-header-cytosine_locations,skip-header-reference",
+      po::value<unsigned int>()->implicit_value(1),
+      "Header lines to skip in cytosine_locations/reference file (default: 1).")
+
+    ("skip-header-regions,skip-header-features,skip-header-target,skip-header-intervals",
+      po::value<unsigned int>()->implicit_value(1),
+      "Header lines to skip in regions/features/target/intervals file (default: 1).");
   // clang-format on
 
   po::options_description out("output");
@@ -67,7 +65,10 @@ po::variables_map parseCommandLine(int argc, char **argv) {
     ("det-out,d", po::value<std::string>(), "(optional) path to detailed output file")
     ("norm-det-out,n", po::value<std::string>(), "(optional) path to detailed normalized output file")
     ("meth-out,m", po::value<std::string>(), "(optional) path to average methylation output file")
-    ("all-meth", po::value<std::string>()->default_value("false")->implicit_value("true"))
+    ("all-meth",
+        po::value<std::string>()->default_value("false")->implicit_value("true"),
+        "If true, include all CpGs in methylation summaries, including those "
+        "not used for template construction (default: false).")
     ("out,o", po::value<std::string>(), "(optional) path to simple output file");
   // clang-format on
 
@@ -87,14 +88,13 @@ po::variables_map parseCommandLine(int argc, char **argv) {
         "number of cores used for simultaneously parsing methylation files")
     ("chunk-size", po::value<std::string>()->default_value("64K")->notifier(validate_chunk_size),
         "size of the buffer (per file) used for reading data. Can be specified as a "
-        "positive integer (bytes) or with a suffix: B (bytes), K (kilobytes), "
-        "M (megabytes), G (gigabytes). Example: 4096, 64K, 128M, 2G");
+        "positive integer (bytes) or with a suffix: B, K, M, G.");
   // clang-format on
 
   po::options_description mis("misc");
   // clang-format off
   mis.add_options()
-    ("help,h", "produce help message")
+    ("help,h", "print help message")
     ("version", "current version information");
   // clang-format on
 
@@ -118,49 +118,71 @@ po::variables_map parseCommandLine(int argc, char **argv) {
   return vm;
 }
 
-// Helper get functions for different arguments
-
 std::vector<std::string> getCellFiles(const po::variables_map &vm) {
-  if (!vm.count("cell")) {
-    throw std::system_error(EINVAL, std::generic_category(), "No cell coverage files provided");
-  }
-  return vm["cell"].as<std::vector<std::string>>();
+  if (vm.count("cell"))
+    return vm["cell"].as<std::vector<std::string>>();
+  if (vm.count("cytosine_report"))
+    return vm["cytosine_report"].as<std::vector<std::string>>();
+  throw std::system_error(EINVAL, std::generic_category(),
+                          "No cell/cytosine_report files provided");
 }
 
 std::string getRef(const po::variables_map &vm) {
-  if (!vm.count("reference")) {
-    throw std::system_error(EINVAL, std::generic_category(), "No reference file provided");
-  }
-  return vm["reference"].as<std::string>();
+  if (vm.count("reference"))
+    return vm["reference"].as<std::string>();
+  if (vm.count("cytosine_locations"))
+    return vm["cytosine_locations"].as<std::string>();
+  throw std::system_error(EINVAL, std::generic_category(),
+                          "No reference/cytosine_locations file provided");
 }
 
 std::string getIntervals(const po::variables_map &vm) {
-  if (!vm.count("intervals")) {
-    throw std::system_error(EINVAL, std::generic_category(), "No intervals file provided");
-  }
-  return vm["intervals"].as<std::string>();
+  if (vm.count("intervals"))
+    return vm["intervals"].as<std::string>();
+  if (vm.count("regions"))
+    return vm["regions"].as<std::string>();
+  if (vm.count("features"))
+    return vm["features"].as<std::string>();
+  if (vm.count("target"))
+    return vm["target"].as<std::string>();
+  throw std::system_error(EINVAL, std::generic_category(),
+                          "No intervals/regions/features/target file provided");
 }
 
 unsigned int getSkipHeaderTemplate(const po::variables_map &vm, const std::string &file_type) {
-  if (vm.count(file_type)) {
+  if (vm.count(file_type))
     return vm[file_type].as<unsigned int>();
-  } else if (vm.count("skip-header")) {
-    return vm["skip-header"].as<unsigned int>();
-  } else {
-    return 0;
-  }
+  if (vm.count("skip-header-all"))
+    return vm["skip-header-all"].as<unsigned int>();
+  return 1;
 }
 
 unsigned int getSkipHeaderCell(const po::variables_map &vm) {
-  return getSkipHeaderTemplate(vm, "skip-header-cell");
+  if (vm.count("skip-header-cell"))
+    return vm["skip-header-cell"].as<unsigned int>();
+  if (vm.count("skip-header-cytosine_report"))
+    return vm["skip-header-cytosine_report"].as<unsigned int>();
+  return getSkipHeaderTemplate(vm, "skip-header-all");
 }
 
 unsigned int getSkipHeaderReference(const po::variables_map &vm) {
-  return getSkipHeaderTemplate(vm, "skip-header-reference");
+  if (vm.count("skip-header-reference"))
+    return vm["skip-header-reference"].as<unsigned int>();
+  if (vm.count("skip-header-cytosine_locations"))
+    return vm["skip-header-cytosine_locations"].as<unsigned int>();
+  return getSkipHeaderTemplate(vm, "skip-header-all");
 }
 
 unsigned int getSkipHeaderIntervals(const po::variables_map &vm) {
-  return getSkipHeaderTemplate(vm, "skip-header-intervals");
+  if (vm.count("skip-header-intervals"))
+    return vm["skip-header-intervals"].as<unsigned int>();
+  if (vm.count("skip-header-regions"))
+    return vm["skip-header-regions"].as<unsigned int>();
+  if (vm.count("skip-header-features"))
+    return vm["skip-header-features"].as<unsigned int>();
+  if (vm.count("skip-header-target"))
+    return vm["skip-header-target"].as<unsigned int>();
+  return getSkipHeaderTemplate(vm, "skip-header-all");
 }
 
 std::string getDetOut(const po::variables_map &vm) {
@@ -176,12 +198,8 @@ std::string getMethOut(const po::variables_map &vm) {
 }
 
 bool allMeth(const po::variables_map &vm) {
-  char t = (char)std::tolower(vm["all-meth"].as<std::string>()[0]);
-  if (t == 't' || t == '1') {
-    return true;
-  } else {
-    return false;
-  }
+  char t = std::tolower(vm["all-meth"].as<std::string>()[0]);
+  return (t == 't' || t == '1');
 }
 
 std::string getOut(const po::variables_map &vm) {
@@ -190,58 +208,42 @@ std::string getOut(const po::variables_map &vm) {
 
 unsigned int getCores(const po::variables_map &vm) {
   const unsigned int max_cores = std::thread::hardware_concurrency();
-  const int          c         = vm["cores"].as<int>();
-  if (c == -1) {
-    return max_cores;
-  } else if (c == 0) {
-    return max_cores - (unsigned int)(std::floor(std::log2(max_cores)));
-  } else {
-    return c;
-  }
+  const int c = vm["cores"].as<int>();
+  if (c == -1) return max_cores;
+  if (c == 0)  return max_cores - (unsigned int)(std::floor(std::log2(max_cores)));
+  return c;
 }
 
 unsigned int getChunkSize(const po::variables_map &vm) {
   static const std::regex pattern(R"(^(\d+(\.\d+)?)(B|K|M|G)?$)", std::regex::icase);
-  std::smatch             match;
-  const auto              chunk_size = vm["chunk-size"].as<std::string>();
+  std::smatch match;
+  const auto chunk_size = vm["chunk-size"].as<std::string>();
 
   std::regex_match(chunk_size, match, pattern);
 
-  double num  = std::stod(match[1].str());
-  char   unit = match[3].str().empty() ? 'B' : std::toupper(match[3].str()[0]);
+  double num = std::stod(match[1].str());
+  char unit = match[3].str().empty() ? 'B' : std::toupper(match[3].str()[0]);
 
   switch (unit) {
-  case 'K':
-    num *= 1024;
-    break;
-  case 'M':
-    num *= 1024 * 1024;
-    break;
-  case 'G':
-    num *= 1024 * 1024 * 1024;
-    break;
+    case 'K': num *= 1024; break;
+    case 'M': num *= 1024 * 1024; break;
+    case 'G': num *= 1024 * 1024 * 1024; break;
   }
-  if (num < 1) {
+
+  if (num < 1)
     throw std::system_error(EINVAL, std::generic_category(),
-                            "Invalid value " + chunk_size + " for 'chunk_size', less than 1 byte");
-  }
+                            "Invalid chunk_size < 1 byte");
+
   return static_cast<unsigned int>(num);
 }
 
 bool printSampens(const po::variables_map &vm) {
-  char t = (char)std::tolower(vm["print-sampens"].as<std::string>()[0]);
-  if (t == 't' || t == '1') {
-    return true;
-  } else {
-    return false;
-  }
+  char t = std::tolower(vm["print-sampens"].as<std::string>()[0]);
+  return (t == 't' || t == '1');
 }
 
-// Validators
-
 void validate_chunk_size(const std::string &chunk_size) {
   static const std::regex pattern(R"(^(\d+(\.\d+)?)(B|K|M|G)?$)", std::regex::icase);
-
   if (!std::regex_match(chunk_size, pattern)) {
     throw std::system_error(EINVAL, std::generic_category(),
                             "Invalid format " + chunk_size + " for 'chunk_size'");
@@ -250,17 +252,10 @@ void validate_chunk_size(const std::string &chunk_size) {
 
 void validate_cores(const int cores) {
   const int max_cores = std::thread::hardware_concurrency();
-
-  if (cores < -1) {
-    throw std::system_error(
-        EINVAL, std::generic_category(),
-        "Invalid value " + std::to_string(cores) +
-            " for 'cores'. Allowed values are\n  -1 : use all cores\n   0 : let the program "
-            "decide\n > 0 : specify the number of cores\n");
-  } else if (cores > max_cores) {
+  if (cores < -1)
     throw std::system_error(EINVAL, std::generic_category(),
-                            "Invalid value " + std::to_string(cores) +
-                                " for 'cores', exceeds the maximum number of available cores, " +
-                                std::to_string(max_cores));
-  }
-}
\ No newline at end of file
+                            "Invalid cores value: " + std::to_string(cores));
+  if (cores > max_cores)
+    throw std::system_error(EINVAL, std::generic_category(),
+                            "Requested cores exceed available hardware");
+}

From bf8478255f837ebd0482f99a7df342feed7d598d Mon Sep 17 00:00:00 2001
From: Izaskun Mallona <izaskun.mallona@gmail.com>
Date: Thu, 15 Jan 2026 14:51:46 +0100
Subject: [PATCH 3/9] Rollback default for skip-header (separate and joint)

---
 method/src/boost.cpp | 40 +++++++++++++---------------------------
 1 file changed, 13 insertions(+), 27 deletions(-)

diff --git a/method/src/boost.cpp b/method/src/boost.cpp
index a7055a0..594c578 100644
--- a/method/src/boost.cpp
+++ b/method/src/boost.cpp
@@ -43,20 +43,20 @@ po::variables_map parseCommandLine(int argc, char **argv) {
 
     ("skip-header-all,skip-header",
       po::value<unsigned int>()->implicit_value(1),
-      "Header lines to skip in all input files (default: 1). "
+      "Header lines to skip in all input files (default: 0). "
       "Synonyms: --skip-header-all, --skip-header.")
 
     ("skip-header-cytosine_report,skip-header-cell",
       po::value<unsigned int>()->implicit_value(1),
-      "Header lines to skip in cytosine_report/cell files (default: 1).")
+      "Header lines to skip in cytosine_report/cell files (default: 0).")
 
     ("skip-header-cytosine_locations,skip-header-reference",
       po::value<unsigned int>()->implicit_value(1),
-      "Header lines to skip in cytosine_locations/reference file (default: 1).")
+      "Header lines to skip in cytosine_locations/reference file (default: 0).")
 
     ("skip-header-regions,skip-header-features,skip-header-target,skip-header-intervals",
       po::value<unsigned int>()->implicit_value(1),
-      "Header lines to skip in regions/features/target/intervals file (default: 1).");
+      "Header lines to skip in regions/features/target/intervals file (default: 0).");
   // clang-format on
 
   po::options_description out("output");
@@ -150,39 +150,25 @@ std::string getIntervals(const po::variables_map &vm) {
 }
 
 unsigned int getSkipHeaderTemplate(const po::variables_map &vm, const std::string &file_type) {
-  if (vm.count(file_type))
+  if (vm.count(file_type)) {
     return vm[file_type].as<unsigned int>();
-  if (vm.count("skip-header-all"))
-    return vm["skip-header-all"].as<unsigned int>();
-  return 1;
+  } else if (vm.count("skip-header")) {
+    return vm["skip-header"].as<unsigned int>();
+  } else {
+    return 0;
+  }
 }
 
 unsigned int getSkipHeaderCell(const po::variables_map &vm) {
-  if (vm.count("skip-header-cell"))
-    return vm["skip-header-cell"].as<unsigned int>();
-  if (vm.count("skip-header-cytosine_report"))
-    return vm["skip-header-cytosine_report"].as<unsigned int>();
-  return getSkipHeaderTemplate(vm, "skip-header-all");
+  return vm["skip-header-cell"].as<unsigned int>();
 }
 
 unsigned int getSkipHeaderReference(const po::variables_map &vm) {
-  if (vm.count("skip-header-reference"))
-    return vm["skip-header-reference"].as<unsigned int>();
-  if (vm.count("skip-header-cytosine_locations"))
-    return vm["skip-header-cytosine_locations"].as<unsigned int>();
-  return getSkipHeaderTemplate(vm, "skip-header-all");
+  return vm["skip-header-reference"].as<unsigned int>();
 }
 
 unsigned int getSkipHeaderIntervals(const po::variables_map &vm) {
-  if (vm.count("skip-header-intervals"))
-    return vm["skip-header-intervals"].as<unsigned int>();
-  if (vm.count("skip-header-regions"))
-    return vm["skip-header-regions"].as<unsigned int>();
-  if (vm.count("skip-header-features"))
-    return vm["skip-header-features"].as<unsigned int>();
-  if (vm.count("skip-header-target"))
-    return vm["skip-header-target"].as<unsigned int>();
-  return getSkipHeaderTemplate(vm, "skip-header-all");
+  return vm["skip-header-intervals"].as<unsigned int>();
 }
 
 std::string getDetOut(const po::variables_map &vm) {

From fe799e013e70f283fafdc733ddc8e01700d97a57 Mon Sep 17 00:00:00 2001
From: Izaskun Mallona <izaskun.mallona@gmail.com>
Date: Thu, 15 Jan 2026 14:57:12 +0100
Subject: [PATCH 4/9] Update getter

---
 method/src/boost.cpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/method/src/boost.cpp b/method/src/boost.cpp
index 594c578..2b39d25 100644
--- a/method/src/boost.cpp
+++ b/method/src/boost.cpp
@@ -160,15 +160,15 @@ unsigned int getSkipHeaderTemplate(const po::variables_map &vm, const std::strin
 }
 
 unsigned int getSkipHeaderCell(const po::variables_map &vm) {
-  return vm["skip-header-cell"].as<unsigned int>();
+  return getSkipHeaderTemplate(vm, "skip-header-cell");
 }
 
 unsigned int getSkipHeaderReference(const po::variables_map &vm) {
-  return vm["skip-header-reference"].as<unsigned int>();
+  return getSkipHeaderTemplate(vm, "skip-header-reference");
 }
 
 unsigned int getSkipHeaderIntervals(const po::variables_map &vm) {
-  return vm["skip-header-intervals"].as<unsigned int>();
+  return getSkipHeaderTemplate(vm, "skip-header-intervals");
 }
 
 std::string getDetOut(const po::variables_map &vm) {

From 7fed06a8fdda01b3d33a830fa101c9d789192008 Mon Sep 17 00:00:00 2001
From: Izaskun Mallona <izaskun.mallona@gmail.com>
Date: Fri, 30 Jan 2026 08:59:17 +0100
Subject: [PATCH 5/9] Download sines/lines and add them to the crc featureset

---
 workflow/rules/crc.smk                    |  2 ++
 workflow/rules/hg19.smk                   | 17 +++++++++++++++++
 workflow/rules/mm10.smk                   | 21 +++++++++++++++++++--
 workflow/rules/src/download_hg19_lines.sh | 12 ++++++++++++
 workflow/rules/src/download_hg19_sines.sh | 12 ++++++++++++
 workflow/rules/src/download_mm10_lines.sh | 12 ++++++++++++
 workflow/rules/src/download_mm10_sines.sh | 12 ++++++++++++
 7 files changed, 86 insertions(+), 2 deletions(-)
 create mode 100644 workflow/rules/src/download_hg19_lines.sh
 create mode 100644 workflow/rules/src/download_hg19_sines.sh
 create mode 100644 workflow/rules/src/download_mm10_lines.sh
 create mode 100644 workflow/rules/src/download_mm10_sines.sh

diff --git a/workflow/rules/crc.smk b/workflow/rules/crc.smk
index e8b3d20..e2ada93 100644
--- a/workflow/rules/crc.smk
+++ b/workflow/rules/crc.smk
@@ -45,6 +45,8 @@ ANNOTATIONS = {
     "chip": ["H3K27me3", "H3K9me3", "H3K4me3"],
     "lad": ["laminb1"],
     "genes": ["genes"],
+    "lines": ["lines"],
+    "sines": ['sines'],
     "cpgIslandExt": ['cpgIslandExt'],
     "scna": ["crc01_nc_scna", "crc01_gain_scna", "crc01_lost_scna"]
 }
diff --git a/workflow/rules/hg19.smk b/workflow/rules/hg19.smk
index 43b4800..a506d16 100644
--- a/workflow/rules/hg19.smk
+++ b/workflow/rules/hg19.smk
@@ -43,6 +43,23 @@ rule build_genome_hg19_ref:
         "src/make_ref.sh"
 
 
+rule get_hg19_lines:
+    conda:
+        op.join("..", "envs", "processing.yml")
+    output:
+        op.join(HG19_BASE, "lines.bed.gz"),
+    script:
+        "src/download_hg19_lines.sh"
+
+        
+rule get_hg19_sines:
+    conda:
+        op.join("..", "envs", "processing.yml")
+    output:
+        op.join(HG19_BASE, "sines.bed.gz"),
+    script:
+        "src/download_hg19_sines.sh"
+        
 rule get_genes_hg19:
     conda:
         op.join("..", "envs", "processing.yml")
diff --git a/workflow/rules/mm10.smk b/workflow/rules/mm10.smk
index 2d730ed..7e09ec1 100644
--- a/workflow/rules/mm10.smk
+++ b/workflow/rules/mm10.smk
@@ -6,7 +6,7 @@ CHRS = [str(i) for i in range(1, 20)] + ["X", "Y"]
 MM10_BASE = "mm10"
 
 
-rule mm10_chr_ref:
+rule mm10_per_chr_ref:
     conda:
         op.join("..", "envs", "processing.yml")
     output:
@@ -18,7 +18,7 @@ rule mm10_chr_ref:
         "src/get_chr_ref.sh"
 
 
-rule mm10_ref:
+rule mm10_aggregate_ref:
     input:
         expand(op.join(MM10_BASE, "{chr}.{{meth_pat}}.ref"), chr=CHRS),
     params:
@@ -46,7 +46,24 @@ rule get_mm10_genes:
     script:
         "src/download_mm10_genes.sh"
 
+rule get_mm10_lines:
+    conda:
+        op.join("..", "envs", "processing.yml")
+    output:
+        op.join(MM10_BASE, "lines.bed.gz"),
+    script:
+        "src/download_hg19_lines.sh"
+
+        
+rule get_mm10_sines:
+    conda:
+        op.join("..", "envs", "processing.yml")
+    output:
+        op.join(MM10_BASE, "sines.bed.gz"),
+    script:
+        "src/download_mm10_sines.sh"
 
+        
 ENCODE_MAP = {
     "h3k4me3": "ENCFF160SCR",
     "h3k9me3": "ENCFF658QTP",
diff --git a/workflow/rules/src/download_hg19_lines.sh b/workflow/rules/src/download_hg19_lines.sh
new file mode 100644
index 0000000..a418bc6
--- /dev/null
+++ b/workflow/rules/src/download_hg19_lines.sh
@@ -0,0 +1,12 @@
+#!/bin/bash
+##
+## Downloads RepeatMasker (rmsk) for hg19 and extracts only SINE elements
+##
+
+mysql --user=genome --host=genome-mysql.cse.ucsc.edu -N -s -e \
+  'SELECT genoName, genoStart, genoEnd, repName, repClass, repFamily
+     FROM hg19.rmsk
+     WHERE repClass = "LINE";' |
+  awk 'BEGIN{OFS="\t"} {print $1, $2, $3, $4, $5, $6}' |
+  bedtools merge -i - |
+  gzip -c > ${snakemake_output[0]}
diff --git a/workflow/rules/src/download_hg19_sines.sh b/workflow/rules/src/download_hg19_sines.sh
new file mode 100644
index 0000000..fb8689b
--- /dev/null
+++ b/workflow/rules/src/download_hg19_sines.sh
@@ -0,0 +1,12 @@
+#!/bin/bash
+##
+## Downloads RepeatMasker (rmsk) for hg19 and extracts only SINE elements
+##
+
+mysql --user=genome --host=genome-mysql.cse.ucsc.edu -N -s -e \
+  'SELECT genoName, genoStart, genoEnd, repName, repClass, repFamily
+     FROM hg19.rmsk
+     WHERE repClass = "SINE";' |
+  awk 'BEGIN{OFS="\t"} {print $1, $2, $3, $4, $5, $6}' |
+  bedtools merge -i - |
+  gzip -c > ${snakemake_output[0]}
diff --git a/workflow/rules/src/download_mm10_lines.sh b/workflow/rules/src/download_mm10_lines.sh
new file mode 100644
index 0000000..8ea0e27
--- /dev/null
+++ b/workflow/rules/src/download_mm10_lines.sh
@@ -0,0 +1,12 @@
+#!/bin/bash
+##
+## Downloads RepeatMasker (rmsk) for mm10 and extracts only SINE elements
+##
+
+mysql --user=genome --host=genome-mysql.cse.ucsc.edu -N -s -e \
+  'SELECT genoName, genoStart, genoEnd, repName, repClass, repFamily
+     FROM mm10.rmsk
+     WHERE repClass = "LINE";' |
+  awk 'BEGIN{OFS="\t"} {print $1, $2, $3, $4, $5, $6}' |
+  bedtools merge -i - |
+  gzip -c > ${snakemake_output[0]}
diff --git a/workflow/rules/src/download_mm10_sines.sh b/workflow/rules/src/download_mm10_sines.sh
new file mode 100644
index 0000000..c618543
--- /dev/null
+++ b/workflow/rules/src/download_mm10_sines.sh
@@ -0,0 +1,12 @@
+#!/bin/bash
+##
+## Downloads RepeatMasker (rmsk) for mm10 and extracts only SINE elements
+##
+
+mysql --user=genome --host=genome-mysql.cse.ucsc.edu -N -s -e \
+  'SELECT genoName, genoStart, genoEnd, repName, repClass, repFamily
+     FROM mm10.rmsk
+     WHERE repClass = "SINE";' |
+  awk 'BEGIN{OFS="\t"} {print $1, $2, $3, $4, $5, $6}' |
+  bedtools merge -i - |
+  gzip -c > ${snakemake_output[0]}

From cfec1e77c024744fae8ec7bee7e05de1304c758b Mon Sep 17 00:00:00 2001
From: Izaskun Mallona <izaskun.mallona@gmail.com>
Date: Mon, 2 Feb 2026 08:26:58 +0100
Subject: [PATCH 6/9] Restore old CLI as in master

---
 README.md            |  50 +++++------
 method/src/boost.cpp | 207 +++++++++++++++++++++++--------------------
 2 files changed, 136 insertions(+), 121 deletions(-)

diff --git a/README.md b/README.md
index 0b31a70..a81e9e9 100644
--- a/README.md
+++ b/README.md
@@ -50,43 +50,41 @@ bash build.sh
 yamet
 
 input:
-  -c [ --cytosine_report ] arg          per-cell cytosine report file(s) 
-                                        (resembling Bismark's for covered 
-                                        cytosines). Synonyms: --cell, -c.
-                                        These tab-separated files describe 
-                                        covered CpGs sorted by chromosome and 
-                                        position, with columns:
+  -c [ --cytosine_report ] arg          Per-cell cytosine report file(s) 
+                                        (Bismark-like for covered cytosines). 
+                                        Synonyms: --cytosine_report, --cell, 
+                                        -c.
+                                        Tab-separated, sorted by chromosome and
+                                        position:
                                           chr   pos   meth_reads   total_reads 
                                           rate
-  -r [ --cytosine_locations ] arg       genomic locations of all cytosines 
-                                        (typically CpGs). Used to account for 
-                                        missing values in cytosine 
-                                        reports.Synonyms: --cytosine_locations,
-                                        --reference, -r.
+  -r [ --cytosine_locations ] arg       Genomic locations of all cytosines 
+                                        (typically CpGs). Synonyms: 
+                                        --cytosine_locations, --reference, -r.
                                         Required to reconstruct contiguous CpG 
-                                        sequences. Columns:
+                                        sequences.
+                                        Columns:
                                           chr   pos
   -i [ --regions ] arg                  BED file defining genomic regions where
-                                        entropies will be computed (both per 
-                                        feature and globally)Synonyms: 
+                                        entropies will be computed. Synonyms: 
                                         --regions, --features, --target, 
                                         --intervals, -i.
                                         Columns:
                                           chr   start   end
-  --skip-header-all [=arg(=1)]          Number of header lines to skip in all 
-                                        input files (default: 1). Synonyms: 
+  --skip-header-all [=arg(=1)]          Header lines to skip in all input files
+                                        (default: 0). Synonyms: 
                                         --skip-header-all, --skip-header.
   --skip-header-cytosine_report [=arg(=1)]
-                                        Number of header lines to skip in 
+                                        Header lines to skip in 
                                         cytosine_report/cell files (default: 
-                                        1).
+                                        0).
   --skip-header-cytosine_locations [=arg(=1)]
-                                        Number of header lines to skip in 
+                                        Header lines to skip in 
                                         cytosine_locations/reference file 
-                                        (default: 1).
-  --skip-header-regions [=arg(=1)]      Number of header lines to skip in 
+                                        (default: 0).
+  --skip-header-regions [=arg(=1)]      Header lines to skip in 
                                         regions/features/target/intervals file 
-                                        (default: 1).
+                                        (default: 0).
 
 output:
   -d [ --det-out ] arg                  (optional) path to detailed output file
@@ -94,11 +92,10 @@ output:
                                         output file
   -m [ --meth-out ] arg                 (optional) path to average methylation 
                                         output file
-  --all-meth [=arg(=true)] (=false)     if true, include all CpGs in 
+  --all-meth [=arg(=true)] (=false)     If true, include all CpGs in 
                                         methylation summaries, including those 
                                         not used for template construction 
                                         (default: false).
-  --all-meth [=arg(=true)] (=false)
   -o [ --out ] arg                      (optional) path to simple output file
 
 resource utilisation:
@@ -107,9 +104,7 @@ resource utilisation:
   --chunk-size arg (=64K)               size of the buffer (per file) used for 
                                         reading data. Can be specified as a 
                                         positive integer (bytes) or with a 
-                                        suffix: B (bytes), K (kilobytes), M 
-                                        (megabytes), G (gigabytes). Example: 
-                                        4096, 64K, 128M, 2G
+                                        suffix: B, K, M, G.
 
 verbose:
   --print-intervals                     print parsed intervals file
@@ -120,6 +115,7 @@ misc:
   -h [ --help ]                         print help message
   --version                             current version information
 
+
 ```
 
 ## Repository
diff --git a/method/src/boost.cpp b/method/src/boost.cpp
index 2b39d25..9c69807 100644
--- a/method/src/boost.cpp
+++ b/method/src/boost.cpp
@@ -10,53 +10,55 @@
 #include "boost.h"
 #include "version.h"
 
-namespace po = boost::program_options;
-
 po::variables_map parseCommandLine(int argc, char **argv) {
   po::variables_map vm;
 
   po::options_description inp("input");
   // clang-format off
   inp.add_options()
-
-    ("cytosine_report,cell,c",
-      po::value<std::vector<std::string>>()->composing(),
-      "Per-cell cytosine report file(s) (Bismark-like for covered cytosines). "
-      "Synonyms: --cytosine_report, --cell, -c.\n"
-      "Tab-separated, sorted by chromosome and position:\n"
-      "  chr   pos   meth_reads   total_reads   rate")
-
-    ("cytosine_locations,reference,r",
-      po::value<std::string>(),
-      "Genomic locations of all cytosines (typically CpGs). "
-      "Synonyms: --cytosine_locations, --reference, -r.\n"
-      "Required to reconstruct contiguous CpG sequences.\n"
-      "Columns:\n"
-      "  chr   pos")
-
-    ("regions,features,target,intervals,i",
-      po::value<std::string>(),
-      "BED file defining genomic regions where entropies will be computed. "
-      "Synonyms: --regions, --features, --target, --intervals, -i.\n"
-      "Columns:\n"
-      "  chr   start   end")
-
-    ("skip-header-all,skip-header",
-      po::value<unsigned int>()->implicit_value(1),
-      "Header lines to skip in all input files (default: 0). "
-      "Synonyms: --skip-header-all, --skip-header.")
-
-    ("skip-header-cytosine_report,skip-header-cell",
-      po::value<unsigned int>()->implicit_value(1),
-      "Header lines to skip in cytosine_report/cell files (default: 0).")
-
-    ("skip-header-cytosine_locations,skip-header-reference",
-      po::value<unsigned int>()->implicit_value(1),
-      "Header lines to skip in cytosine_locations/reference file (default: 0).")
-
-    ("skip-header-regions,skip-header-features,skip-header-target,skip-header-intervals",
-      po::value<unsigned int>()->implicit_value(1),
-      "Header lines to skip in regions/features/target/intervals file (default: 0).");
+    ("cell,c", po::value<std::vector<std::string>>()->composing(), 
+        "tab separated files, sorted by chromosome and position, for "
+        "different cells in the following format\n"
+        "\n"
+        " chr1    5    0    2    0\n"
+        " chr1    9    1    1    1\n"
+        " chr2    2    3    4    1\n"
+        "\n"
+        "where the columns are the chromosome, position, number of methylated reads, "
+        "total number of reads and the rate respectively")
+    ("reference,r", po::value<std::string>(),
+        "tab separated file, sorted by chromosome and position, for "
+        "reference sites in the following format\n"
+        "\n"
+        " chr1    5\n"
+        " chr1    7\n"
+        " chr1    9\n"
+        " chr1    11\n"
+        " chr2    2\n"
+        " chr2    4\n"
+        "\n"
+        "where the columns are the chromosome and start position respectively")
+    ("intervals,i", po::value<std::string>(),
+        "bed file, sorted by chromosome and start position, for "
+        "intervals of interest in the following format\n"
+        "\n"
+        " chr1    5     7\n"
+        " chr1    10    30\n"
+        " chr2    1     6\n"
+        "\n"
+        "where the columns are the chromosome, start position and the end position respectively")
+    ("skip-header", po::value<unsigned int>()->implicit_value(1),
+        "integer value indicating number of lines to skip in all file inputs"
+        "(this is a default value which can be overriden by other 'skip-header-*' options)")
+    ("skip-header-cell", po::value<unsigned int>()->implicit_value(1),
+        "integer value indicating number of lines to skip in the cell files"
+        "(overrides 'skip-header' if provided)")
+    ("skip-header-reference", po::value<unsigned int>()->implicit_value(1),
+        "integer value indicating number of lines to skip in the reference file"
+        "(overrides 'skip-header' if provided)")
+    ("skip-header-intervals", po::value<unsigned int>()->implicit_value(1),
+        "integer value indicating number of lines to skip in the intervals file"
+        "(overrides 'skip-header' if provided)");
   // clang-format on
 
   po::options_description out("output");
@@ -65,10 +67,7 @@ po::variables_map parseCommandLine(int argc, char **argv) {
     ("det-out,d", po::value<std::string>(), "(optional) path to detailed output file")
     ("norm-det-out,n", po::value<std::string>(), "(optional) path to detailed normalized output file")
     ("meth-out,m", po::value<std::string>(), "(optional) path to average methylation output file")
-    ("all-meth",
-        po::value<std::string>()->default_value("false")->implicit_value("true"),
-        "If true, include all CpGs in methylation summaries, including those "
-        "not used for template construction (default: false).")
+    ("all-meth", po::value<std::string>()->default_value("false")->implicit_value("true"))
     ("out,o", po::value<std::string>(), "(optional) path to simple output file");
   // clang-format on
 
@@ -88,13 +87,14 @@ po::variables_map parseCommandLine(int argc, char **argv) {
         "number of cores used for simultaneously parsing methylation files")
     ("chunk-size", po::value<std::string>()->default_value("64K")->notifier(validate_chunk_size),
         "size of the buffer (per file) used for reading data. Can be specified as a "
-        "positive integer (bytes) or with a suffix: B, K, M, G.");
+        "positive integer (bytes) or with a suffix: B (bytes), K (kilobytes), "
+        "M (megabytes), G (gigabytes). Example: 4096, 64K, 128M, 2G");
   // clang-format on
 
   po::options_description mis("misc");
   // clang-format off
   mis.add_options()
-    ("help,h", "print help message")
+    ("help,h", "produce help message")
     ("version", "current version information");
   // clang-format on
 
@@ -118,35 +118,27 @@ po::variables_map parseCommandLine(int argc, char **argv) {
   return vm;
 }
 
+// Helper get functions for different arguments
+
 std::vector<std::string> getCellFiles(const po::variables_map &vm) {
-  if (vm.count("cell"))
-    return vm["cell"].as<std::vector<std::string>>();
-  if (vm.count("cytosine_report"))
-    return vm["cytosine_report"].as<std::vector<std::string>>();
-  throw std::system_error(EINVAL, std::generic_category(),
-                          "No cell/cytosine_report files provided");
+  if (!vm.count("cell")) {
+    throw std::system_error(EINVAL, std::generic_category(), "No cell coverage files provided");
+  }
+  return vm["cell"].as<std::vector<std::string>>();
 }
 
 std::string getRef(const po::variables_map &vm) {
-  if (vm.count("reference"))
-    return vm["reference"].as<std::string>();
-  if (vm.count("cytosine_locations"))
-    return vm["cytosine_locations"].as<std::string>();
-  throw std::system_error(EINVAL, std::generic_category(),
-                          "No reference/cytosine_locations file provided");
+  if (!vm.count("reference")) {
+    throw std::system_error(EINVAL, std::generic_category(), "No reference file provided");
+  }
+  return vm["reference"].as<std::string>();
 }
 
 std::string getIntervals(const po::variables_map &vm) {
-  if (vm.count("intervals"))
-    return vm["intervals"].as<std::string>();
-  if (vm.count("regions"))
-    return vm["regions"].as<std::string>();
-  if (vm.count("features"))
-    return vm["features"].as<std::string>();
-  if (vm.count("target"))
-    return vm["target"].as<std::string>();
-  throw std::system_error(EINVAL, std::generic_category(),
-                          "No intervals/regions/features/target file provided");
+  if (!vm.count("intervals")) {
+    throw std::system_error(EINVAL, std::generic_category(), "No intervals file provided");
+  }
+  return vm["intervals"].as<std::string>();
 }
 
 unsigned int getSkipHeaderTemplate(const po::variables_map &vm, const std::string &file_type) {
@@ -184,8 +176,12 @@ std::string getMethOut(const po::variables_map &vm) {
 }
 
 bool allMeth(const po::variables_map &vm) {
-  char t = std::tolower(vm["all-meth"].as<std::string>()[0]);
-  return (t == 't' || t == '1');
+  char t = (char)std::tolower(vm["all-meth"].as<std::string>()[0]);
+  if (t == 't' || t == '1') {
+    return true;
+  } else {
+    return false;
+  }
 }
 
 std::string getOut(const po::variables_map &vm) {
@@ -194,42 +190,58 @@ std::string getOut(const po::variables_map &vm) {
 
 unsigned int getCores(const po::variables_map &vm) {
   const unsigned int max_cores = std::thread::hardware_concurrency();
-  const int c = vm["cores"].as<int>();
-  if (c == -1) return max_cores;
-  if (c == 0)  return max_cores - (unsigned int)(std::floor(std::log2(max_cores)));
-  return c;
+  const int          c         = vm["cores"].as<int>();
+  if (c == -1) {
+    return max_cores;
+  } else if (c == 0) {
+    return max_cores - (unsigned int)(std::floor(std::log2(max_cores)));
+  } else {
+    return c;
+  }
 }
 
 unsigned int getChunkSize(const po::variables_map &vm) {
   static const std::regex pattern(R"(^(\d+(\.\d+)?)(B|K|M|G)?$)", std::regex::icase);
-  std::smatch match;
-  const auto chunk_size = vm["chunk-size"].as<std::string>();
+  std::smatch             match;
+  const auto              chunk_size = vm["chunk-size"].as<std::string>();
 
   std::regex_match(chunk_size, match, pattern);
 
-  double num = std::stod(match[1].str());
-  char unit = match[3].str().empty() ? 'B' : std::toupper(match[3].str()[0]);
+  double num  = std::stod(match[1].str());
+  char   unit = match[3].str().empty() ? 'B' : std::toupper(match[3].str()[0]);
 
   switch (unit) {
-    case 'K': num *= 1024; break;
-    case 'M': num *= 1024 * 1024; break;
-    case 'G': num *= 1024 * 1024 * 1024; break;
+  case 'K':
+    num *= 1024;
+    break;
+  case 'M':
+    num *= 1024 * 1024;
+    break;
+  case 'G':
+    num *= 1024 * 1024 * 1024;
+    break;
   }
-
-  if (num < 1)
+  if (num < 1) {
     throw std::system_error(EINVAL, std::generic_category(),
-                            "Invalid chunk_size < 1 byte");
-
+                            "Invalid value " + chunk_size + " for 'chunk_size', less than 1 byte");
+  }
   return static_cast<unsigned int>(num);
 }
 
 bool printSampens(const po::variables_map &vm) {
-  char t = std::tolower(vm["print-sampens"].as<std::string>()[0]);
-  return (t == 't' || t == '1');
+  char t = (char)std::tolower(vm["print-sampens"].as<std::string>()[0]);
+  if (t == 't' || t == '1') {
+    return true;
+  } else {
+    return false;
+  }
 }
 
+// Validators
+
 void validate_chunk_size(const std::string &chunk_size) {
   static const std::regex pattern(R"(^(\d+(\.\d+)?)(B|K|M|G)?$)", std::regex::icase);
+
   if (!std::regex_match(chunk_size, pattern)) {
     throw std::system_error(EINVAL, std::generic_category(),
                             "Invalid format " + chunk_size + " for 'chunk_size'");
@@ -238,10 +250,17 @@ void validate_chunk_size(const std::string &chunk_size) {
 
 void validate_cores(const int cores) {
   const int max_cores = std::thread::hardware_concurrency();
-  if (cores < -1)
-    throw std::system_error(EINVAL, std::generic_category(),
-                            "Invalid cores value: " + std::to_string(cores));
-  if (cores > max_cores)
+
+  if (cores < -1) {
+    throw std::system_error(
+        EINVAL, std::generic_category(),
+        "Invalid value " + std::to_string(cores) +
+            " for 'cores'. Allowed values are\n  -1 : use all cores\n   0 : let the program "
+            "decide\n > 0 : specify the number of cores\n");
+  } else if (cores > max_cores) {
     throw std::system_error(EINVAL, std::generic_category(),
-                            "Requested cores exceed available hardware");
-}
+                            "Invalid value " + std::to_string(cores) +
+                                " for 'cores', exceeds the maximum number of available cores, " +
+                                std::to_string(max_cores));
+  }
+}
\ No newline at end of file

From 007bf10c3f4cad5adce16a2ccc45705a2962caa8 Mon Sep 17 00:00:00 2001
From: Izaskun Mallona <izaskun.mallona@gmail.com>
Date: Mon, 2 Feb 2026 09:34:03 +0100
Subject: [PATCH 7/9] Switch to normalized (sample?) entropies and hopefully
 also Shannons, fix rmsk files

---
 workflow/Snakefile      |  8 ++++++--
 workflow/rules/crc.smk  | 14 ++++++++++----
 workflow/rules/hg19.smk | 14 ++++++++++++--
 3 files changed, 28 insertions(+), 8 deletions(-)

diff --git a/workflow/Snakefile b/workflow/Snakefile
index 832011d..311a818 100644
--- a/workflow/Snakefile
+++ b/workflow/Snakefile
@@ -34,7 +34,7 @@ rule all:
         # op.join("annotation", "mm10", "done.flag"),  ## this to download regions for Ecker's
         # op.join("data", "crc", "download.flag"),  # this to download CRC cytosine reports, caution will redownload them
         # op.join("data", "crc", "results",  "crc.html") , ## this requires the `download.flag` being present (and the cytosine reports; dependencies are not well captured)
-        # op.join(op.expanduser("~"), ".local", "yamet", "bin", "yamet")
+        op.join(op.expanduser("~"), ".local", "yamet", "bin", "yamet"),
         # op.join('scnmt_gastrulation_data', 'downloaded_data.tsv'), ## this is Argelaguet's gastrulation
         # expand(
         #     op.join(
@@ -58,12 +58,16 @@ rule all:
 
 rule install_yamet:
     """
-    Binary to goes to ~/.local/yamet; export path accordingly
+    Binary to goes to ~/.local/yamet; exported path accordingly
     """
     conda:
         "envs/yamet.yml"
     output:
         op.join(op.expanduser("~"), ".local", "yamet", "bin", "yamet")
+    priority:
+        100
+    threads:
+        workflow.cores
     log:
         op.join("logs", "build.log")
     shell:
diff --git a/workflow/rules/crc.smk b/workflow/rules/crc.smk
index e2ada93..3197786 100644
--- a/workflow/rules/crc.smk
+++ b/workflow/rules/crc.smk
@@ -226,10 +226,12 @@ rule run_yamet_on_separate_features:
     output:
         simple_uncomp = temp(op.join(CRC_OUTPUT, "{subcat}_{cat}_{patient}_{location}.out")),
         det_uncomp = temp(op.join(CRC_OUTPUT, "{subcat}_{cat}_{patient}_{location}.det.out")),
+        norm_det_uncomp=temp(op.join(CRC_WINDOWS_OUTPUT, "{subcat}_{cat}_{patient}_{location}.norm.det.out")),
         meth_uncomp = temp(op.join(CRC_OUTPUT, "{subcat}_{cat}_{patient}_{location}.meth.out")),
         simple=op.join(CRC_OUTPUT, "{subcat}_{cat}_{patient}_{location}.out.gz"),
         det=op.join(CRC_OUTPUT, "{subcat}_{cat}_{patient}_{location}.det.out.gz"),
-        meth=op.join(CRC_OUTPUT, "{subcat}_{cat}_{patient}_{location}.meth.out.gz")
+        meth=op.join(CRC_OUTPUT, "{subcat}_{cat}_{patient}_{location}.meth.out.gz"),
+        norm_det=op.join(CRC_OUTPUT, "{subcat}_{cat}_{patient}_{location}.norm.det.out.gz")
     log:
         op.join('logs', 'yamet_{subcat}_{cat}_{patient}_{location}.log')
     group:
@@ -248,7 +250,8 @@ rule run_yamet_on_separate_features:
          --print-sampens F \
          --out {output.simple_uncomp} \
          --det-out {output.det_uncomp} \
-         --meth-out {output.meth_uncomp} &> {log}
+         --meth-out {output.meth_uncomp} \
+         --norm-det-out {output.norm_det_uncomp} &> {log}
         
         gzip --keep -f {params.path}/{wildcards.subcat}_{wildcards.cat}_{wildcards.patient}_{wildcards.location}*out  &>> {log}
         """
@@ -378,9 +381,11 @@ rule run_yamet_on_windows:
         simple_uncomp=temp(op.join(CRC_WINDOWS_OUTPUT, "{win_size}_{patient}_{location}.out")),
         det_uncomp=temp(op.join(CRC_WINDOWS_OUTPUT, "{win_size}_{patient}_{location}.det.out")),
         meth_uncomp=temp(op.join(CRC_WINDOWS_OUTPUT, "{win_size}_{patient}_{location}.meth.out")),
+        norm_det_uncomp=temp(op.join(CRC_WINDOWS_OUTPUT, "{win_size}_{patient}_{location}.norm.det.out")),        
         simple=op.join(CRC_WINDOWS_OUTPUT, "{win_size}_{patient}_{location}.out.gz"),
         det=op.join(CRC_WINDOWS_OUTPUT, "{win_size}_{patient}_{location}.det.out.gz"),
-        meth=op.join(CRC_WINDOWS_OUTPUT, "{win_size}_{patient}_{location}.meth.out.gz")
+        meth=op.join(CRC_WINDOWS_OUTPUT, "{win_size}_{patient}_{location}.meth.out.gz"),
+        norm_det=op.join(CRC_WINDOWS_OUTPUT, "{win_size}_{patient}_{location}.norm.det.out.gz")
     log:
         op.join('logs', 'yamet_{win_size}_{patient}_{location}.log')
     group:
@@ -399,7 +404,8 @@ rule run_yamet_on_windows:
          --print-sampens F \
          --out {output.simple_uncomp} \
          --det-out {output.det_uncomp} \
-         --meth-out {output.meth_uncomp} &> {log}
+         --meth-out {output.meth_uncomp} \
+         --norm-det-out {output.norm_det_uncomp} &> {log}
 
         gzip --keep -f {params.path}/{wildcards.win_size}_{wildcards.patient}_{wildcards.location}*out  &>> {log}
         """
diff --git a/workflow/rules/hg19.smk b/workflow/rules/hg19.smk
index a506d16..85f36a0 100644
--- a/workflow/rules/hg19.smk
+++ b/workflow/rules/hg19.smk
@@ -47,7 +47,7 @@ rule get_hg19_lines:
     conda:
         op.join("..", "envs", "processing.yml")
     output:
-        op.join(HG19_BASE, "lines.bed.gz"),
+        op.join(HG19_BASE, "rmsk.lines.bed.gz"),
     script:
         "src/download_hg19_lines.sh"
 
@@ -56,7 +56,7 @@ rule get_hg19_sines:
     conda:
         op.join("..", "envs", "processing.yml")
     output:
-        op.join(HG19_BASE, "sines.bed.gz"),
+        op.join(HG19_BASE, "rmsk.sines.bed.gz"),
     script:
         "src/download_hg19_sines.sh"
         
@@ -68,6 +68,16 @@ rule get_genes_hg19:
     script:
         "src/download_hg19_genes.sh"
 
+rule uncompress_repeats:
+    conda:
+        op.join("..", "envs", "processing.yml")
+    input:
+        op.join(HG19_BASE, "rmsk.{type}.bed.gz"),        
+    output:
+        op.join(HG19_BASE, "{type}.{type}.bed"),
+    shell:
+        "gunzip -c {input} > {output}"
+
 
 rule uncompress_hg19_genes:
     conda:

From 38914cf20fada5153f76e2c61ee5f10e0f5aed90 Mon Sep 17 00:00:00 2001
From: Izaskun Mallona <izaskun.mallona@gmail.com>
Date: Mon, 2 Feb 2026 14:10:22 +0100
Subject: [PATCH 8/9] Fix output

---
 workflow/rules/crc.smk | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/workflow/rules/crc.smk b/workflow/rules/crc.smk
index 3197786..1ba5689 100644
--- a/workflow/rules/crc.smk
+++ b/workflow/rules/crc.smk
@@ -226,7 +226,7 @@ rule run_yamet_on_separate_features:
     output:
         simple_uncomp = temp(op.join(CRC_OUTPUT, "{subcat}_{cat}_{patient}_{location}.out")),
         det_uncomp = temp(op.join(CRC_OUTPUT, "{subcat}_{cat}_{patient}_{location}.det.out")),
-        norm_det_uncomp=temp(op.join(CRC_WINDOWS_OUTPUT, "{subcat}_{cat}_{patient}_{location}.norm.det.out")),
+        norm_det_uncomp=temp(op.join(CRC_OUTPUT, "{subcat}_{cat}_{patient}_{location}.norm.det.out")),
         meth_uncomp = temp(op.join(CRC_OUTPUT, "{subcat}_{cat}_{patient}_{location}.meth.out")),
         simple=op.join(CRC_OUTPUT, "{subcat}_{cat}_{patient}_{location}.out.gz"),
         det=op.join(CRC_OUTPUT, "{subcat}_{cat}_{patient}_{location}.det.out.gz"),

From 80275c405630e912d1adee83488dbbd31068cae5 Mon Sep 17 00:00:00 2001
From: Izaskun Mallona <izaskun.mallona@gmail.com>
Date: Mon, 2 Feb 2026 16:53:16 +0100
Subject: [PATCH 9/9] Add normalized data, and Shannon entropies, to the
 per-feature plots

---
 workflow/rules/crc.smk     |   2 +
 workflow/rules/src/crc.Rmd | 463 +++++++++++++++++++++++++------------
 2 files changed, 323 insertions(+), 142 deletions(-)

diff --git a/workflow/rules/crc.smk b/workflow/rules/crc.smk
index 1ba5689..080327b 100644
--- a/workflow/rules/crc.smk
+++ b/workflow/rules/crc.smk
@@ -272,6 +272,8 @@ rule render_crc_report:
         list_relevant_yamet_outputs()
     params:
         output_path=CRC_OUTPUT
+    threads:
+        round(workflow.cores/2)
     output:
         op.join(CRC, "results", "crc.html")
     log:
diff --git a/workflow/rules/src/crc.Rmd b/workflow/rules/src/crc.Rmd
index 4344c1d..2f3b503 100644
--- a/workflow/rules/src/crc.Rmd
+++ b/workflow/rules/src/crc.Rmd
@@ -49,6 +49,7 @@ opts_chunk$set(
 
 ```{r save_snmk_object_for_debugging, include = FALSE}
 saveRDS(file = 'snmk_object_features.rds', object = snakemake)
+## snakemake <- readRDS('../../snmk_object_features.rds')
 ```
 
 
@@ -58,11 +59,9 @@ sink(log)
 sink(log, type = "message")
 ```
 
-# Short (global) reports by annotation  {.tabset .tabset-pills}
 
-## Modelling
 
-Firstly, we read in the yamet output files and create a table storing the biopsy location, patient id and genomic segmentation.
+These are not det.out.gz, but only out.gz files.
 
 ```{r, import_short_reports}
 filepaths <- list.files(
@@ -71,9 +70,9 @@ filepaths <- list.files(
 )
 
 filepaths <- filepaths[!grepl("\\.det\\.out\\.gz$", filepaths)]
-filepaths <- filepaths[!grepl("\\.norm\\.out\\.gz$", filepaths)]
 filepaths <- filepaths[!grepl("\\.meth\\.out\\.gz$", filepaths)]
 
+
 regex <- "(.*)_(.*)_(.*)_(.*).out.gz"
 process_file <- function(filepath) {
   filename <- basename(filepath)
@@ -98,17 +97,6 @@ files$location <- factor(
 )
 ```
 
-Patient as random effect
-
-```{r, lmer}
-lmer.files <- lmer(
-  sampen ~ location * avg_meth + (1 | patient),
-  data = subset(files, annotation == "pmds")
-)
-summary(lmer.files)
-anova(lmer.files, type = 3)
-```
-
 ## Scatters by location and annotation
  
 ```{r}
@@ -262,169 +250,360 @@ ggplot(ranges, aes(color = location)) +
 
 ```
 
-## Others
+# After normalization - same plots (the yamet-way)
 
-```{r, fig.height = 10, fig.width = 10}
+That is, not by locus but by cell norm sampEns (that are calculated according to some regionset anyway)
 
-ggplot(ranges, aes(x = compartment, y = location, fill = sampen_mean)) +
-  geom_tile(color = "white") +
-  scale_fill_viridis_c() +
-  theme_minimal() +
-  theme(axis.text.x = element_text(angle = 90))
+```{r, import_short_reports_norm}
+filepaths <- list.files(
+  snakemake@params[["output_path"]],
+  pattern = "out\\.gz$", full.names = TRUE)
 
 
-ggplot(ranges, aes(x = compartment, y = location, fill = meth_mean)) +
-  geom_tile(color = "white") +
-  scale_fill_viridis_c() +
-  theme_minimal() +
-  theme(axis.text.x = element_text(angle = 90))
+filepaths <- filepaths[!grepl("\\.det\\.out\\.gz$", filepaths)]
+filepaths <- filepaths[!grepl("\\.norm\\.out\\.gz$", filepaths)]
+filepaths <- filepaths[!grepl("\\.meth\\.out\\.gz$", filepaths)]
+
+head(filepaths)
+regex <- "(.*)_(.*)_(.*)_(.*).out.gz"
+process_file <- function(filepath) {
+  filename <- basename(filepath)
+
+  dt <- read_tsv(filepath, na = "-1", show_col_types = FALSE)
+  dt <- dt[, c("sampen_norm", "avg_meth")]
+  colnames(dt) <- c('sampen', 'avg_meth') ## caution they're normalized though
+
+  dt$annotation <- gsub(regex, "\\1", filename)
+  dt$patient <- gsub(regex, "\\3", filename)
+  dt$location <- gsub(regex, "\\4", filename)
+
+  return(dt)
+}
+
+head(filepaths)
+
+files <- do.call(rbind, bplapply(filepaths, process_file, BPPARAM = param))
+
+files$annotation <- as.factor(files$annotation)
+files$patient <- as.factor(files$patient)
+files$location <- factor(
+  files$location,
+  levels = c("NC", "PT", "LN", "ML", "MP", "MO")
+)
+```
+
+## Scatters by location and annotation
+ 
+```{r}
+ggplot(
+  ## subset(files, patient == "CRC01"),
+  files,
+  aes(y = sampen, x = avg_meth, color = location, shape = patient)
+) +
+  geom_point(size = 0.9, alpha = 0.2) +
+  scale_color_brewer(palette = "Set1") +
+  scale_shape_manual(values = 1:nlevels(files$patient)) +
+  facet_wrap(~annotation) +
+  labs(
+    x = "average methylation",
+    y = "norm sample entropy"
+  ) +
+  guides(color = guide_legend(override.aes = list(size = 2))) +
+  theme_ng()
+```
 
+## Scatters by location and annotation and patient
+ 
+```{r, fig.height = 15}
+ggplot(
+  ## subset(files, patient == "CRC01"),
+  files,
+  aes(y = sampen, x = avg_meth, color = location)
+) +
+  geom_point(size = 0.9, alpha = 0.2) +
+  scale_color_brewer(palette = "Set1") +
+  facet_grid(annotation ~ patient) +
+  labs(
+    x = "average methylation",
+    y = "norm sample entropy"
+  ) +
+  guides(color = guide_legend(override.aes = list(size = 2))) +
+  theme_ng() +
+  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))
 ```
 
-# Full (local) reports for **downsampled** PMDs and CRC01
+## Scatters by location and annotation and patient (shorter)
+ 
+```{r, fig.height = 3.2}
+ggplot(
+  subset(files, annotation %in% c("laminb1", "H3K27me3", "H3K9me3")),
+  aes(y = sampen, x = avg_meth, color = location)
+) +
+  geom_point(size = 0.9, alpha = 0.2) +
+  scale_color_brewer(palette = "Set1") +
+  facet_grid(annotation ~ patient) +
+  labs(
+    x = "average methylation",
+    y = "norm sample entropy"
+  ) +
+  guides(color = guide_legend(override.aes = list(size = 2))) +
+  theme_ng() +
+  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))
+```
 
-Caution downsampled to some rows only.
+## sampEn boxes by location and annotation - crc01
 
-```{r, import_log_reports}
-DOWNSAMPLING <- 5e4
+```{r, fig.height = 6, fig.width = 10}
+ggplot(
+  subset(files, patient == "CRC01"),
+  aes(y = sampen, x = annotation, color = location)
+) +
+  geom_boxplot(
+    outlier.size = 0.6,
+    outlier.alpha = 0.3
+  ) +
+  labs(y = "norm sample entropy") +
+  geom_jitter(
+    position = position_dodge(width = 0.75),
+    size = 0.6, alpha = 0.3
+  ) +
+  scale_color_brewer(palette = "Dark2") +
+  theme_ng() +
+  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))
+```
 
-annotation <- "pmds"
-patient <- "CRC01"
+## meth boxes by location and annotation - crc01
 
-local <- list()
-for (location in c("NC", "PT", "LN", "ML", "MP")) {
-  filepaths <- list.files(
-    snakemake@params[["output_path"]],
-    pattern = "\\.det\\.out\\.gz$", full.names = TRUE
-  )
-  filepaths <- grep(location, filepaths, value = TRUE)
-  filepaths <- grep(annotation, filepaths, value = TRUE)
-  filepaths <- grep(patient, filepaths, value = TRUE)
-
-  d <- bplapply(filepaths, \(filepath) {
-    dt <- read_tsv(
-      filepath,
-      na = "-1", n_max = DOWNSAMPLING, show_col_types = FALSE
-    )
-    colnames(dt) <- basename(colnames(dt))
-    dt
-  }, BPPARAM = param)
-
-  d2 <- bplapply(d, \(x) {
-    avg_sampen <- apply(
-      x[, grep(location, colnames(x))], 1, \(y) mean(y, na.rm = TRUE)
-    )
-    median_sampen <- apply(
-      x[, grep(location, colnames(x))], 1, \(y) median(y, na.rm = TRUE)
-    )
-    q25_sampen <- apply(
-      x[, grep(location, colnames(x))], 1, \(y) quantile(y, 0.25, na.rm = TRUE)
-    )
-    q75_sampen <- apply(
-      x[, grep(location, colnames(x))], 1, \(y) quantile(y, 0.75, na.rm = TRUE)
-    )
-    cbind(x, data.frame(
-      avg_sampen = avg_sampen, median_sampen = median_sampen,
-      q25_sampen = q25_sampen, q75_sampen = q75_sampen,
-      location = location,
-      patient = patient,
-      annotation = annotation
-    ))
-  }, BPPARAM = param)
-
-  local[[location]] <- d2[[1]]
-  rm(d, d2)
-}
+```{r, fig.height = 6, fig.width = 10}
+ggplot(
+  subset(files, patient == "CRC01"),
+  aes(y = avg_meth, x = annotation, color = location)
+) +
+  geom_boxplot(
+    outlier.size = 0.6,
+    outlier.alpha = 0.3
+  ) +
+  labs(y = "average methylation") +
+  geom_jitter(
+    position = position_dodge(width = 0.75),
+    size = 0.6, alpha = 0.3
+  ) +
+  scale_color_brewer(palette = "Dark2") +
+  theme_ng() +
+  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))
 ```
 
-## Scatters {.tabset .tabset-pills}
-
-### By meth NC
-
-```{r, fig.width = 10, fig.height = 10}
-pal <- viridis(10, alpha = 0.8)
-
-beta2m <- function(betas, epsilon = 1e-05) {
-  if (!is.numeric(betas)) {
-    stop("Non-numeric betas")
-  }
-  if (!(is.numeric(epsilon) && (!is.na(epsilon)))) {
-    stop("Epison not valid")
-  }
-  if (epsilon < 0 || epsilon >= 0.5) {
-    stop("Not 0 < epsilon < 0.5")
-  }
-  betas[betas < epsilon] <- epsilon
-  betas[betas > (1 - epsilon)] <- 1 - epsilon
-  return(log2(betas / (1 - betas)))
-}
+## Range plots - by compartment
 
-pmds <- data.frame(
-  shannon_NC = local$NC$shannon,
-  meth_NC = local$NC$avg_meth,
-  med_sampen_NC = local$NC$median_sampen,
-  shannon_PT = local$PT$shannon,
-  meth_PT = local$PT$avg_meth,
-  med_sampen_PT = local$PT$median_sampen
-)
-pmds$meth_change <- pmds$meth_PT - pmds$meth_NC
-pmds <- na.omit(pmds)
+```{r, fig.height = 10, fig.width = 10}
 
-pmds$by_meth_nc <- cut(pmds$meth_NC, breaks = 10)
-pmds$by_meth_pt <- cut(pmds$meth_PT, breaks = 10)
 
-plot(pmds[, grep("by", colnames(pmds), invert = TRUE)],
-  pch = ".", col = pal[as.numeric(pmds$by_meth_nc)], main = "by_meth_nc"
-)
+ranges <- as.data.frame(files %>%
+   filter(patient == "CRC01") %>%
+   group_by(compartment = annotation, location) %>%
+   summarise(
+     sampen_mean = mean(sampen, na.rm = TRUE),
+     sampen_min  = min(sampen, na.rm = TRUE),
+     sampen_max  = max(sampen, na.rm = TRUE),
+     meth_mean   = mean(avg_meth, na.rm = TRUE),
+     meth_min    = min(avg_meth, na.rm = TRUE),
+     meth_max    = max(avg_meth, na.rm = TRUE)))
+
+## ggplot(ranges, aes(x = meth_mean, y = sampen_mean, color = location)) +
+##   geom_point(shape = 3, size = 1.2) +  # "+" symbol
+##   geom_errorbar(aes(ymin = sampen_min, ymax = sampen_max), width = 0.3, alpha = 1, size = 1) +
+##   geom_errorbarh(aes(xmin = meth_min, xmax = meth_max), height = 0.3, alpha = 1, size = 0) +
+##   scale_color_brewer(palette = "Dark2") +
+##   labs(
+##     x = "DNA methylation",
+##     y = "Sample entropy"
+##   ) +
+##   facet_wrap(~ compartment) +
+##   theme_ng()
+
+# I think this is the correct one
+ggplot(ranges, aes(color = location)) +
+  ## horizontal segment: range of methylations for that location/compartment
+  geom_segment(
+    aes(x = meth_min, xend = meth_max, y = sampen_mean, yend = sampen_mean),
+    size = 0.6, alpha = 1
+  ) +
+  # vertical segment: range of sampens for that location/compartment
+  geom_segment(
+    aes(x = meth_mean, xend = meth_mean, y = sampen_min, yend = sampen_max),
+    size = 0.6, alpha = 1
+  ) +
+  # center point
+  geom_point(
+    aes(x = meth_mean, y = sampen_mean),
+    shape = 3, size = 1.2
+  ) +
+  scale_color_brewer(palette = "Dark2") +
+  labs(x = "DNA methylation", y = "Normalized sample entropy") +
+  facet_wrap(~ compartment) +
+  theme_ng()
+
 ```
 
-### By median sampen PT
+# Unnormalized Shannon's MEDIANS DOWNSAMPLED {.tabset .tabset-pills}
 
+This cannot be done on short `out.gz` files, but on detailed ones only. Median values, not all loci included!
 
-```{r, fig.width = 10, fig.height = 10}
-pmds$by_med_sampen_pt <- cut(pmds$med_sampen_PT, breaks = 10)
-plot(pmds[, grep("by", colnames(pmds), invert = TRUE)],
-  pch = ".", col = pal[as.numeric(pmds$by_med_sampen_pt)],
-  main = "by_med_sampen_PT"
+```{r, eval = TRUE}
+filepaths <- list.files(
+  snakemake@params[["output_path"]],
+  pattern = "det\\.out\\.gz$", full.names = TRUE
 )
+
+filepaths <- filepaths[!grepl("norm", filepaths)]
+
+head(filepaths)
+
+## todo extract Shannon's here instead
+
+regex <- "(.*)_(.*)_(.*)_(.*).det.out.gz"
+
+process_file <- function(filepath) {
+  filename <- basename(filepath)
+
+  tmp <- read_tsv(filepath, na = "-1", show_col_types = FALSE, n_max = 10e4) ## caution downsampling
+
+  tibble(
+    file = filename,
+    median_shannon   = median(tmp$shannon,   na.rm = TRUE),
+    median_avg_meth  = median(tmp$avg_meth,  na.rm = TRUE),
+    annotation = gsub(regex, "\\1", filename),
+    patient    = gsub(regex, "\\3", filename),
+    location   = gsub(regex, "\\4", filename)
+  )
+}
+
+files <- bplapply(filepaths, process_file, BPPARAM = param) |> bind_rows()
+
+files$annotation <- factor(files$annotation)
+files$patient    <- factor(files$patient)
+files$location   <- factor(files$location,
+                           levels = c("NC", "PT", "LN", "ML", "MP", "MO"))
+
 ```
 
-### By median sampen NC
+## Scatters by location and annotation
+ 
+```{r}
+ggplot(
+  files,
+  aes(y = median_shannon, x = median_avg_meth, color = location, shape = patient)
+) +
+  geom_point(size = 0.9, alpha = 0.2) +
+  scale_color_brewer(palette = "Set1") +
+  scale_shape_manual(values = 1:nlevels(files$patient)) +
+  facet_wrap(~annotation) +
+  labs(
+    x = "median average methylation",
+    y = "median Shannon entropy"
+  ) +
+  guides(color = guide_legend(override.aes = list(size = 2))) +
+  theme_ng()
+```
 
-```{r, fig.width = 10, fig.height = 10, results = 'asis'}
-pmds$by_med_sampen_nc <- cut(pmds$med_sampen_NC, breaks = 10)
-plot(pmds[, grep("by", colnames(pmds), invert = TRUE)],
-  pch = ".", col = pal[as.numeric(pmds$by_med_sampen_nc)],
-  main = "by_med_sampen_nc"
-)
+## Scatters by location and annotation and patient
+ 
+```{r, fig.height = 15}
+ggplot(
+  files,
+  aes(y = median_shannon, x = median_avg_meth, color = location)
+) +
+  geom_point(size = 0.9, alpha = 0.2) +
+  ## scale_color_brewer(palette = "Set1") +
+  ## facet_grid(annotation ~ patient) +
+  labs(
+    x = "median average methylation",
+    y = "median Shannon entropy"
+  ) +
+  guides(color = guide_legend(override.aes = list(size = 2))) +
+  theme_ng() +
+  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))
 ```
 
-### By methylation change
 
-```{r, fig.width = 10, fig.height = 10, results = 'asis'}
-pmds$by_meth_change <- cut(pmds$meth_change, breaks = 10)
-plot(pmds[, grep("by", colnames(pmds), invert = TRUE)],
-  pch = ".", col = pal[as.numeric(pmds$by_meth_change)],
-  main = "by_meth_change"
+
+# Meth-normalized Shannons (the yamet way) MEDIANS {.tabset .tabset-pills}
+
+This cannot be done on short `out.gz` files, but on detailed ones only. Median values, not all loci included!
+
+```{r, eval = TRUE}
+filepaths <- list.files(
+  snakemake@params[["output_path"]],
+  pattern = "norm\\.det\\.out\\.gz$", full.names = TRUE
 )
-```
 
-Add extra discretization.
+head(filepaths)
 
-```{r}
-pmds$by_sampen_pt <- cut(pmds$med_sampen_PT, breaks = 10)
-pmds$by_sampen_nc <- cut(pmds$med_sampen_NC, breaks = 10)
-pmds$by_shannon_pt <- cut(pmds$shannon_PT, breaks = 10)
-pmds$by_shannon_nc <- cut(pmds$shannon_NC, breaks = 10)
-```
+## todo extract Shannon's here instead
+
+regex <- "(.*)_(.*)_(.*)_(.*).norm.det.out.gz"
+
+process_file <- function(filepath) {
+  filename <- basename(filepath)
 
-# SCNA checks, by cytoband
+  tmp <- read_tsv(filepath, na = "-1", show_col_types = FALSE, n_max = 10e4) # caution downsampling
 
-Is this necessary? given we don't see much of a change https://github.com/imallona/yamet/blob/9156b817e04d9294024631dc25b528e191d913f4/workflow/rules/src/crc_deletions.Rmd#L96
+  tibble(
+    file = filename,
+    median_shannon   = median(tmp$shannon_norm,   na.rm = TRUE),
+    median_avg_meth  = median(tmp$avg_meth,  na.rm = TRUE),
+    annotation = gsub(regex, "\\1", filename),
+    patient    = gsub(regex, "\\3", filename),
+    location   = gsub(regex, "\\4", filename)
+  )
+}
 
+files <- bplapply(filepaths, process_file, BPPARAM = param) |> bind_rows()
 
+files$annotation <- factor(files$annotation)
+files$patient    <- factor(files$patient)
+files$location   <- factor(files$location,
+                           levels = c("NC", "PT", "LN", "ML", "MP", "MO"))
 
+```
 
+## Scatters by location and annotation
+ 
+```{r}
+ggplot(
+  files,
+  aes(y = median_shannon, x = median_avg_meth, color = location, shape = patient)
+) +
+  geom_point(size = 0.9, alpha = 0.2) +
+  scale_color_brewer(palette = "Set1") +
+  scale_shape_manual(values = 1:nlevels(files$patient)) +
+  facet_wrap(~annotation) +
+  labs(
+    x = "median average methylation",
+    y = "median norm Shannon entropy"
+  ) +
+  guides(color = guide_legend(override.aes = list(size = 2))) +
+  theme_ng()
+```
+
+## Scatters by location and annotation and patient
+ 
+```{r, fig.height = 15}
+ggplot(
+  files,
+  aes(y = median_shannon, x = median_avg_meth, color = location)
+) +
+  geom_point(size = 0.9, alpha = 0.2) +
+  ## scale_color_brewer(palette = "Set1") +
+  ## facet_grid(annotation ~ patient) +
+  labs(
+    x = "median average methylation",
+    y = "median norm Shannon entropy"
+  ) +
+  guides(color = guide_legend(override.aes = list(size = 2))) +
+  theme_ng() +
+  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))
+```