From fce1271b66b997f9d5d20d8b4c31c971b42e12b2 Mon Sep 17 00:00:00 2001 From: Izaskun Mallona Date: Wed, 14 Jan 2026 12:49:45 +0100 Subject: [PATCH 1/9] Fix metadata path --- workflow/rules/crc.smk | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/workflow/rules/crc.smk b/workflow/rules/crc.smk index 6b2d8a2..e8b3d20 100644 --- a/workflow/rules/crc.smk +++ b/workflow/rules/crc.smk @@ -416,9 +416,9 @@ rule get_scna_patient1_from_supplementary_data: conda: op.join("..", "envs", "r.yml") input: - op.join("src", "scna_hg19.xlsx"), + op.join(".", "src", "scna_hg19.xlsx"), output: - scna_bed = op.join(HG19_BASE, "patient_crc01_scna.scna.bed.gz") + scna_bed = op.join(HG19_BASE, "patient_crc01_scna.scna.bed.gz.pre") script: "src/parse_scna.R" @@ -426,7 +426,7 @@ rule split_patient1_crc_in_kept_lost_gained: conda: op.join("..", "envs", "r.yml") input: - scna_bed = op.join(HG19_BASE, "patient_crc01_scna.scna.bed.gz"), + scna_bed = op.join(HG19_BASE, "patient_crc01_scna.scna.bed.gz.pre"), genome_sizes = op.join(HG19_BASE, "genome.sizes"), output: uncomp = temp(op.join(HG19_BASE, "crc01_scna.bed")), From cfcb6a5c662ea188027c554e3e9320cf1eab6960 Mon Sep 17 00:00:00 2001 From: Izaskun Mallona Date: Thu, 15 Jan 2026 14:17:01 +0100 Subject: [PATCH 2/9] Update CLI documentation and default names; shorthands are now a bit confusing --- README.md | 112 +++++++++++++------- method/src/boost.cpp | 239 +++++++++++++++++++++---------------------- 2 files changed, 191 insertions(+), 160 deletions(-) diff --git a/README.md b/README.md index 5da46e4..0b31a70 100644 --- a/README.md +++ b/README.md @@ -44,46 +44,82 @@ bash build.sh ## Usage -`yamet` processes (covered CpG) DNA methylation report(s) (`-cell` argument), a reference file listing all CpG positions in a genome (`--reference`), and a bedfile specifying the genomic regions to calculate scores for (`--intervals`; e.g. promoters, genes, etc). Full CLI args: +`yamet` processes (covered CpG) DNA methylation report(s), a reference file listing all CpG positions in a genome, and a bedfile specifying the genomic regions to calculate scores for (e.g. promoters, genes, etc). Full CLI args: ```text -Usage: - yamet (-c ... | ...) \ - -r \ - -i \ - [OPTIONS] - -Required inputs: - -c --cell ... One or more tab-separated methylation files, - OR provide them directly as positional arguments. - ... (Positional alternative) Cell files (same format as above). - -r --reference Reference file, tab-separated and sorted by chromosome/position. - -i --intervals BED file of intervals of interest. - -Optional output: - -d --det-out Path to detailed output file. - -m --meth-out Path to average methylation output file. - -o --out Path to simple output file. - -Resource options: - --cores Number of cores for parallel parsing - [default: 0, implying program decides]. - --chunk-size Buffer size per file (e.g., 64K, 128M, 2G) [default: 64K]. - -Optional input control: - --skip-header[=] Skip lines in all input files [default: 1]. - --skip-header-cell[=] Skip lines in cell files (overrides --skip-header). - --skip-header-reference[=] Skip lines in reference file (overrides --skip-header). - --skip-header-intervals[=] Skip lines in intervals file (overrides --skip-header). - -Verbose/debugging: - --print-intervals Print parsed intervals file. - --print-reference Print parsed reference file. - --print-sampens[=] Print computed sample entropies [default: true]. - -Miscellaneous: - -h --help Show help message. - --version Show version information. +yamet + +input: + -c [ --cytosine_report ] arg per-cell cytosine report file(s) + (resembling Bismark's for covered + cytosines). Synonyms: --cell, -c. + These tab-separated files describe + covered CpGs sorted by chromosome and + position, with columns: + chr pos meth_reads total_reads + rate + -r [ --cytosine_locations ] arg genomic locations of all cytosines + (typically CpGs). Used to account for + missing values in cytosine + reports.Synonyms: --cytosine_locations, + --reference, -r. + Required to reconstruct contiguous CpG + sequences. Columns: + chr pos + -i [ --regions ] arg BED file defining genomic regions where + entropies will be computed (both per + feature and globally)Synonyms: + --regions, --features, --target, + --intervals, -i. + Columns: + chr start end + --skip-header-all [=arg(=1)] Number of header lines to skip in all + input files (default: 1). Synonyms: + --skip-header-all, --skip-header. + --skip-header-cytosine_report [=arg(=1)] + Number of header lines to skip in + cytosine_report/cell files (default: + 1). + --skip-header-cytosine_locations [=arg(=1)] + Number of header lines to skip in + cytosine_locations/reference file + (default: 1). + --skip-header-regions [=arg(=1)] Number of header lines to skip in + regions/features/target/intervals file + (default: 1). + +output: + -d [ --det-out ] arg (optional) path to detailed output file + -n [ --norm-det-out ] arg (optional) path to detailed normalized + output file + -m [ --meth-out ] arg (optional) path to average methylation + output file + --all-meth [=arg(=true)] (=false) if true, include all CpGs in + methylation summaries, including those + not used for template construction + (default: false). + --all-meth [=arg(=true)] (=false) + -o [ --out ] arg (optional) path to simple output file + +resource utilisation: + --cores arg (=0) number of cores used for simultaneously + parsing methylation files + --chunk-size arg (=64K) size of the buffer (per file) used for + reading data. Can be specified as a + positive integer (bytes) or with a + suffix: B (bytes), K (kilobytes), M + (megabytes), G (gigabytes). Example: + 4096, 64K, 128M, 2G + +verbose: + --print-intervals print parsed intervals file + --print-reference print parsed reference file + --print-sampens [=arg(=true)] (=true) print computed sample entropies + +misc: + -h [ --help ] print help message + --version current version information + ``` ## Repository diff --git a/method/src/boost.cpp b/method/src/boost.cpp index 9c69807..a7055a0 100644 --- a/method/src/boost.cpp +++ b/method/src/boost.cpp @@ -10,55 +10,53 @@ #include "boost.h" #include "version.h" +namespace po = boost::program_options; + po::variables_map parseCommandLine(int argc, char **argv) { po::variables_map vm; po::options_description inp("input"); // clang-format off inp.add_options() - ("cell,c", po::value>()->composing(), - "tab separated files, sorted by chromosome and position, for " - "different cells in the following format\n" - "\n" - " chr1 5 0 2 0\n" - " chr1 9 1 1 1\n" - " chr2 2 3 4 1\n" - "\n" - "where the columns are the chromosome, position, number of methylated reads, " - "total number of reads and the rate respectively") - ("reference,r", po::value(), - "tab separated file, sorted by chromosome and position, for " - "reference sites in the following format\n" - "\n" - " chr1 5\n" - " chr1 7\n" - " chr1 9\n" - " chr1 11\n" - " chr2 2\n" - " chr2 4\n" - "\n" - "where the columns are the chromosome and start position respectively") - ("intervals,i", po::value(), - "bed file, sorted by chromosome and start position, for " - "intervals of interest in the following format\n" - "\n" - " chr1 5 7\n" - " chr1 10 30\n" - " chr2 1 6\n" - "\n" - "where the columns are the chromosome, start position and the end position respectively") - ("skip-header", po::value()->implicit_value(1), - "integer value indicating number of lines to skip in all file inputs" - "(this is a default value which can be overriden by other 'skip-header-*' options)") - ("skip-header-cell", po::value()->implicit_value(1), - "integer value indicating number of lines to skip in the cell files" - "(overrides 'skip-header' if provided)") - ("skip-header-reference", po::value()->implicit_value(1), - "integer value indicating number of lines to skip in the reference file" - "(overrides 'skip-header' if provided)") - ("skip-header-intervals", po::value()->implicit_value(1), - "integer value indicating number of lines to skip in the intervals file" - "(overrides 'skip-header' if provided)"); + + ("cytosine_report,cell,c", + po::value>()->composing(), + "Per-cell cytosine report file(s) (Bismark-like for covered cytosines). " + "Synonyms: --cytosine_report, --cell, -c.\n" + "Tab-separated, sorted by chromosome and position:\n" + " chr pos meth_reads total_reads rate") + + ("cytosine_locations,reference,r", + po::value(), + "Genomic locations of all cytosines (typically CpGs). " + "Synonyms: --cytosine_locations, --reference, -r.\n" + "Required to reconstruct contiguous CpG sequences.\n" + "Columns:\n" + " chr pos") + + ("regions,features,target,intervals,i", + po::value(), + "BED file defining genomic regions where entropies will be computed. " + "Synonyms: --regions, --features, --target, --intervals, -i.\n" + "Columns:\n" + " chr start end") + + ("skip-header-all,skip-header", + po::value()->implicit_value(1), + "Header lines to skip in all input files (default: 1). " + "Synonyms: --skip-header-all, --skip-header.") + + ("skip-header-cytosine_report,skip-header-cell", + po::value()->implicit_value(1), + "Header lines to skip in cytosine_report/cell files (default: 1).") + + ("skip-header-cytosine_locations,skip-header-reference", + po::value()->implicit_value(1), + "Header lines to skip in cytosine_locations/reference file (default: 1).") + + ("skip-header-regions,skip-header-features,skip-header-target,skip-header-intervals", + po::value()->implicit_value(1), + "Header lines to skip in regions/features/target/intervals file (default: 1)."); // clang-format on po::options_description out("output"); @@ -67,7 +65,10 @@ po::variables_map parseCommandLine(int argc, char **argv) { ("det-out,d", po::value(), "(optional) path to detailed output file") ("norm-det-out,n", po::value(), "(optional) path to detailed normalized output file") ("meth-out,m", po::value(), "(optional) path to average methylation output file") - ("all-meth", po::value()->default_value("false")->implicit_value("true")) + ("all-meth", + po::value()->default_value("false")->implicit_value("true"), + "If true, include all CpGs in methylation summaries, including those " + "not used for template construction (default: false).") ("out,o", po::value(), "(optional) path to simple output file"); // clang-format on @@ -87,14 +88,13 @@ po::variables_map parseCommandLine(int argc, char **argv) { "number of cores used for simultaneously parsing methylation files") ("chunk-size", po::value()->default_value("64K")->notifier(validate_chunk_size), "size of the buffer (per file) used for reading data. Can be specified as a " - "positive integer (bytes) or with a suffix: B (bytes), K (kilobytes), " - "M (megabytes), G (gigabytes). Example: 4096, 64K, 128M, 2G"); + "positive integer (bytes) or with a suffix: B, K, M, G."); // clang-format on po::options_description mis("misc"); // clang-format off mis.add_options() - ("help,h", "produce help message") + ("help,h", "print help message") ("version", "current version information"); // clang-format on @@ -118,49 +118,71 @@ po::variables_map parseCommandLine(int argc, char **argv) { return vm; } -// Helper get functions for different arguments - std::vector getCellFiles(const po::variables_map &vm) { - if (!vm.count("cell")) { - throw std::system_error(EINVAL, std::generic_category(), "No cell coverage files provided"); - } - return vm["cell"].as>(); + if (vm.count("cell")) + return vm["cell"].as>(); + if (vm.count("cytosine_report")) + return vm["cytosine_report"].as>(); + throw std::system_error(EINVAL, std::generic_category(), + "No cell/cytosine_report files provided"); } std::string getRef(const po::variables_map &vm) { - if (!vm.count("reference")) { - throw std::system_error(EINVAL, std::generic_category(), "No reference file provided"); - } - return vm["reference"].as(); + if (vm.count("reference")) + return vm["reference"].as(); + if (vm.count("cytosine_locations")) + return vm["cytosine_locations"].as(); + throw std::system_error(EINVAL, std::generic_category(), + "No reference/cytosine_locations file provided"); } std::string getIntervals(const po::variables_map &vm) { - if (!vm.count("intervals")) { - throw std::system_error(EINVAL, std::generic_category(), "No intervals file provided"); - } - return vm["intervals"].as(); + if (vm.count("intervals")) + return vm["intervals"].as(); + if (vm.count("regions")) + return vm["regions"].as(); + if (vm.count("features")) + return vm["features"].as(); + if (vm.count("target")) + return vm["target"].as(); + throw std::system_error(EINVAL, std::generic_category(), + "No intervals/regions/features/target file provided"); } unsigned int getSkipHeaderTemplate(const po::variables_map &vm, const std::string &file_type) { - if (vm.count(file_type)) { + if (vm.count(file_type)) return vm[file_type].as(); - } else if (vm.count("skip-header")) { - return vm["skip-header"].as(); - } else { - return 0; - } + if (vm.count("skip-header-all")) + return vm["skip-header-all"].as(); + return 1; } unsigned int getSkipHeaderCell(const po::variables_map &vm) { - return getSkipHeaderTemplate(vm, "skip-header-cell"); + if (vm.count("skip-header-cell")) + return vm["skip-header-cell"].as(); + if (vm.count("skip-header-cytosine_report")) + return vm["skip-header-cytosine_report"].as(); + return getSkipHeaderTemplate(vm, "skip-header-all"); } unsigned int getSkipHeaderReference(const po::variables_map &vm) { - return getSkipHeaderTemplate(vm, "skip-header-reference"); + if (vm.count("skip-header-reference")) + return vm["skip-header-reference"].as(); + if (vm.count("skip-header-cytosine_locations")) + return vm["skip-header-cytosine_locations"].as(); + return getSkipHeaderTemplate(vm, "skip-header-all"); } unsigned int getSkipHeaderIntervals(const po::variables_map &vm) { - return getSkipHeaderTemplate(vm, "skip-header-intervals"); + if (vm.count("skip-header-intervals")) + return vm["skip-header-intervals"].as(); + if (vm.count("skip-header-regions")) + return vm["skip-header-regions"].as(); + if (vm.count("skip-header-features")) + return vm["skip-header-features"].as(); + if (vm.count("skip-header-target")) + return vm["skip-header-target"].as(); + return getSkipHeaderTemplate(vm, "skip-header-all"); } std::string getDetOut(const po::variables_map &vm) { @@ -176,12 +198,8 @@ std::string getMethOut(const po::variables_map &vm) { } bool allMeth(const po::variables_map &vm) { - char t = (char)std::tolower(vm["all-meth"].as()[0]); - if (t == 't' || t == '1') { - return true; - } else { - return false; - } + char t = std::tolower(vm["all-meth"].as()[0]); + return (t == 't' || t == '1'); } std::string getOut(const po::variables_map &vm) { @@ -190,58 +208,42 @@ std::string getOut(const po::variables_map &vm) { unsigned int getCores(const po::variables_map &vm) { const unsigned int max_cores = std::thread::hardware_concurrency(); - const int c = vm["cores"].as(); - if (c == -1) { - return max_cores; - } else if (c == 0) { - return max_cores - (unsigned int)(std::floor(std::log2(max_cores))); - } else { - return c; - } + const int c = vm["cores"].as(); + if (c == -1) return max_cores; + if (c == 0) return max_cores - (unsigned int)(std::floor(std::log2(max_cores))); + return c; } unsigned int getChunkSize(const po::variables_map &vm) { static const std::regex pattern(R"(^(\d+(\.\d+)?)(B|K|M|G)?$)", std::regex::icase); - std::smatch match; - const auto chunk_size = vm["chunk-size"].as(); + std::smatch match; + const auto chunk_size = vm["chunk-size"].as(); std::regex_match(chunk_size, match, pattern); - double num = std::stod(match[1].str()); - char unit = match[3].str().empty() ? 'B' : std::toupper(match[3].str()[0]); + double num = std::stod(match[1].str()); + char unit = match[3].str().empty() ? 'B' : std::toupper(match[3].str()[0]); switch (unit) { - case 'K': - num *= 1024; - break; - case 'M': - num *= 1024 * 1024; - break; - case 'G': - num *= 1024 * 1024 * 1024; - break; + case 'K': num *= 1024; break; + case 'M': num *= 1024 * 1024; break; + case 'G': num *= 1024 * 1024 * 1024; break; } - if (num < 1) { + + if (num < 1) throw std::system_error(EINVAL, std::generic_category(), - "Invalid value " + chunk_size + " for 'chunk_size', less than 1 byte"); - } + "Invalid chunk_size < 1 byte"); + return static_cast(num); } bool printSampens(const po::variables_map &vm) { - char t = (char)std::tolower(vm["print-sampens"].as()[0]); - if (t == 't' || t == '1') { - return true; - } else { - return false; - } + char t = std::tolower(vm["print-sampens"].as()[0]); + return (t == 't' || t == '1'); } -// Validators - void validate_chunk_size(const std::string &chunk_size) { static const std::regex pattern(R"(^(\d+(\.\d+)?)(B|K|M|G)?$)", std::regex::icase); - if (!std::regex_match(chunk_size, pattern)) { throw std::system_error(EINVAL, std::generic_category(), "Invalid format " + chunk_size + " for 'chunk_size'"); @@ -250,17 +252,10 @@ void validate_chunk_size(const std::string &chunk_size) { void validate_cores(const int cores) { const int max_cores = std::thread::hardware_concurrency(); - - if (cores < -1) { - throw std::system_error( - EINVAL, std::generic_category(), - "Invalid value " + std::to_string(cores) + - " for 'cores'. Allowed values are\n -1 : use all cores\n 0 : let the program " - "decide\n > 0 : specify the number of cores\n"); - } else if (cores > max_cores) { + if (cores < -1) throw std::system_error(EINVAL, std::generic_category(), - "Invalid value " + std::to_string(cores) + - " for 'cores', exceeds the maximum number of available cores, " + - std::to_string(max_cores)); - } -} \ No newline at end of file + "Invalid cores value: " + std::to_string(cores)); + if (cores > max_cores) + throw std::system_error(EINVAL, std::generic_category(), + "Requested cores exceed available hardware"); +} From bf8478255f837ebd0482f99a7df342feed7d598d Mon Sep 17 00:00:00 2001 From: Izaskun Mallona Date: Thu, 15 Jan 2026 14:51:46 +0100 Subject: [PATCH 3/9] Rollback default for skip-header (separate and joint) --- method/src/boost.cpp | 40 +++++++++++++--------------------------- 1 file changed, 13 insertions(+), 27 deletions(-) diff --git a/method/src/boost.cpp b/method/src/boost.cpp index a7055a0..594c578 100644 --- a/method/src/boost.cpp +++ b/method/src/boost.cpp @@ -43,20 +43,20 @@ po::variables_map parseCommandLine(int argc, char **argv) { ("skip-header-all,skip-header", po::value()->implicit_value(1), - "Header lines to skip in all input files (default: 1). " + "Header lines to skip in all input files (default: 0). " "Synonyms: --skip-header-all, --skip-header.") ("skip-header-cytosine_report,skip-header-cell", po::value()->implicit_value(1), - "Header lines to skip in cytosine_report/cell files (default: 1).") + "Header lines to skip in cytosine_report/cell files (default: 0).") ("skip-header-cytosine_locations,skip-header-reference", po::value()->implicit_value(1), - "Header lines to skip in cytosine_locations/reference file (default: 1).") + "Header lines to skip in cytosine_locations/reference file (default: 0).") ("skip-header-regions,skip-header-features,skip-header-target,skip-header-intervals", po::value()->implicit_value(1), - "Header lines to skip in regions/features/target/intervals file (default: 1)."); + "Header lines to skip in regions/features/target/intervals file (default: 0)."); // clang-format on po::options_description out("output"); @@ -150,39 +150,25 @@ std::string getIntervals(const po::variables_map &vm) { } unsigned int getSkipHeaderTemplate(const po::variables_map &vm, const std::string &file_type) { - if (vm.count(file_type)) + if (vm.count(file_type)) { return vm[file_type].as(); - if (vm.count("skip-header-all")) - return vm["skip-header-all"].as(); - return 1; + } else if (vm.count("skip-header")) { + return vm["skip-header"].as(); + } else { + return 0; + } } unsigned int getSkipHeaderCell(const po::variables_map &vm) { - if (vm.count("skip-header-cell")) - return vm["skip-header-cell"].as(); - if (vm.count("skip-header-cytosine_report")) - return vm["skip-header-cytosine_report"].as(); - return getSkipHeaderTemplate(vm, "skip-header-all"); + return vm["skip-header-cell"].as(); } unsigned int getSkipHeaderReference(const po::variables_map &vm) { - if (vm.count("skip-header-reference")) - return vm["skip-header-reference"].as(); - if (vm.count("skip-header-cytosine_locations")) - return vm["skip-header-cytosine_locations"].as(); - return getSkipHeaderTemplate(vm, "skip-header-all"); + return vm["skip-header-reference"].as(); } unsigned int getSkipHeaderIntervals(const po::variables_map &vm) { - if (vm.count("skip-header-intervals")) - return vm["skip-header-intervals"].as(); - if (vm.count("skip-header-regions")) - return vm["skip-header-regions"].as(); - if (vm.count("skip-header-features")) - return vm["skip-header-features"].as(); - if (vm.count("skip-header-target")) - return vm["skip-header-target"].as(); - return getSkipHeaderTemplate(vm, "skip-header-all"); + return vm["skip-header-intervals"].as(); } std::string getDetOut(const po::variables_map &vm) { From fe799e013e70f283fafdc733ddc8e01700d97a57 Mon Sep 17 00:00:00 2001 From: Izaskun Mallona Date: Thu, 15 Jan 2026 14:57:12 +0100 Subject: [PATCH 4/9] Update getter --- method/src/boost.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/method/src/boost.cpp b/method/src/boost.cpp index 594c578..2b39d25 100644 --- a/method/src/boost.cpp +++ b/method/src/boost.cpp @@ -160,15 +160,15 @@ unsigned int getSkipHeaderTemplate(const po::variables_map &vm, const std::strin } unsigned int getSkipHeaderCell(const po::variables_map &vm) { - return vm["skip-header-cell"].as(); + return getSkipHeaderTemplate(vm, "skip-header-cell"); } unsigned int getSkipHeaderReference(const po::variables_map &vm) { - return vm["skip-header-reference"].as(); + return getSkipHeaderTemplate(vm, "skip-header-reference"); } unsigned int getSkipHeaderIntervals(const po::variables_map &vm) { - return vm["skip-header-intervals"].as(); + return getSkipHeaderTemplate(vm, "skip-header-intervals"); } std::string getDetOut(const po::variables_map &vm) { From 7fed06a8fdda01b3d33a830fa101c9d789192008 Mon Sep 17 00:00:00 2001 From: Izaskun Mallona Date: Fri, 30 Jan 2026 08:59:17 +0100 Subject: [PATCH 5/9] Download sines/lines and add them to the crc featureset --- workflow/rules/crc.smk | 2 ++ workflow/rules/hg19.smk | 17 +++++++++++++++++ workflow/rules/mm10.smk | 21 +++++++++++++++++++-- workflow/rules/src/download_hg19_lines.sh | 12 ++++++++++++ workflow/rules/src/download_hg19_sines.sh | 12 ++++++++++++ workflow/rules/src/download_mm10_lines.sh | 12 ++++++++++++ workflow/rules/src/download_mm10_sines.sh | 12 ++++++++++++ 7 files changed, 86 insertions(+), 2 deletions(-) create mode 100644 workflow/rules/src/download_hg19_lines.sh create mode 100644 workflow/rules/src/download_hg19_sines.sh create mode 100644 workflow/rules/src/download_mm10_lines.sh create mode 100644 workflow/rules/src/download_mm10_sines.sh diff --git a/workflow/rules/crc.smk b/workflow/rules/crc.smk index e8b3d20..e2ada93 100644 --- a/workflow/rules/crc.smk +++ b/workflow/rules/crc.smk @@ -45,6 +45,8 @@ ANNOTATIONS = { "chip": ["H3K27me3", "H3K9me3", "H3K4me3"], "lad": ["laminb1"], "genes": ["genes"], + "lines": ["lines"], + "sines": ['sines'], "cpgIslandExt": ['cpgIslandExt'], "scna": ["crc01_nc_scna", "crc01_gain_scna", "crc01_lost_scna"] } diff --git a/workflow/rules/hg19.smk b/workflow/rules/hg19.smk index 43b4800..a506d16 100644 --- a/workflow/rules/hg19.smk +++ b/workflow/rules/hg19.smk @@ -43,6 +43,23 @@ rule build_genome_hg19_ref: "src/make_ref.sh" +rule get_hg19_lines: + conda: + op.join("..", "envs", "processing.yml") + output: + op.join(HG19_BASE, "lines.bed.gz"), + script: + "src/download_hg19_lines.sh" + + +rule get_hg19_sines: + conda: + op.join("..", "envs", "processing.yml") + output: + op.join(HG19_BASE, "sines.bed.gz"), + script: + "src/download_hg19_sines.sh" + rule get_genes_hg19: conda: op.join("..", "envs", "processing.yml") diff --git a/workflow/rules/mm10.smk b/workflow/rules/mm10.smk index 2d730ed..7e09ec1 100644 --- a/workflow/rules/mm10.smk +++ b/workflow/rules/mm10.smk @@ -6,7 +6,7 @@ CHRS = [str(i) for i in range(1, 20)] + ["X", "Y"] MM10_BASE = "mm10" -rule mm10_chr_ref: +rule mm10_per_chr_ref: conda: op.join("..", "envs", "processing.yml") output: @@ -18,7 +18,7 @@ rule mm10_chr_ref: "src/get_chr_ref.sh" -rule mm10_ref: +rule mm10_aggregate_ref: input: expand(op.join(MM10_BASE, "{chr}.{{meth_pat}}.ref"), chr=CHRS), params: @@ -46,7 +46,24 @@ rule get_mm10_genes: script: "src/download_mm10_genes.sh" +rule get_mm10_lines: + conda: + op.join("..", "envs", "processing.yml") + output: + op.join(MM10_BASE, "lines.bed.gz"), + script: + "src/download_hg19_lines.sh" + + +rule get_mm10_sines: + conda: + op.join("..", "envs", "processing.yml") + output: + op.join(MM10_BASE, "sines.bed.gz"), + script: + "src/download_mm10_sines.sh" + ENCODE_MAP = { "h3k4me3": "ENCFF160SCR", "h3k9me3": "ENCFF658QTP", diff --git a/workflow/rules/src/download_hg19_lines.sh b/workflow/rules/src/download_hg19_lines.sh new file mode 100644 index 0000000..a418bc6 --- /dev/null +++ b/workflow/rules/src/download_hg19_lines.sh @@ -0,0 +1,12 @@ +#!/bin/bash +## +## Downloads RepeatMasker (rmsk) for hg19 and extracts only SINE elements +## + +mysql --user=genome --host=genome-mysql.cse.ucsc.edu -N -s -e \ + 'SELECT genoName, genoStart, genoEnd, repName, repClass, repFamily + FROM hg19.rmsk + WHERE repClass = "LINE";' | + awk 'BEGIN{OFS="\t"} {print $1, $2, $3, $4, $5, $6}' | + bedtools merge -i - | + gzip -c > ${snakemake_output[0]} diff --git a/workflow/rules/src/download_hg19_sines.sh b/workflow/rules/src/download_hg19_sines.sh new file mode 100644 index 0000000..fb8689b --- /dev/null +++ b/workflow/rules/src/download_hg19_sines.sh @@ -0,0 +1,12 @@ +#!/bin/bash +## +## Downloads RepeatMasker (rmsk) for hg19 and extracts only SINE elements +## + +mysql --user=genome --host=genome-mysql.cse.ucsc.edu -N -s -e \ + 'SELECT genoName, genoStart, genoEnd, repName, repClass, repFamily + FROM hg19.rmsk + WHERE repClass = "SINE";' | + awk 'BEGIN{OFS="\t"} {print $1, $2, $3, $4, $5, $6}' | + bedtools merge -i - | + gzip -c > ${snakemake_output[0]} diff --git a/workflow/rules/src/download_mm10_lines.sh b/workflow/rules/src/download_mm10_lines.sh new file mode 100644 index 0000000..8ea0e27 --- /dev/null +++ b/workflow/rules/src/download_mm10_lines.sh @@ -0,0 +1,12 @@ +#!/bin/bash +## +## Downloads RepeatMasker (rmsk) for mm10 and extracts only SINE elements +## + +mysql --user=genome --host=genome-mysql.cse.ucsc.edu -N -s -e \ + 'SELECT genoName, genoStart, genoEnd, repName, repClass, repFamily + FROM mm10.rmsk + WHERE repClass = "LINE";' | + awk 'BEGIN{OFS="\t"} {print $1, $2, $3, $4, $5, $6}' | + bedtools merge -i - | + gzip -c > ${snakemake_output[0]} diff --git a/workflow/rules/src/download_mm10_sines.sh b/workflow/rules/src/download_mm10_sines.sh new file mode 100644 index 0000000..c618543 --- /dev/null +++ b/workflow/rules/src/download_mm10_sines.sh @@ -0,0 +1,12 @@ +#!/bin/bash +## +## Downloads RepeatMasker (rmsk) for mm10 and extracts only SINE elements +## + +mysql --user=genome --host=genome-mysql.cse.ucsc.edu -N -s -e \ + 'SELECT genoName, genoStart, genoEnd, repName, repClass, repFamily + FROM mm10.rmsk + WHERE repClass = "SINE";' | + awk 'BEGIN{OFS="\t"} {print $1, $2, $3, $4, $5, $6}' | + bedtools merge -i - | + gzip -c > ${snakemake_output[0]} From cfec1e77c024744fae8ec7bee7e05de1304c758b Mon Sep 17 00:00:00 2001 From: Izaskun Mallona Date: Mon, 2 Feb 2026 08:26:58 +0100 Subject: [PATCH 6/9] Restore old CLI as in master --- README.md | 50 +++++------ method/src/boost.cpp | 207 +++++++++++++++++++++++-------------------- 2 files changed, 136 insertions(+), 121 deletions(-) diff --git a/README.md b/README.md index 0b31a70..a81e9e9 100644 --- a/README.md +++ b/README.md @@ -50,43 +50,41 @@ bash build.sh yamet input: - -c [ --cytosine_report ] arg per-cell cytosine report file(s) - (resembling Bismark's for covered - cytosines). Synonyms: --cell, -c. - These tab-separated files describe - covered CpGs sorted by chromosome and - position, with columns: + -c [ --cytosine_report ] arg Per-cell cytosine report file(s) + (Bismark-like for covered cytosines). + Synonyms: --cytosine_report, --cell, + -c. + Tab-separated, sorted by chromosome and + position: chr pos meth_reads total_reads rate - -r [ --cytosine_locations ] arg genomic locations of all cytosines - (typically CpGs). Used to account for - missing values in cytosine - reports.Synonyms: --cytosine_locations, - --reference, -r. + -r [ --cytosine_locations ] arg Genomic locations of all cytosines + (typically CpGs). Synonyms: + --cytosine_locations, --reference, -r. Required to reconstruct contiguous CpG - sequences. Columns: + sequences. + Columns: chr pos -i [ --regions ] arg BED file defining genomic regions where - entropies will be computed (both per - feature and globally)Synonyms: + entropies will be computed. Synonyms: --regions, --features, --target, --intervals, -i. Columns: chr start end - --skip-header-all [=arg(=1)] Number of header lines to skip in all - input files (default: 1). Synonyms: + --skip-header-all [=arg(=1)] Header lines to skip in all input files + (default: 0). Synonyms: --skip-header-all, --skip-header. --skip-header-cytosine_report [=arg(=1)] - Number of header lines to skip in + Header lines to skip in cytosine_report/cell files (default: - 1). + 0). --skip-header-cytosine_locations [=arg(=1)] - Number of header lines to skip in + Header lines to skip in cytosine_locations/reference file - (default: 1). - --skip-header-regions [=arg(=1)] Number of header lines to skip in + (default: 0). + --skip-header-regions [=arg(=1)] Header lines to skip in regions/features/target/intervals file - (default: 1). + (default: 0). output: -d [ --det-out ] arg (optional) path to detailed output file @@ -94,11 +92,10 @@ output: output file -m [ --meth-out ] arg (optional) path to average methylation output file - --all-meth [=arg(=true)] (=false) if true, include all CpGs in + --all-meth [=arg(=true)] (=false) If true, include all CpGs in methylation summaries, including those not used for template construction (default: false). - --all-meth [=arg(=true)] (=false) -o [ --out ] arg (optional) path to simple output file resource utilisation: @@ -107,9 +104,7 @@ resource utilisation: --chunk-size arg (=64K) size of the buffer (per file) used for reading data. Can be specified as a positive integer (bytes) or with a - suffix: B (bytes), K (kilobytes), M - (megabytes), G (gigabytes). Example: - 4096, 64K, 128M, 2G + suffix: B, K, M, G. verbose: --print-intervals print parsed intervals file @@ -120,6 +115,7 @@ misc: -h [ --help ] print help message --version current version information + ``` ## Repository diff --git a/method/src/boost.cpp b/method/src/boost.cpp index 2b39d25..9c69807 100644 --- a/method/src/boost.cpp +++ b/method/src/boost.cpp @@ -10,53 +10,55 @@ #include "boost.h" #include "version.h" -namespace po = boost::program_options; - po::variables_map parseCommandLine(int argc, char **argv) { po::variables_map vm; po::options_description inp("input"); // clang-format off inp.add_options() - - ("cytosine_report,cell,c", - po::value>()->composing(), - "Per-cell cytosine report file(s) (Bismark-like for covered cytosines). " - "Synonyms: --cytosine_report, --cell, -c.\n" - "Tab-separated, sorted by chromosome and position:\n" - " chr pos meth_reads total_reads rate") - - ("cytosine_locations,reference,r", - po::value(), - "Genomic locations of all cytosines (typically CpGs). " - "Synonyms: --cytosine_locations, --reference, -r.\n" - "Required to reconstruct contiguous CpG sequences.\n" - "Columns:\n" - " chr pos") - - ("regions,features,target,intervals,i", - po::value(), - "BED file defining genomic regions where entropies will be computed. " - "Synonyms: --regions, --features, --target, --intervals, -i.\n" - "Columns:\n" - " chr start end") - - ("skip-header-all,skip-header", - po::value()->implicit_value(1), - "Header lines to skip in all input files (default: 0). " - "Synonyms: --skip-header-all, --skip-header.") - - ("skip-header-cytosine_report,skip-header-cell", - po::value()->implicit_value(1), - "Header lines to skip in cytosine_report/cell files (default: 0).") - - ("skip-header-cytosine_locations,skip-header-reference", - po::value()->implicit_value(1), - "Header lines to skip in cytosine_locations/reference file (default: 0).") - - ("skip-header-regions,skip-header-features,skip-header-target,skip-header-intervals", - po::value()->implicit_value(1), - "Header lines to skip in regions/features/target/intervals file (default: 0)."); + ("cell,c", po::value>()->composing(), + "tab separated files, sorted by chromosome and position, for " + "different cells in the following format\n" + "\n" + " chr1 5 0 2 0\n" + " chr1 9 1 1 1\n" + " chr2 2 3 4 1\n" + "\n" + "where the columns are the chromosome, position, number of methylated reads, " + "total number of reads and the rate respectively") + ("reference,r", po::value(), + "tab separated file, sorted by chromosome and position, for " + "reference sites in the following format\n" + "\n" + " chr1 5\n" + " chr1 7\n" + " chr1 9\n" + " chr1 11\n" + " chr2 2\n" + " chr2 4\n" + "\n" + "where the columns are the chromosome and start position respectively") + ("intervals,i", po::value(), + "bed file, sorted by chromosome and start position, for " + "intervals of interest in the following format\n" + "\n" + " chr1 5 7\n" + " chr1 10 30\n" + " chr2 1 6\n" + "\n" + "where the columns are the chromosome, start position and the end position respectively") + ("skip-header", po::value()->implicit_value(1), + "integer value indicating number of lines to skip in all file inputs" + "(this is a default value which can be overriden by other 'skip-header-*' options)") + ("skip-header-cell", po::value()->implicit_value(1), + "integer value indicating number of lines to skip in the cell files" + "(overrides 'skip-header' if provided)") + ("skip-header-reference", po::value()->implicit_value(1), + "integer value indicating number of lines to skip in the reference file" + "(overrides 'skip-header' if provided)") + ("skip-header-intervals", po::value()->implicit_value(1), + "integer value indicating number of lines to skip in the intervals file" + "(overrides 'skip-header' if provided)"); // clang-format on po::options_description out("output"); @@ -65,10 +67,7 @@ po::variables_map parseCommandLine(int argc, char **argv) { ("det-out,d", po::value(), "(optional) path to detailed output file") ("norm-det-out,n", po::value(), "(optional) path to detailed normalized output file") ("meth-out,m", po::value(), "(optional) path to average methylation output file") - ("all-meth", - po::value()->default_value("false")->implicit_value("true"), - "If true, include all CpGs in methylation summaries, including those " - "not used for template construction (default: false).") + ("all-meth", po::value()->default_value("false")->implicit_value("true")) ("out,o", po::value(), "(optional) path to simple output file"); // clang-format on @@ -88,13 +87,14 @@ po::variables_map parseCommandLine(int argc, char **argv) { "number of cores used for simultaneously parsing methylation files") ("chunk-size", po::value()->default_value("64K")->notifier(validate_chunk_size), "size of the buffer (per file) used for reading data. Can be specified as a " - "positive integer (bytes) or with a suffix: B, K, M, G."); + "positive integer (bytes) or with a suffix: B (bytes), K (kilobytes), " + "M (megabytes), G (gigabytes). Example: 4096, 64K, 128M, 2G"); // clang-format on po::options_description mis("misc"); // clang-format off mis.add_options() - ("help,h", "print help message") + ("help,h", "produce help message") ("version", "current version information"); // clang-format on @@ -118,35 +118,27 @@ po::variables_map parseCommandLine(int argc, char **argv) { return vm; } +// Helper get functions for different arguments + std::vector getCellFiles(const po::variables_map &vm) { - if (vm.count("cell")) - return vm["cell"].as>(); - if (vm.count("cytosine_report")) - return vm["cytosine_report"].as>(); - throw std::system_error(EINVAL, std::generic_category(), - "No cell/cytosine_report files provided"); + if (!vm.count("cell")) { + throw std::system_error(EINVAL, std::generic_category(), "No cell coverage files provided"); + } + return vm["cell"].as>(); } std::string getRef(const po::variables_map &vm) { - if (vm.count("reference")) - return vm["reference"].as(); - if (vm.count("cytosine_locations")) - return vm["cytosine_locations"].as(); - throw std::system_error(EINVAL, std::generic_category(), - "No reference/cytosine_locations file provided"); + if (!vm.count("reference")) { + throw std::system_error(EINVAL, std::generic_category(), "No reference file provided"); + } + return vm["reference"].as(); } std::string getIntervals(const po::variables_map &vm) { - if (vm.count("intervals")) - return vm["intervals"].as(); - if (vm.count("regions")) - return vm["regions"].as(); - if (vm.count("features")) - return vm["features"].as(); - if (vm.count("target")) - return vm["target"].as(); - throw std::system_error(EINVAL, std::generic_category(), - "No intervals/regions/features/target file provided"); + if (!vm.count("intervals")) { + throw std::system_error(EINVAL, std::generic_category(), "No intervals file provided"); + } + return vm["intervals"].as(); } unsigned int getSkipHeaderTemplate(const po::variables_map &vm, const std::string &file_type) { @@ -184,8 +176,12 @@ std::string getMethOut(const po::variables_map &vm) { } bool allMeth(const po::variables_map &vm) { - char t = std::tolower(vm["all-meth"].as()[0]); - return (t == 't' || t == '1'); + char t = (char)std::tolower(vm["all-meth"].as()[0]); + if (t == 't' || t == '1') { + return true; + } else { + return false; + } } std::string getOut(const po::variables_map &vm) { @@ -194,42 +190,58 @@ std::string getOut(const po::variables_map &vm) { unsigned int getCores(const po::variables_map &vm) { const unsigned int max_cores = std::thread::hardware_concurrency(); - const int c = vm["cores"].as(); - if (c == -1) return max_cores; - if (c == 0) return max_cores - (unsigned int)(std::floor(std::log2(max_cores))); - return c; + const int c = vm["cores"].as(); + if (c == -1) { + return max_cores; + } else if (c == 0) { + return max_cores - (unsigned int)(std::floor(std::log2(max_cores))); + } else { + return c; + } } unsigned int getChunkSize(const po::variables_map &vm) { static const std::regex pattern(R"(^(\d+(\.\d+)?)(B|K|M|G)?$)", std::regex::icase); - std::smatch match; - const auto chunk_size = vm["chunk-size"].as(); + std::smatch match; + const auto chunk_size = vm["chunk-size"].as(); std::regex_match(chunk_size, match, pattern); - double num = std::stod(match[1].str()); - char unit = match[3].str().empty() ? 'B' : std::toupper(match[3].str()[0]); + double num = std::stod(match[1].str()); + char unit = match[3].str().empty() ? 'B' : std::toupper(match[3].str()[0]); switch (unit) { - case 'K': num *= 1024; break; - case 'M': num *= 1024 * 1024; break; - case 'G': num *= 1024 * 1024 * 1024; break; + case 'K': + num *= 1024; + break; + case 'M': + num *= 1024 * 1024; + break; + case 'G': + num *= 1024 * 1024 * 1024; + break; } - - if (num < 1) + if (num < 1) { throw std::system_error(EINVAL, std::generic_category(), - "Invalid chunk_size < 1 byte"); - + "Invalid value " + chunk_size + " for 'chunk_size', less than 1 byte"); + } return static_cast(num); } bool printSampens(const po::variables_map &vm) { - char t = std::tolower(vm["print-sampens"].as()[0]); - return (t == 't' || t == '1'); + char t = (char)std::tolower(vm["print-sampens"].as()[0]); + if (t == 't' || t == '1') { + return true; + } else { + return false; + } } +// Validators + void validate_chunk_size(const std::string &chunk_size) { static const std::regex pattern(R"(^(\d+(\.\d+)?)(B|K|M|G)?$)", std::regex::icase); + if (!std::regex_match(chunk_size, pattern)) { throw std::system_error(EINVAL, std::generic_category(), "Invalid format " + chunk_size + " for 'chunk_size'"); @@ -238,10 +250,17 @@ void validate_chunk_size(const std::string &chunk_size) { void validate_cores(const int cores) { const int max_cores = std::thread::hardware_concurrency(); - if (cores < -1) - throw std::system_error(EINVAL, std::generic_category(), - "Invalid cores value: " + std::to_string(cores)); - if (cores > max_cores) + + if (cores < -1) { + throw std::system_error( + EINVAL, std::generic_category(), + "Invalid value " + std::to_string(cores) + + " for 'cores'. Allowed values are\n -1 : use all cores\n 0 : let the program " + "decide\n > 0 : specify the number of cores\n"); + } else if (cores > max_cores) { throw std::system_error(EINVAL, std::generic_category(), - "Requested cores exceed available hardware"); -} + "Invalid value " + std::to_string(cores) + + " for 'cores', exceeds the maximum number of available cores, " + + std::to_string(max_cores)); + } +} \ No newline at end of file From 007bf10c3f4cad5adce16a2ccc45705a2962caa8 Mon Sep 17 00:00:00 2001 From: Izaskun Mallona Date: Mon, 2 Feb 2026 09:34:03 +0100 Subject: [PATCH 7/9] Switch to normalized (sample?) entropies and hopefully also Shannons, fix rmsk files --- workflow/Snakefile | 8 ++++++-- workflow/rules/crc.smk | 14 ++++++++++---- workflow/rules/hg19.smk | 14 ++++++++++++-- 3 files changed, 28 insertions(+), 8 deletions(-) diff --git a/workflow/Snakefile b/workflow/Snakefile index 832011d..311a818 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -34,7 +34,7 @@ rule all: # op.join("annotation", "mm10", "done.flag"), ## this to download regions for Ecker's # op.join("data", "crc", "download.flag"), # this to download CRC cytosine reports, caution will redownload them # op.join("data", "crc", "results", "crc.html") , ## this requires the `download.flag` being present (and the cytosine reports; dependencies are not well captured) - # op.join(op.expanduser("~"), ".local", "yamet", "bin", "yamet") + op.join(op.expanduser("~"), ".local", "yamet", "bin", "yamet"), # op.join('scnmt_gastrulation_data', 'downloaded_data.tsv'), ## this is Argelaguet's gastrulation # expand( # op.join( @@ -58,12 +58,16 @@ rule all: rule install_yamet: """ - Binary to goes to ~/.local/yamet; export path accordingly + Binary to goes to ~/.local/yamet; exported path accordingly """ conda: "envs/yamet.yml" output: op.join(op.expanduser("~"), ".local", "yamet", "bin", "yamet") + priority: + 100 + threads: + workflow.cores log: op.join("logs", "build.log") shell: diff --git a/workflow/rules/crc.smk b/workflow/rules/crc.smk index e2ada93..3197786 100644 --- a/workflow/rules/crc.smk +++ b/workflow/rules/crc.smk @@ -226,10 +226,12 @@ rule run_yamet_on_separate_features: output: simple_uncomp = temp(op.join(CRC_OUTPUT, "{subcat}_{cat}_{patient}_{location}.out")), det_uncomp = temp(op.join(CRC_OUTPUT, "{subcat}_{cat}_{patient}_{location}.det.out")), + norm_det_uncomp=temp(op.join(CRC_WINDOWS_OUTPUT, "{subcat}_{cat}_{patient}_{location}.norm.det.out")), meth_uncomp = temp(op.join(CRC_OUTPUT, "{subcat}_{cat}_{patient}_{location}.meth.out")), simple=op.join(CRC_OUTPUT, "{subcat}_{cat}_{patient}_{location}.out.gz"), det=op.join(CRC_OUTPUT, "{subcat}_{cat}_{patient}_{location}.det.out.gz"), - meth=op.join(CRC_OUTPUT, "{subcat}_{cat}_{patient}_{location}.meth.out.gz") + meth=op.join(CRC_OUTPUT, "{subcat}_{cat}_{patient}_{location}.meth.out.gz"), + norm_det=op.join(CRC_OUTPUT, "{subcat}_{cat}_{patient}_{location}.norm.det.out.gz") log: op.join('logs', 'yamet_{subcat}_{cat}_{patient}_{location}.log') group: @@ -248,7 +250,8 @@ rule run_yamet_on_separate_features: --print-sampens F \ --out {output.simple_uncomp} \ --det-out {output.det_uncomp} \ - --meth-out {output.meth_uncomp} &> {log} + --meth-out {output.meth_uncomp} \ + --norm-det-out {output.norm_det_uncomp} &> {log} gzip --keep -f {params.path}/{wildcards.subcat}_{wildcards.cat}_{wildcards.patient}_{wildcards.location}*out &>> {log} """ @@ -378,9 +381,11 @@ rule run_yamet_on_windows: simple_uncomp=temp(op.join(CRC_WINDOWS_OUTPUT, "{win_size}_{patient}_{location}.out")), det_uncomp=temp(op.join(CRC_WINDOWS_OUTPUT, "{win_size}_{patient}_{location}.det.out")), meth_uncomp=temp(op.join(CRC_WINDOWS_OUTPUT, "{win_size}_{patient}_{location}.meth.out")), + norm_det_uncomp=temp(op.join(CRC_WINDOWS_OUTPUT, "{win_size}_{patient}_{location}.norm.det.out")), simple=op.join(CRC_WINDOWS_OUTPUT, "{win_size}_{patient}_{location}.out.gz"), det=op.join(CRC_WINDOWS_OUTPUT, "{win_size}_{patient}_{location}.det.out.gz"), - meth=op.join(CRC_WINDOWS_OUTPUT, "{win_size}_{patient}_{location}.meth.out.gz") + meth=op.join(CRC_WINDOWS_OUTPUT, "{win_size}_{patient}_{location}.meth.out.gz"), + norm_det=op.join(CRC_WINDOWS_OUTPUT, "{win_size}_{patient}_{location}.norm.det.out.gz") log: op.join('logs', 'yamet_{win_size}_{patient}_{location}.log') group: @@ -399,7 +404,8 @@ rule run_yamet_on_windows: --print-sampens F \ --out {output.simple_uncomp} \ --det-out {output.det_uncomp} \ - --meth-out {output.meth_uncomp} &> {log} + --meth-out {output.meth_uncomp} \ + --norm-det-out {output.norm_det_uncomp} &> {log} gzip --keep -f {params.path}/{wildcards.win_size}_{wildcards.patient}_{wildcards.location}*out &>> {log} """ diff --git a/workflow/rules/hg19.smk b/workflow/rules/hg19.smk index a506d16..85f36a0 100644 --- a/workflow/rules/hg19.smk +++ b/workflow/rules/hg19.smk @@ -47,7 +47,7 @@ rule get_hg19_lines: conda: op.join("..", "envs", "processing.yml") output: - op.join(HG19_BASE, "lines.bed.gz"), + op.join(HG19_BASE, "rmsk.lines.bed.gz"), script: "src/download_hg19_lines.sh" @@ -56,7 +56,7 @@ rule get_hg19_sines: conda: op.join("..", "envs", "processing.yml") output: - op.join(HG19_BASE, "sines.bed.gz"), + op.join(HG19_BASE, "rmsk.sines.bed.gz"), script: "src/download_hg19_sines.sh" @@ -68,6 +68,16 @@ rule get_genes_hg19: script: "src/download_hg19_genes.sh" +rule uncompress_repeats: + conda: + op.join("..", "envs", "processing.yml") + input: + op.join(HG19_BASE, "rmsk.{type}.bed.gz"), + output: + op.join(HG19_BASE, "{type}.{type}.bed"), + shell: + "gunzip -c {input} > {output}" + rule uncompress_hg19_genes: conda: From 38914cf20fada5153f76e2c61ee5f10e0f5aed90 Mon Sep 17 00:00:00 2001 From: Izaskun Mallona Date: Mon, 2 Feb 2026 14:10:22 +0100 Subject: [PATCH 8/9] Fix output --- workflow/rules/crc.smk | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflow/rules/crc.smk b/workflow/rules/crc.smk index 3197786..1ba5689 100644 --- a/workflow/rules/crc.smk +++ b/workflow/rules/crc.smk @@ -226,7 +226,7 @@ rule run_yamet_on_separate_features: output: simple_uncomp = temp(op.join(CRC_OUTPUT, "{subcat}_{cat}_{patient}_{location}.out")), det_uncomp = temp(op.join(CRC_OUTPUT, "{subcat}_{cat}_{patient}_{location}.det.out")), - norm_det_uncomp=temp(op.join(CRC_WINDOWS_OUTPUT, "{subcat}_{cat}_{patient}_{location}.norm.det.out")), + norm_det_uncomp=temp(op.join(CRC_OUTPUT, "{subcat}_{cat}_{patient}_{location}.norm.det.out")), meth_uncomp = temp(op.join(CRC_OUTPUT, "{subcat}_{cat}_{patient}_{location}.meth.out")), simple=op.join(CRC_OUTPUT, "{subcat}_{cat}_{patient}_{location}.out.gz"), det=op.join(CRC_OUTPUT, "{subcat}_{cat}_{patient}_{location}.det.out.gz"), From 80275c405630e912d1adee83488dbbd31068cae5 Mon Sep 17 00:00:00 2001 From: Izaskun Mallona Date: Mon, 2 Feb 2026 16:53:16 +0100 Subject: [PATCH 9/9] Add normalized data, and Shannon entropies, to the per-feature plots --- workflow/rules/crc.smk | 2 + workflow/rules/src/crc.Rmd | 463 +++++++++++++++++++++++++------------ 2 files changed, 323 insertions(+), 142 deletions(-) diff --git a/workflow/rules/crc.smk b/workflow/rules/crc.smk index 1ba5689..080327b 100644 --- a/workflow/rules/crc.smk +++ b/workflow/rules/crc.smk @@ -272,6 +272,8 @@ rule render_crc_report: list_relevant_yamet_outputs() params: output_path=CRC_OUTPUT + threads: + round(workflow.cores/2) output: op.join(CRC, "results", "crc.html") log: diff --git a/workflow/rules/src/crc.Rmd b/workflow/rules/src/crc.Rmd index 4344c1d..2f3b503 100644 --- a/workflow/rules/src/crc.Rmd +++ b/workflow/rules/src/crc.Rmd @@ -49,6 +49,7 @@ opts_chunk$set( ```{r save_snmk_object_for_debugging, include = FALSE} saveRDS(file = 'snmk_object_features.rds', object = snakemake) +## snakemake <- readRDS('../../snmk_object_features.rds') ``` @@ -58,11 +59,9 @@ sink(log) sink(log, type = "message") ``` -# Short (global) reports by annotation {.tabset .tabset-pills} -## Modelling -Firstly, we read in the yamet output files and create a table storing the biopsy location, patient id and genomic segmentation. +These are not det.out.gz, but only out.gz files. ```{r, import_short_reports} filepaths <- list.files( @@ -71,9 +70,9 @@ filepaths <- list.files( ) filepaths <- filepaths[!grepl("\\.det\\.out\\.gz$", filepaths)] -filepaths <- filepaths[!grepl("\\.norm\\.out\\.gz$", filepaths)] filepaths <- filepaths[!grepl("\\.meth\\.out\\.gz$", filepaths)] + regex <- "(.*)_(.*)_(.*)_(.*).out.gz" process_file <- function(filepath) { filename <- basename(filepath) @@ -98,17 +97,6 @@ files$location <- factor( ) ``` -Patient as random effect - -```{r, lmer} -lmer.files <- lmer( - sampen ~ location * avg_meth + (1 | patient), - data = subset(files, annotation == "pmds") -) -summary(lmer.files) -anova(lmer.files, type = 3) -``` - ## Scatters by location and annotation ```{r} @@ -262,169 +250,360 @@ ggplot(ranges, aes(color = location)) + ``` -## Others +# After normalization - same plots (the yamet-way) -```{r, fig.height = 10, fig.width = 10} +That is, not by locus but by cell norm sampEns (that are calculated according to some regionset anyway) -ggplot(ranges, aes(x = compartment, y = location, fill = sampen_mean)) + - geom_tile(color = "white") + - scale_fill_viridis_c() + - theme_minimal() + - theme(axis.text.x = element_text(angle = 90)) +```{r, import_short_reports_norm} +filepaths <- list.files( + snakemake@params[["output_path"]], + pattern = "out\\.gz$", full.names = TRUE) -ggplot(ranges, aes(x = compartment, y = location, fill = meth_mean)) + - geom_tile(color = "white") + - scale_fill_viridis_c() + - theme_minimal() + - theme(axis.text.x = element_text(angle = 90)) +filepaths <- filepaths[!grepl("\\.det\\.out\\.gz$", filepaths)] +filepaths <- filepaths[!grepl("\\.norm\\.out\\.gz$", filepaths)] +filepaths <- filepaths[!grepl("\\.meth\\.out\\.gz$", filepaths)] + +head(filepaths) +regex <- "(.*)_(.*)_(.*)_(.*).out.gz" +process_file <- function(filepath) { + filename <- basename(filepath) + + dt <- read_tsv(filepath, na = "-1", show_col_types = FALSE) + dt <- dt[, c("sampen_norm", "avg_meth")] + colnames(dt) <- c('sampen', 'avg_meth') ## caution they're normalized though + + dt$annotation <- gsub(regex, "\\1", filename) + dt$patient <- gsub(regex, "\\3", filename) + dt$location <- gsub(regex, "\\4", filename) + + return(dt) +} + +head(filepaths) + +files <- do.call(rbind, bplapply(filepaths, process_file, BPPARAM = param)) + +files$annotation <- as.factor(files$annotation) +files$patient <- as.factor(files$patient) +files$location <- factor( + files$location, + levels = c("NC", "PT", "LN", "ML", "MP", "MO") +) +``` + +## Scatters by location and annotation + +```{r} +ggplot( + ## subset(files, patient == "CRC01"), + files, + aes(y = sampen, x = avg_meth, color = location, shape = patient) +) + + geom_point(size = 0.9, alpha = 0.2) + + scale_color_brewer(palette = "Set1") + + scale_shape_manual(values = 1:nlevels(files$patient)) + + facet_wrap(~annotation) + + labs( + x = "average methylation", + y = "norm sample entropy" + ) + + guides(color = guide_legend(override.aes = list(size = 2))) + + theme_ng() +``` +## Scatters by location and annotation and patient + +```{r, fig.height = 15} +ggplot( + ## subset(files, patient == "CRC01"), + files, + aes(y = sampen, x = avg_meth, color = location) +) + + geom_point(size = 0.9, alpha = 0.2) + + scale_color_brewer(palette = "Set1") + + facet_grid(annotation ~ patient) + + labs( + x = "average methylation", + y = "norm sample entropy" + ) + + guides(color = guide_legend(override.aes = list(size = 2))) + + theme_ng() + + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) ``` -# Full (local) reports for **downsampled** PMDs and CRC01 +## Scatters by location and annotation and patient (shorter) + +```{r, fig.height = 3.2} +ggplot( + subset(files, annotation %in% c("laminb1", "H3K27me3", "H3K9me3")), + aes(y = sampen, x = avg_meth, color = location) +) + + geom_point(size = 0.9, alpha = 0.2) + + scale_color_brewer(palette = "Set1") + + facet_grid(annotation ~ patient) + + labs( + x = "average methylation", + y = "norm sample entropy" + ) + + guides(color = guide_legend(override.aes = list(size = 2))) + + theme_ng() + + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) +``` -Caution downsampled to some rows only. +## sampEn boxes by location and annotation - crc01 -```{r, import_log_reports} -DOWNSAMPLING <- 5e4 +```{r, fig.height = 6, fig.width = 10} +ggplot( + subset(files, patient == "CRC01"), + aes(y = sampen, x = annotation, color = location) +) + + geom_boxplot( + outlier.size = 0.6, + outlier.alpha = 0.3 + ) + + labs(y = "norm sample entropy") + + geom_jitter( + position = position_dodge(width = 0.75), + size = 0.6, alpha = 0.3 + ) + + scale_color_brewer(palette = "Dark2") + + theme_ng() + + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) +``` -annotation <- "pmds" -patient <- "CRC01" +## meth boxes by location and annotation - crc01 -local <- list() -for (location in c("NC", "PT", "LN", "ML", "MP")) { - filepaths <- list.files( - snakemake@params[["output_path"]], - pattern = "\\.det\\.out\\.gz$", full.names = TRUE - ) - filepaths <- grep(location, filepaths, value = TRUE) - filepaths <- grep(annotation, filepaths, value = TRUE) - filepaths <- grep(patient, filepaths, value = TRUE) - - d <- bplapply(filepaths, \(filepath) { - dt <- read_tsv( - filepath, - na = "-1", n_max = DOWNSAMPLING, show_col_types = FALSE - ) - colnames(dt) <- basename(colnames(dt)) - dt - }, BPPARAM = param) - - d2 <- bplapply(d, \(x) { - avg_sampen <- apply( - x[, grep(location, colnames(x))], 1, \(y) mean(y, na.rm = TRUE) - ) - median_sampen <- apply( - x[, grep(location, colnames(x))], 1, \(y) median(y, na.rm = TRUE) - ) - q25_sampen <- apply( - x[, grep(location, colnames(x))], 1, \(y) quantile(y, 0.25, na.rm = TRUE) - ) - q75_sampen <- apply( - x[, grep(location, colnames(x))], 1, \(y) quantile(y, 0.75, na.rm = TRUE) - ) - cbind(x, data.frame( - avg_sampen = avg_sampen, median_sampen = median_sampen, - q25_sampen = q25_sampen, q75_sampen = q75_sampen, - location = location, - patient = patient, - annotation = annotation - )) - }, BPPARAM = param) - - local[[location]] <- d2[[1]] - rm(d, d2) -} +```{r, fig.height = 6, fig.width = 10} +ggplot( + subset(files, patient == "CRC01"), + aes(y = avg_meth, x = annotation, color = location) +) + + geom_boxplot( + outlier.size = 0.6, + outlier.alpha = 0.3 + ) + + labs(y = "average methylation") + + geom_jitter( + position = position_dodge(width = 0.75), + size = 0.6, alpha = 0.3 + ) + + scale_color_brewer(palette = "Dark2") + + theme_ng() + + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) ``` -## Scatters {.tabset .tabset-pills} - -### By meth NC - -```{r, fig.width = 10, fig.height = 10} -pal <- viridis(10, alpha = 0.8) - -beta2m <- function(betas, epsilon = 1e-05) { - if (!is.numeric(betas)) { - stop("Non-numeric betas") - } - if (!(is.numeric(epsilon) && (!is.na(epsilon)))) { - stop("Epison not valid") - } - if (epsilon < 0 || epsilon >= 0.5) { - stop("Not 0 < epsilon < 0.5") - } - betas[betas < epsilon] <- epsilon - betas[betas > (1 - epsilon)] <- 1 - epsilon - return(log2(betas / (1 - betas))) -} +## Range plots - by compartment -pmds <- data.frame( - shannon_NC = local$NC$shannon, - meth_NC = local$NC$avg_meth, - med_sampen_NC = local$NC$median_sampen, - shannon_PT = local$PT$shannon, - meth_PT = local$PT$avg_meth, - med_sampen_PT = local$PT$median_sampen -) -pmds$meth_change <- pmds$meth_PT - pmds$meth_NC -pmds <- na.omit(pmds) +```{r, fig.height = 10, fig.width = 10} -pmds$by_meth_nc <- cut(pmds$meth_NC, breaks = 10) -pmds$by_meth_pt <- cut(pmds$meth_PT, breaks = 10) -plot(pmds[, grep("by", colnames(pmds), invert = TRUE)], - pch = ".", col = pal[as.numeric(pmds$by_meth_nc)], main = "by_meth_nc" -) +ranges <- as.data.frame(files %>% + filter(patient == "CRC01") %>% + group_by(compartment = annotation, location) %>% + summarise( + sampen_mean = mean(sampen, na.rm = TRUE), + sampen_min = min(sampen, na.rm = TRUE), + sampen_max = max(sampen, na.rm = TRUE), + meth_mean = mean(avg_meth, na.rm = TRUE), + meth_min = min(avg_meth, na.rm = TRUE), + meth_max = max(avg_meth, na.rm = TRUE))) + +## ggplot(ranges, aes(x = meth_mean, y = sampen_mean, color = location)) + +## geom_point(shape = 3, size = 1.2) + # "+" symbol +## geom_errorbar(aes(ymin = sampen_min, ymax = sampen_max), width = 0.3, alpha = 1, size = 1) + +## geom_errorbarh(aes(xmin = meth_min, xmax = meth_max), height = 0.3, alpha = 1, size = 0) + +## scale_color_brewer(palette = "Dark2") + +## labs( +## x = "DNA methylation", +## y = "Sample entropy" +## ) + +## facet_wrap(~ compartment) + +## theme_ng() + +# I think this is the correct one +ggplot(ranges, aes(color = location)) + + ## horizontal segment: range of methylations for that location/compartment + geom_segment( + aes(x = meth_min, xend = meth_max, y = sampen_mean, yend = sampen_mean), + size = 0.6, alpha = 1 + ) + + # vertical segment: range of sampens for that location/compartment + geom_segment( + aes(x = meth_mean, xend = meth_mean, y = sampen_min, yend = sampen_max), + size = 0.6, alpha = 1 + ) + + # center point + geom_point( + aes(x = meth_mean, y = sampen_mean), + shape = 3, size = 1.2 + ) + + scale_color_brewer(palette = "Dark2") + + labs(x = "DNA methylation", y = "Normalized sample entropy") + + facet_wrap(~ compartment) + + theme_ng() + ``` -### By median sampen PT +# Unnormalized Shannon's MEDIANS DOWNSAMPLED {.tabset .tabset-pills} +This cannot be done on short `out.gz` files, but on detailed ones only. Median values, not all loci included! -```{r, fig.width = 10, fig.height = 10} -pmds$by_med_sampen_pt <- cut(pmds$med_sampen_PT, breaks = 10) -plot(pmds[, grep("by", colnames(pmds), invert = TRUE)], - pch = ".", col = pal[as.numeric(pmds$by_med_sampen_pt)], - main = "by_med_sampen_PT" +```{r, eval = TRUE} +filepaths <- list.files( + snakemake@params[["output_path"]], + pattern = "det\\.out\\.gz$", full.names = TRUE ) + +filepaths <- filepaths[!grepl("norm", filepaths)] + +head(filepaths) + +## todo extract Shannon's here instead + +regex <- "(.*)_(.*)_(.*)_(.*).det.out.gz" + +process_file <- function(filepath) { + filename <- basename(filepath) + + tmp <- read_tsv(filepath, na = "-1", show_col_types = FALSE, n_max = 10e4) ## caution downsampling + + tibble( + file = filename, + median_shannon = median(tmp$shannon, na.rm = TRUE), + median_avg_meth = median(tmp$avg_meth, na.rm = TRUE), + annotation = gsub(regex, "\\1", filename), + patient = gsub(regex, "\\3", filename), + location = gsub(regex, "\\4", filename) + ) +} + +files <- bplapply(filepaths, process_file, BPPARAM = param) |> bind_rows() + +files$annotation <- factor(files$annotation) +files$patient <- factor(files$patient) +files$location <- factor(files$location, + levels = c("NC", "PT", "LN", "ML", "MP", "MO")) + ``` -### By median sampen NC +## Scatters by location and annotation + +```{r} +ggplot( + files, + aes(y = median_shannon, x = median_avg_meth, color = location, shape = patient) +) + + geom_point(size = 0.9, alpha = 0.2) + + scale_color_brewer(palette = "Set1") + + scale_shape_manual(values = 1:nlevels(files$patient)) + + facet_wrap(~annotation) + + labs( + x = "median average methylation", + y = "median Shannon entropy" + ) + + guides(color = guide_legend(override.aes = list(size = 2))) + + theme_ng() +``` -```{r, fig.width = 10, fig.height = 10, results = 'asis'} -pmds$by_med_sampen_nc <- cut(pmds$med_sampen_NC, breaks = 10) -plot(pmds[, grep("by", colnames(pmds), invert = TRUE)], - pch = ".", col = pal[as.numeric(pmds$by_med_sampen_nc)], - main = "by_med_sampen_nc" -) +## Scatters by location and annotation and patient + +```{r, fig.height = 15} +ggplot( + files, + aes(y = median_shannon, x = median_avg_meth, color = location) +) + + geom_point(size = 0.9, alpha = 0.2) + + ## scale_color_brewer(palette = "Set1") + + ## facet_grid(annotation ~ patient) + + labs( + x = "median average methylation", + y = "median Shannon entropy" + ) + + guides(color = guide_legend(override.aes = list(size = 2))) + + theme_ng() + + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) ``` -### By methylation change -```{r, fig.width = 10, fig.height = 10, results = 'asis'} -pmds$by_meth_change <- cut(pmds$meth_change, breaks = 10) -plot(pmds[, grep("by", colnames(pmds), invert = TRUE)], - pch = ".", col = pal[as.numeric(pmds$by_meth_change)], - main = "by_meth_change" + +# Meth-normalized Shannons (the yamet way) MEDIANS {.tabset .tabset-pills} + +This cannot be done on short `out.gz` files, but on detailed ones only. Median values, not all loci included! + +```{r, eval = TRUE} +filepaths <- list.files( + snakemake@params[["output_path"]], + pattern = "norm\\.det\\.out\\.gz$", full.names = TRUE ) -``` -Add extra discretization. +head(filepaths) -```{r} -pmds$by_sampen_pt <- cut(pmds$med_sampen_PT, breaks = 10) -pmds$by_sampen_nc <- cut(pmds$med_sampen_NC, breaks = 10) -pmds$by_shannon_pt <- cut(pmds$shannon_PT, breaks = 10) -pmds$by_shannon_nc <- cut(pmds$shannon_NC, breaks = 10) -``` +## todo extract Shannon's here instead + +regex <- "(.*)_(.*)_(.*)_(.*).norm.det.out.gz" + +process_file <- function(filepath) { + filename <- basename(filepath) -# SCNA checks, by cytoband + tmp <- read_tsv(filepath, na = "-1", show_col_types = FALSE, n_max = 10e4) # caution downsampling -Is this necessary? given we don't see much of a change https://github.com/imallona/yamet/blob/9156b817e04d9294024631dc25b528e191d913f4/workflow/rules/src/crc_deletions.Rmd#L96 + tibble( + file = filename, + median_shannon = median(tmp$shannon_norm, na.rm = TRUE), + median_avg_meth = median(tmp$avg_meth, na.rm = TRUE), + annotation = gsub(regex, "\\1", filename), + patient = gsub(regex, "\\3", filename), + location = gsub(regex, "\\4", filename) + ) +} +files <- bplapply(filepaths, process_file, BPPARAM = param) |> bind_rows() +files$annotation <- factor(files$annotation) +files$patient <- factor(files$patient) +files$location <- factor(files$location, + levels = c("NC", "PT", "LN", "ML", "MP", "MO")) +``` +## Scatters by location and annotation + +```{r} +ggplot( + files, + aes(y = median_shannon, x = median_avg_meth, color = location, shape = patient) +) + + geom_point(size = 0.9, alpha = 0.2) + + scale_color_brewer(palette = "Set1") + + scale_shape_manual(values = 1:nlevels(files$patient)) + + facet_wrap(~annotation) + + labs( + x = "median average methylation", + y = "median norm Shannon entropy" + ) + + guides(color = guide_legend(override.aes = list(size = 2))) + + theme_ng() +``` + +## Scatters by location and annotation and patient + +```{r, fig.height = 15} +ggplot( + files, + aes(y = median_shannon, x = median_avg_meth, color = location) +) + + geom_point(size = 0.9, alpha = 0.2) + + ## scale_color_brewer(palette = "Set1") + + ## facet_grid(annotation ~ patient) + + labs( + x = "median average methylation", + y = "median norm Shannon entropy" + ) + + guides(color = guide_legend(override.aes = list(size = 2))) + + theme_ng() + + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) +```