diff --git a/README.md b/README.md index 5da46e4..a81e9e9 100644 --- a/README.md +++ b/README.md @@ -44,46 +44,78 @@ bash build.sh ## Usage -`yamet` processes (covered CpG) DNA methylation report(s) (`-cell` argument), a reference file listing all CpG positions in a genome (`--reference`), and a bedfile specifying the genomic regions to calculate scores for (`--intervals`; e.g. promoters, genes, etc). Full CLI args: +`yamet` processes (covered CpG) DNA methylation report(s), a reference file listing all CpG positions in a genome, and a bedfile specifying the genomic regions to calculate scores for (e.g. promoters, genes, etc). Full CLI args: ```text -Usage: - yamet (-c ... | ...) \ - -r \ - -i \ - [OPTIONS] - -Required inputs: - -c --cell ... One or more tab-separated methylation files, - OR provide them directly as positional arguments. - ... (Positional alternative) Cell files (same format as above). - -r --reference Reference file, tab-separated and sorted by chromosome/position. - -i --intervals BED file of intervals of interest. - -Optional output: - -d --det-out Path to detailed output file. - -m --meth-out Path to average methylation output file. - -o --out Path to simple output file. - -Resource options: - --cores Number of cores for parallel parsing - [default: 0, implying program decides]. - --chunk-size Buffer size per file (e.g., 64K, 128M, 2G) [default: 64K]. - -Optional input control: - --skip-header[=] Skip lines in all input files [default: 1]. - --skip-header-cell[=] Skip lines in cell files (overrides --skip-header). - --skip-header-reference[=] Skip lines in reference file (overrides --skip-header). - --skip-header-intervals[=] Skip lines in intervals file (overrides --skip-header). - -Verbose/debugging: - --print-intervals Print parsed intervals file. - --print-reference Print parsed reference file. - --print-sampens[=] Print computed sample entropies [default: true]. - -Miscellaneous: - -h --help Show help message. - --version Show version information. +yamet + +input: + -c [ --cytosine_report ] arg Per-cell cytosine report file(s) + (Bismark-like for covered cytosines). + Synonyms: --cytosine_report, --cell, + -c. + Tab-separated, sorted by chromosome and + position: + chr pos meth_reads total_reads + rate + -r [ --cytosine_locations ] arg Genomic locations of all cytosines + (typically CpGs). Synonyms: + --cytosine_locations, --reference, -r. + Required to reconstruct contiguous CpG + sequences. + Columns: + chr pos + -i [ --regions ] arg BED file defining genomic regions where + entropies will be computed. Synonyms: + --regions, --features, --target, + --intervals, -i. + Columns: + chr start end + --skip-header-all [=arg(=1)] Header lines to skip in all input files + (default: 0). Synonyms: + --skip-header-all, --skip-header. + --skip-header-cytosine_report [=arg(=1)] + Header lines to skip in + cytosine_report/cell files (default: + 0). + --skip-header-cytosine_locations [=arg(=1)] + Header lines to skip in + cytosine_locations/reference file + (default: 0). + --skip-header-regions [=arg(=1)] Header lines to skip in + regions/features/target/intervals file + (default: 0). + +output: + -d [ --det-out ] arg (optional) path to detailed output file + -n [ --norm-det-out ] arg (optional) path to detailed normalized + output file + -m [ --meth-out ] arg (optional) path to average methylation + output file + --all-meth [=arg(=true)] (=false) If true, include all CpGs in + methylation summaries, including those + not used for template construction + (default: false). + -o [ --out ] arg (optional) path to simple output file + +resource utilisation: + --cores arg (=0) number of cores used for simultaneously + parsing methylation files + --chunk-size arg (=64K) size of the buffer (per file) used for + reading data. Can be specified as a + positive integer (bytes) or with a + suffix: B, K, M, G. + +verbose: + --print-intervals print parsed intervals file + --print-reference print parsed reference file + --print-sampens [=arg(=true)] (=true) print computed sample entropies + +misc: + -h [ --help ] print help message + --version current version information + + ``` ## Repository diff --git a/workflow/Snakefile b/workflow/Snakefile index 832011d..311a818 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -34,7 +34,7 @@ rule all: # op.join("annotation", "mm10", "done.flag"), ## this to download regions for Ecker's # op.join("data", "crc", "download.flag"), # this to download CRC cytosine reports, caution will redownload them # op.join("data", "crc", "results", "crc.html") , ## this requires the `download.flag` being present (and the cytosine reports; dependencies are not well captured) - # op.join(op.expanduser("~"), ".local", "yamet", "bin", "yamet") + op.join(op.expanduser("~"), ".local", "yamet", "bin", "yamet"), # op.join('scnmt_gastrulation_data', 'downloaded_data.tsv'), ## this is Argelaguet's gastrulation # expand( # op.join( @@ -58,12 +58,16 @@ rule all: rule install_yamet: """ - Binary to goes to ~/.local/yamet; export path accordingly + Binary to goes to ~/.local/yamet; exported path accordingly """ conda: "envs/yamet.yml" output: op.join(op.expanduser("~"), ".local", "yamet", "bin", "yamet") + priority: + 100 + threads: + workflow.cores log: op.join("logs", "build.log") shell: diff --git a/workflow/rules/crc.smk b/workflow/rules/crc.smk index 6b2d8a2..080327b 100644 --- a/workflow/rules/crc.smk +++ b/workflow/rules/crc.smk @@ -45,6 +45,8 @@ ANNOTATIONS = { "chip": ["H3K27me3", "H3K9me3", "H3K4me3"], "lad": ["laminb1"], "genes": ["genes"], + "lines": ["lines"], + "sines": ['sines'], "cpgIslandExt": ['cpgIslandExt'], "scna": ["crc01_nc_scna", "crc01_gain_scna", "crc01_lost_scna"] } @@ -224,10 +226,12 @@ rule run_yamet_on_separate_features: output: simple_uncomp = temp(op.join(CRC_OUTPUT, "{subcat}_{cat}_{patient}_{location}.out")), det_uncomp = temp(op.join(CRC_OUTPUT, "{subcat}_{cat}_{patient}_{location}.det.out")), + norm_det_uncomp=temp(op.join(CRC_OUTPUT, "{subcat}_{cat}_{patient}_{location}.norm.det.out")), meth_uncomp = temp(op.join(CRC_OUTPUT, "{subcat}_{cat}_{patient}_{location}.meth.out")), simple=op.join(CRC_OUTPUT, "{subcat}_{cat}_{patient}_{location}.out.gz"), det=op.join(CRC_OUTPUT, "{subcat}_{cat}_{patient}_{location}.det.out.gz"), - meth=op.join(CRC_OUTPUT, "{subcat}_{cat}_{patient}_{location}.meth.out.gz") + meth=op.join(CRC_OUTPUT, "{subcat}_{cat}_{patient}_{location}.meth.out.gz"), + norm_det=op.join(CRC_OUTPUT, "{subcat}_{cat}_{patient}_{location}.norm.det.out.gz") log: op.join('logs', 'yamet_{subcat}_{cat}_{patient}_{location}.log') group: @@ -246,7 +250,8 @@ rule run_yamet_on_separate_features: --print-sampens F \ --out {output.simple_uncomp} \ --det-out {output.det_uncomp} \ - --meth-out {output.meth_uncomp} &> {log} + --meth-out {output.meth_uncomp} \ + --norm-det-out {output.norm_det_uncomp} &> {log} gzip --keep -f {params.path}/{wildcards.subcat}_{wildcards.cat}_{wildcards.patient}_{wildcards.location}*out &>> {log} """ @@ -267,6 +272,8 @@ rule render_crc_report: list_relevant_yamet_outputs() params: output_path=CRC_OUTPUT + threads: + round(workflow.cores/2) output: op.join(CRC, "results", "crc.html") log: @@ -376,9 +383,11 @@ rule run_yamet_on_windows: simple_uncomp=temp(op.join(CRC_WINDOWS_OUTPUT, "{win_size}_{patient}_{location}.out")), det_uncomp=temp(op.join(CRC_WINDOWS_OUTPUT, "{win_size}_{patient}_{location}.det.out")), meth_uncomp=temp(op.join(CRC_WINDOWS_OUTPUT, "{win_size}_{patient}_{location}.meth.out")), + norm_det_uncomp=temp(op.join(CRC_WINDOWS_OUTPUT, "{win_size}_{patient}_{location}.norm.det.out")), simple=op.join(CRC_WINDOWS_OUTPUT, "{win_size}_{patient}_{location}.out.gz"), det=op.join(CRC_WINDOWS_OUTPUT, "{win_size}_{patient}_{location}.det.out.gz"), - meth=op.join(CRC_WINDOWS_OUTPUT, "{win_size}_{patient}_{location}.meth.out.gz") + meth=op.join(CRC_WINDOWS_OUTPUT, "{win_size}_{patient}_{location}.meth.out.gz"), + norm_det=op.join(CRC_WINDOWS_OUTPUT, "{win_size}_{patient}_{location}.norm.det.out.gz") log: op.join('logs', 'yamet_{win_size}_{patient}_{location}.log') group: @@ -397,7 +406,8 @@ rule run_yamet_on_windows: --print-sampens F \ --out {output.simple_uncomp} \ --det-out {output.det_uncomp} \ - --meth-out {output.meth_uncomp} &> {log} + --meth-out {output.meth_uncomp} \ + --norm-det-out {output.norm_det_uncomp} &> {log} gzip --keep -f {params.path}/{wildcards.win_size}_{wildcards.patient}_{wildcards.location}*out &>> {log} """ @@ -416,9 +426,9 @@ rule get_scna_patient1_from_supplementary_data: conda: op.join("..", "envs", "r.yml") input: - op.join("src", "scna_hg19.xlsx"), + op.join(".", "src", "scna_hg19.xlsx"), output: - scna_bed = op.join(HG19_BASE, "patient_crc01_scna.scna.bed.gz") + scna_bed = op.join(HG19_BASE, "patient_crc01_scna.scna.bed.gz.pre") script: "src/parse_scna.R" @@ -426,7 +436,7 @@ rule split_patient1_crc_in_kept_lost_gained: conda: op.join("..", "envs", "r.yml") input: - scna_bed = op.join(HG19_BASE, "patient_crc01_scna.scna.bed.gz"), + scna_bed = op.join(HG19_BASE, "patient_crc01_scna.scna.bed.gz.pre"), genome_sizes = op.join(HG19_BASE, "genome.sizes"), output: uncomp = temp(op.join(HG19_BASE, "crc01_scna.bed")), diff --git a/workflow/rules/hg19.smk b/workflow/rules/hg19.smk index 43b4800..85f36a0 100644 --- a/workflow/rules/hg19.smk +++ b/workflow/rules/hg19.smk @@ -43,6 +43,23 @@ rule build_genome_hg19_ref: "src/make_ref.sh" +rule get_hg19_lines: + conda: + op.join("..", "envs", "processing.yml") + output: + op.join(HG19_BASE, "rmsk.lines.bed.gz"), + script: + "src/download_hg19_lines.sh" + + +rule get_hg19_sines: + conda: + op.join("..", "envs", "processing.yml") + output: + op.join(HG19_BASE, "rmsk.sines.bed.gz"), + script: + "src/download_hg19_sines.sh" + rule get_genes_hg19: conda: op.join("..", "envs", "processing.yml") @@ -51,6 +68,16 @@ rule get_genes_hg19: script: "src/download_hg19_genes.sh" +rule uncompress_repeats: + conda: + op.join("..", "envs", "processing.yml") + input: + op.join(HG19_BASE, "rmsk.{type}.bed.gz"), + output: + op.join(HG19_BASE, "{type}.{type}.bed"), + shell: + "gunzip -c {input} > {output}" + rule uncompress_hg19_genes: conda: diff --git a/workflow/rules/mm10.smk b/workflow/rules/mm10.smk index 2d730ed..7e09ec1 100644 --- a/workflow/rules/mm10.smk +++ b/workflow/rules/mm10.smk @@ -6,7 +6,7 @@ CHRS = [str(i) for i in range(1, 20)] + ["X", "Y"] MM10_BASE = "mm10" -rule mm10_chr_ref: +rule mm10_per_chr_ref: conda: op.join("..", "envs", "processing.yml") output: @@ -18,7 +18,7 @@ rule mm10_chr_ref: "src/get_chr_ref.sh" -rule mm10_ref: +rule mm10_aggregate_ref: input: expand(op.join(MM10_BASE, "{chr}.{{meth_pat}}.ref"), chr=CHRS), params: @@ -46,7 +46,24 @@ rule get_mm10_genes: script: "src/download_mm10_genes.sh" +rule get_mm10_lines: + conda: + op.join("..", "envs", "processing.yml") + output: + op.join(MM10_BASE, "lines.bed.gz"), + script: + "src/download_hg19_lines.sh" + + +rule get_mm10_sines: + conda: + op.join("..", "envs", "processing.yml") + output: + op.join(MM10_BASE, "sines.bed.gz"), + script: + "src/download_mm10_sines.sh" + ENCODE_MAP = { "h3k4me3": "ENCFF160SCR", "h3k9me3": "ENCFF658QTP", diff --git a/workflow/rules/src/crc.Rmd b/workflow/rules/src/crc.Rmd index 4344c1d..2f3b503 100644 --- a/workflow/rules/src/crc.Rmd +++ b/workflow/rules/src/crc.Rmd @@ -49,6 +49,7 @@ opts_chunk$set( ```{r save_snmk_object_for_debugging, include = FALSE} saveRDS(file = 'snmk_object_features.rds', object = snakemake) +## snakemake <- readRDS('../../snmk_object_features.rds') ``` @@ -58,11 +59,9 @@ sink(log) sink(log, type = "message") ``` -# Short (global) reports by annotation {.tabset .tabset-pills} -## Modelling -Firstly, we read in the yamet output files and create a table storing the biopsy location, patient id and genomic segmentation. +These are not det.out.gz, but only out.gz files. ```{r, import_short_reports} filepaths <- list.files( @@ -71,9 +70,9 @@ filepaths <- list.files( ) filepaths <- filepaths[!grepl("\\.det\\.out\\.gz$", filepaths)] -filepaths <- filepaths[!grepl("\\.norm\\.out\\.gz$", filepaths)] filepaths <- filepaths[!grepl("\\.meth\\.out\\.gz$", filepaths)] + regex <- "(.*)_(.*)_(.*)_(.*).out.gz" process_file <- function(filepath) { filename <- basename(filepath) @@ -98,17 +97,6 @@ files$location <- factor( ) ``` -Patient as random effect - -```{r, lmer} -lmer.files <- lmer( - sampen ~ location * avg_meth + (1 | patient), - data = subset(files, annotation == "pmds") -) -summary(lmer.files) -anova(lmer.files, type = 3) -``` - ## Scatters by location and annotation ```{r} @@ -262,169 +250,360 @@ ggplot(ranges, aes(color = location)) + ``` -## Others +# After normalization - same plots (the yamet-way) -```{r, fig.height = 10, fig.width = 10} +That is, not by locus but by cell norm sampEns (that are calculated according to some regionset anyway) -ggplot(ranges, aes(x = compartment, y = location, fill = sampen_mean)) + - geom_tile(color = "white") + - scale_fill_viridis_c() + - theme_minimal() + - theme(axis.text.x = element_text(angle = 90)) +```{r, import_short_reports_norm} +filepaths <- list.files( + snakemake@params[["output_path"]], + pattern = "out\\.gz$", full.names = TRUE) -ggplot(ranges, aes(x = compartment, y = location, fill = meth_mean)) + - geom_tile(color = "white") + - scale_fill_viridis_c() + - theme_minimal() + - theme(axis.text.x = element_text(angle = 90)) +filepaths <- filepaths[!grepl("\\.det\\.out\\.gz$", filepaths)] +filepaths <- filepaths[!grepl("\\.norm\\.out\\.gz$", filepaths)] +filepaths <- filepaths[!grepl("\\.meth\\.out\\.gz$", filepaths)] + +head(filepaths) +regex <- "(.*)_(.*)_(.*)_(.*).out.gz" +process_file <- function(filepath) { + filename <- basename(filepath) + + dt <- read_tsv(filepath, na = "-1", show_col_types = FALSE) + dt <- dt[, c("sampen_norm", "avg_meth")] + colnames(dt) <- c('sampen', 'avg_meth') ## caution they're normalized though + + dt$annotation <- gsub(regex, "\\1", filename) + dt$patient <- gsub(regex, "\\3", filename) + dt$location <- gsub(regex, "\\4", filename) + + return(dt) +} + +head(filepaths) + +files <- do.call(rbind, bplapply(filepaths, process_file, BPPARAM = param)) + +files$annotation <- as.factor(files$annotation) +files$patient <- as.factor(files$patient) +files$location <- factor( + files$location, + levels = c("NC", "PT", "LN", "ML", "MP", "MO") +) +``` + +## Scatters by location and annotation + +```{r} +ggplot( + ## subset(files, patient == "CRC01"), + files, + aes(y = sampen, x = avg_meth, color = location, shape = patient) +) + + geom_point(size = 0.9, alpha = 0.2) + + scale_color_brewer(palette = "Set1") + + scale_shape_manual(values = 1:nlevels(files$patient)) + + facet_wrap(~annotation) + + labs( + x = "average methylation", + y = "norm sample entropy" + ) + + guides(color = guide_legend(override.aes = list(size = 2))) + + theme_ng() +``` +## Scatters by location and annotation and patient + +```{r, fig.height = 15} +ggplot( + ## subset(files, patient == "CRC01"), + files, + aes(y = sampen, x = avg_meth, color = location) +) + + geom_point(size = 0.9, alpha = 0.2) + + scale_color_brewer(palette = "Set1") + + facet_grid(annotation ~ patient) + + labs( + x = "average methylation", + y = "norm sample entropy" + ) + + guides(color = guide_legend(override.aes = list(size = 2))) + + theme_ng() + + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) ``` -# Full (local) reports for **downsampled** PMDs and CRC01 +## Scatters by location and annotation and patient (shorter) + +```{r, fig.height = 3.2} +ggplot( + subset(files, annotation %in% c("laminb1", "H3K27me3", "H3K9me3")), + aes(y = sampen, x = avg_meth, color = location) +) + + geom_point(size = 0.9, alpha = 0.2) + + scale_color_brewer(palette = "Set1") + + facet_grid(annotation ~ patient) + + labs( + x = "average methylation", + y = "norm sample entropy" + ) + + guides(color = guide_legend(override.aes = list(size = 2))) + + theme_ng() + + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) +``` -Caution downsampled to some rows only. +## sampEn boxes by location and annotation - crc01 -```{r, import_log_reports} -DOWNSAMPLING <- 5e4 +```{r, fig.height = 6, fig.width = 10} +ggplot( + subset(files, patient == "CRC01"), + aes(y = sampen, x = annotation, color = location) +) + + geom_boxplot( + outlier.size = 0.6, + outlier.alpha = 0.3 + ) + + labs(y = "norm sample entropy") + + geom_jitter( + position = position_dodge(width = 0.75), + size = 0.6, alpha = 0.3 + ) + + scale_color_brewer(palette = "Dark2") + + theme_ng() + + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) +``` -annotation <- "pmds" -patient <- "CRC01" +## meth boxes by location and annotation - crc01 -local <- list() -for (location in c("NC", "PT", "LN", "ML", "MP")) { - filepaths <- list.files( - snakemake@params[["output_path"]], - pattern = "\\.det\\.out\\.gz$", full.names = TRUE - ) - filepaths <- grep(location, filepaths, value = TRUE) - filepaths <- grep(annotation, filepaths, value = TRUE) - filepaths <- grep(patient, filepaths, value = TRUE) - - d <- bplapply(filepaths, \(filepath) { - dt <- read_tsv( - filepath, - na = "-1", n_max = DOWNSAMPLING, show_col_types = FALSE - ) - colnames(dt) <- basename(colnames(dt)) - dt - }, BPPARAM = param) - - d2 <- bplapply(d, \(x) { - avg_sampen <- apply( - x[, grep(location, colnames(x))], 1, \(y) mean(y, na.rm = TRUE) - ) - median_sampen <- apply( - x[, grep(location, colnames(x))], 1, \(y) median(y, na.rm = TRUE) - ) - q25_sampen <- apply( - x[, grep(location, colnames(x))], 1, \(y) quantile(y, 0.25, na.rm = TRUE) - ) - q75_sampen <- apply( - x[, grep(location, colnames(x))], 1, \(y) quantile(y, 0.75, na.rm = TRUE) - ) - cbind(x, data.frame( - avg_sampen = avg_sampen, median_sampen = median_sampen, - q25_sampen = q25_sampen, q75_sampen = q75_sampen, - location = location, - patient = patient, - annotation = annotation - )) - }, BPPARAM = param) - - local[[location]] <- d2[[1]] - rm(d, d2) -} +```{r, fig.height = 6, fig.width = 10} +ggplot( + subset(files, patient == "CRC01"), + aes(y = avg_meth, x = annotation, color = location) +) + + geom_boxplot( + outlier.size = 0.6, + outlier.alpha = 0.3 + ) + + labs(y = "average methylation") + + geom_jitter( + position = position_dodge(width = 0.75), + size = 0.6, alpha = 0.3 + ) + + scale_color_brewer(palette = "Dark2") + + theme_ng() + + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) ``` -## Scatters {.tabset .tabset-pills} - -### By meth NC - -```{r, fig.width = 10, fig.height = 10} -pal <- viridis(10, alpha = 0.8) - -beta2m <- function(betas, epsilon = 1e-05) { - if (!is.numeric(betas)) { - stop("Non-numeric betas") - } - if (!(is.numeric(epsilon) && (!is.na(epsilon)))) { - stop("Epison not valid") - } - if (epsilon < 0 || epsilon >= 0.5) { - stop("Not 0 < epsilon < 0.5") - } - betas[betas < epsilon] <- epsilon - betas[betas > (1 - epsilon)] <- 1 - epsilon - return(log2(betas / (1 - betas))) -} +## Range plots - by compartment -pmds <- data.frame( - shannon_NC = local$NC$shannon, - meth_NC = local$NC$avg_meth, - med_sampen_NC = local$NC$median_sampen, - shannon_PT = local$PT$shannon, - meth_PT = local$PT$avg_meth, - med_sampen_PT = local$PT$median_sampen -) -pmds$meth_change <- pmds$meth_PT - pmds$meth_NC -pmds <- na.omit(pmds) +```{r, fig.height = 10, fig.width = 10} -pmds$by_meth_nc <- cut(pmds$meth_NC, breaks = 10) -pmds$by_meth_pt <- cut(pmds$meth_PT, breaks = 10) -plot(pmds[, grep("by", colnames(pmds), invert = TRUE)], - pch = ".", col = pal[as.numeric(pmds$by_meth_nc)], main = "by_meth_nc" -) +ranges <- as.data.frame(files %>% + filter(patient == "CRC01") %>% + group_by(compartment = annotation, location) %>% + summarise( + sampen_mean = mean(sampen, na.rm = TRUE), + sampen_min = min(sampen, na.rm = TRUE), + sampen_max = max(sampen, na.rm = TRUE), + meth_mean = mean(avg_meth, na.rm = TRUE), + meth_min = min(avg_meth, na.rm = TRUE), + meth_max = max(avg_meth, na.rm = TRUE))) + +## ggplot(ranges, aes(x = meth_mean, y = sampen_mean, color = location)) + +## geom_point(shape = 3, size = 1.2) + # "+" symbol +## geom_errorbar(aes(ymin = sampen_min, ymax = sampen_max), width = 0.3, alpha = 1, size = 1) + +## geom_errorbarh(aes(xmin = meth_min, xmax = meth_max), height = 0.3, alpha = 1, size = 0) + +## scale_color_brewer(palette = "Dark2") + +## labs( +## x = "DNA methylation", +## y = "Sample entropy" +## ) + +## facet_wrap(~ compartment) + +## theme_ng() + +# I think this is the correct one +ggplot(ranges, aes(color = location)) + + ## horizontal segment: range of methylations for that location/compartment + geom_segment( + aes(x = meth_min, xend = meth_max, y = sampen_mean, yend = sampen_mean), + size = 0.6, alpha = 1 + ) + + # vertical segment: range of sampens for that location/compartment + geom_segment( + aes(x = meth_mean, xend = meth_mean, y = sampen_min, yend = sampen_max), + size = 0.6, alpha = 1 + ) + + # center point + geom_point( + aes(x = meth_mean, y = sampen_mean), + shape = 3, size = 1.2 + ) + + scale_color_brewer(palette = "Dark2") + + labs(x = "DNA methylation", y = "Normalized sample entropy") + + facet_wrap(~ compartment) + + theme_ng() + ``` -### By median sampen PT +# Unnormalized Shannon's MEDIANS DOWNSAMPLED {.tabset .tabset-pills} +This cannot be done on short `out.gz` files, but on detailed ones only. Median values, not all loci included! -```{r, fig.width = 10, fig.height = 10} -pmds$by_med_sampen_pt <- cut(pmds$med_sampen_PT, breaks = 10) -plot(pmds[, grep("by", colnames(pmds), invert = TRUE)], - pch = ".", col = pal[as.numeric(pmds$by_med_sampen_pt)], - main = "by_med_sampen_PT" +```{r, eval = TRUE} +filepaths <- list.files( + snakemake@params[["output_path"]], + pattern = "det\\.out\\.gz$", full.names = TRUE ) + +filepaths <- filepaths[!grepl("norm", filepaths)] + +head(filepaths) + +## todo extract Shannon's here instead + +regex <- "(.*)_(.*)_(.*)_(.*).det.out.gz" + +process_file <- function(filepath) { + filename <- basename(filepath) + + tmp <- read_tsv(filepath, na = "-1", show_col_types = FALSE, n_max = 10e4) ## caution downsampling + + tibble( + file = filename, + median_shannon = median(tmp$shannon, na.rm = TRUE), + median_avg_meth = median(tmp$avg_meth, na.rm = TRUE), + annotation = gsub(regex, "\\1", filename), + patient = gsub(regex, "\\3", filename), + location = gsub(regex, "\\4", filename) + ) +} + +files <- bplapply(filepaths, process_file, BPPARAM = param) |> bind_rows() + +files$annotation <- factor(files$annotation) +files$patient <- factor(files$patient) +files$location <- factor(files$location, + levels = c("NC", "PT", "LN", "ML", "MP", "MO")) + ``` -### By median sampen NC +## Scatters by location and annotation + +```{r} +ggplot( + files, + aes(y = median_shannon, x = median_avg_meth, color = location, shape = patient) +) + + geom_point(size = 0.9, alpha = 0.2) + + scale_color_brewer(palette = "Set1") + + scale_shape_manual(values = 1:nlevels(files$patient)) + + facet_wrap(~annotation) + + labs( + x = "median average methylation", + y = "median Shannon entropy" + ) + + guides(color = guide_legend(override.aes = list(size = 2))) + + theme_ng() +``` -```{r, fig.width = 10, fig.height = 10, results = 'asis'} -pmds$by_med_sampen_nc <- cut(pmds$med_sampen_NC, breaks = 10) -plot(pmds[, grep("by", colnames(pmds), invert = TRUE)], - pch = ".", col = pal[as.numeric(pmds$by_med_sampen_nc)], - main = "by_med_sampen_nc" -) +## Scatters by location and annotation and patient + +```{r, fig.height = 15} +ggplot( + files, + aes(y = median_shannon, x = median_avg_meth, color = location) +) + + geom_point(size = 0.9, alpha = 0.2) + + ## scale_color_brewer(palette = "Set1") + + ## facet_grid(annotation ~ patient) + + labs( + x = "median average methylation", + y = "median Shannon entropy" + ) + + guides(color = guide_legend(override.aes = list(size = 2))) + + theme_ng() + + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) ``` -### By methylation change -```{r, fig.width = 10, fig.height = 10, results = 'asis'} -pmds$by_meth_change <- cut(pmds$meth_change, breaks = 10) -plot(pmds[, grep("by", colnames(pmds), invert = TRUE)], - pch = ".", col = pal[as.numeric(pmds$by_meth_change)], - main = "by_meth_change" + +# Meth-normalized Shannons (the yamet way) MEDIANS {.tabset .tabset-pills} + +This cannot be done on short `out.gz` files, but on detailed ones only. Median values, not all loci included! + +```{r, eval = TRUE} +filepaths <- list.files( + snakemake@params[["output_path"]], + pattern = "norm\\.det\\.out\\.gz$", full.names = TRUE ) -``` -Add extra discretization. +head(filepaths) -```{r} -pmds$by_sampen_pt <- cut(pmds$med_sampen_PT, breaks = 10) -pmds$by_sampen_nc <- cut(pmds$med_sampen_NC, breaks = 10) -pmds$by_shannon_pt <- cut(pmds$shannon_PT, breaks = 10) -pmds$by_shannon_nc <- cut(pmds$shannon_NC, breaks = 10) -``` +## todo extract Shannon's here instead + +regex <- "(.*)_(.*)_(.*)_(.*).norm.det.out.gz" + +process_file <- function(filepath) { + filename <- basename(filepath) -# SCNA checks, by cytoband + tmp <- read_tsv(filepath, na = "-1", show_col_types = FALSE, n_max = 10e4) # caution downsampling -Is this necessary? given we don't see much of a change https://github.com/imallona/yamet/blob/9156b817e04d9294024631dc25b528e191d913f4/workflow/rules/src/crc_deletions.Rmd#L96 + tibble( + file = filename, + median_shannon = median(tmp$shannon_norm, na.rm = TRUE), + median_avg_meth = median(tmp$avg_meth, na.rm = TRUE), + annotation = gsub(regex, "\\1", filename), + patient = gsub(regex, "\\3", filename), + location = gsub(regex, "\\4", filename) + ) +} +files <- bplapply(filepaths, process_file, BPPARAM = param) |> bind_rows() +files$annotation <- factor(files$annotation) +files$patient <- factor(files$patient) +files$location <- factor(files$location, + levels = c("NC", "PT", "LN", "ML", "MP", "MO")) +``` +## Scatters by location and annotation + +```{r} +ggplot( + files, + aes(y = median_shannon, x = median_avg_meth, color = location, shape = patient) +) + + geom_point(size = 0.9, alpha = 0.2) + + scale_color_brewer(palette = "Set1") + + scale_shape_manual(values = 1:nlevels(files$patient)) + + facet_wrap(~annotation) + + labs( + x = "median average methylation", + y = "median norm Shannon entropy" + ) + + guides(color = guide_legend(override.aes = list(size = 2))) + + theme_ng() +``` + +## Scatters by location and annotation and patient + +```{r, fig.height = 15} +ggplot( + files, + aes(y = median_shannon, x = median_avg_meth, color = location) +) + + geom_point(size = 0.9, alpha = 0.2) + + ## scale_color_brewer(palette = "Set1") + + ## facet_grid(annotation ~ patient) + + labs( + x = "median average methylation", + y = "median norm Shannon entropy" + ) + + guides(color = guide_legend(override.aes = list(size = 2))) + + theme_ng() + + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) +``` diff --git a/workflow/rules/src/download_hg19_lines.sh b/workflow/rules/src/download_hg19_lines.sh new file mode 100644 index 0000000..a418bc6 --- /dev/null +++ b/workflow/rules/src/download_hg19_lines.sh @@ -0,0 +1,12 @@ +#!/bin/bash +## +## Downloads RepeatMasker (rmsk) for hg19 and extracts only SINE elements +## + +mysql --user=genome --host=genome-mysql.cse.ucsc.edu -N -s -e \ + 'SELECT genoName, genoStart, genoEnd, repName, repClass, repFamily + FROM hg19.rmsk + WHERE repClass = "LINE";' | + awk 'BEGIN{OFS="\t"} {print $1, $2, $3, $4, $5, $6}' | + bedtools merge -i - | + gzip -c > ${snakemake_output[0]} diff --git a/workflow/rules/src/download_hg19_sines.sh b/workflow/rules/src/download_hg19_sines.sh new file mode 100644 index 0000000..fb8689b --- /dev/null +++ b/workflow/rules/src/download_hg19_sines.sh @@ -0,0 +1,12 @@ +#!/bin/bash +## +## Downloads RepeatMasker (rmsk) for hg19 and extracts only SINE elements +## + +mysql --user=genome --host=genome-mysql.cse.ucsc.edu -N -s -e \ + 'SELECT genoName, genoStart, genoEnd, repName, repClass, repFamily + FROM hg19.rmsk + WHERE repClass = "SINE";' | + awk 'BEGIN{OFS="\t"} {print $1, $2, $3, $4, $5, $6}' | + bedtools merge -i - | + gzip -c > ${snakemake_output[0]} diff --git a/workflow/rules/src/download_mm10_lines.sh b/workflow/rules/src/download_mm10_lines.sh new file mode 100644 index 0000000..8ea0e27 --- /dev/null +++ b/workflow/rules/src/download_mm10_lines.sh @@ -0,0 +1,12 @@ +#!/bin/bash +## +## Downloads RepeatMasker (rmsk) for mm10 and extracts only SINE elements +## + +mysql --user=genome --host=genome-mysql.cse.ucsc.edu -N -s -e \ + 'SELECT genoName, genoStart, genoEnd, repName, repClass, repFamily + FROM mm10.rmsk + WHERE repClass = "LINE";' | + awk 'BEGIN{OFS="\t"} {print $1, $2, $3, $4, $5, $6}' | + bedtools merge -i - | + gzip -c > ${snakemake_output[0]} diff --git a/workflow/rules/src/download_mm10_sines.sh b/workflow/rules/src/download_mm10_sines.sh new file mode 100644 index 0000000..c618543 --- /dev/null +++ b/workflow/rules/src/download_mm10_sines.sh @@ -0,0 +1,12 @@ +#!/bin/bash +## +## Downloads RepeatMasker (rmsk) for mm10 and extracts only SINE elements +## + +mysql --user=genome --host=genome-mysql.cse.ucsc.edu -N -s -e \ + 'SELECT genoName, genoStart, genoEnd, repName, repClass, repFamily + FROM mm10.rmsk + WHERE repClass = "SINE";' | + awk 'BEGIN{OFS="\t"} {print $1, $2, $3, $4, $5, $6}' | + bedtools merge -i - | + gzip -c > ${snakemake_output[0]}