-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdownstream_analysis.R
More file actions
375 lines (325 loc) · 14.4 KB
/
downstream_analysis.R
File metadata and controls
375 lines (325 loc) · 14.4 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
# =============================================================================
# ROGEN Project - Activity 2.1.8.1
# Downstream Methylation Analysis with DMRcaller
# =============================================================================
# This R script demonstrates the downstream analysis workflow for identifying
# Differentially Methylated Regions (DMRs) using the DMRcaller Bioconductor
# package.
#
# Input: bedMethyl files generated by Modkit (from pipeline_validation.sh)
# Output: DMRs between experimental groups (e.g., Old vs Young samples)
# =============================================================================
# =============================================================================
# SECTION 1: Package Installation and Loading
# =============================================================================
# Install BiocManager if not already installed
if (!require("BiocManager", quietly = TRUE)) {
install.packages("BiocManager")
}
# Install DMRcaller if not already installed
# DMRcaller is available on Bioconductor
if (!require("DMRcaller", quietly = TRUE)) {
BiocManager::install("DMRcaller")
}
# Load required libraries
library(DMRcaller)
library(GenomicRanges) # For genomic range operations
library(rtracklayer) # For importing bedMethyl files
# Additional useful packages for methylation analysis
# Uncomment if needed:
# library(ggplot2) # For visualization
# library(dplyr) # For data manipulation
# library(BSgenome) # For reference genome sequences
# =============================================================================
# SECTION 2: Data Import and Preprocessing
# =============================================================================
#' Import bedMethyl file generated by Modkit
#'
#' @param bedmethyl_path Character string, path to bedMethyl file
#' @return GRanges object with methylation data
#'
#' @details
#' The bedMethyl format (BED 9+3) contains:
#' - Standard BED fields: chrom, start, end, name, score, strand
#' - Coverage: number of reads covering this position
#' - Percent methylated: percentage of reads showing methylation
#' - Additional fields: count_methylated, count_unmethylated
#'
#' DMRcaller expects data in a specific format, so we may need to convert
#' the bedMethyl data to the required structure.
import_bedmethyl <- function(bedmethyl_path) {
cat("Importing bedMethyl file:", bedmethyl_path, "\n")
# Read bedMethyl file using rtracklayer
# The bedMethyl format is a BED9+3 format
bedmethyl_data <- import(
bedmethyl_path,
format = "bed",
genome = "unknown" # Specify genome if known (e.g., "hg38", "mm10")
)
# Extract metadata columns (coverage, percent methylated, etc.)
# These are typically in additional columns after the standard BED fields
# Note: Actual column names may vary based on Modkit version
cat(" Imported", length(bedmethyl_data), "methylation calls\n")
return(bedmethyl_data)
}
# Example usage (commented out for skeleton):
# young_sample_1 <- import_bedmethyl("path/to/young_sample1.bedMethyl")
# young_sample_2 <- import_bedmethyl("path/to/young_sample2.bedMethyl")
# old_sample_1 <- import_bedmethyl("path/to/old_sample1.bedMethyl")
# old_sample_2 <- import_bedmethyl("path/to/old_sample2.bedMethyl")
# =============================================================================
# SECTION 3: Data Preparation for DMRcaller
# =============================================================================
#' Prepare methylation data for DMRcaller
#'
#' @param bedmethyl_granges GRanges object from import_bedmethyl
#' @param sample_name Character string, name of the sample
#' @return List with coverage and methylation matrices
#'
#' @details
#' DMRcaller requires:
#' - Coverage matrix: rows = genomic positions, columns = samples
#' - Methylation matrix: rows = genomic positions, columns = samples
#' - Both matrices must have matching dimensions and row names (genomic positions)
prepare_dmrcaller_data <- function(bedmethyl_granges, sample_name) {
cat("Preparing data for DMRcaller:", sample_name, "\n")
# Extract coverage information
# In bedMethyl format, coverage is typically in a metadata column
# Adjust column name based on your Modkit output
coverage <- mcols(bedmethyl_granges)$score # Or appropriate column name
# Extract methylation percentage and convert to counts
# percent_methylated is typically in a metadata column
# Adjust based on actual bedMethyl structure
percent_meth <- mcols(bedmethyl_granges)$thickStart # Placeholder - adjust!
# percent_meth <- mcols(bedmethyl_granges)$percent_methylated # Actual column
# Calculate methylated and unmethylated counts
methylated_count <- round(coverage * percent_meth / 100)
unmethylated_count <- coverage - methylated_count
# Create genomic position identifiers
positions <- paste0(
seqnames(bedmethyl_granges), ":",
start(bedmethyl_granges), "-",
end(bedmethyl_granges)
)
return(list(
positions = positions,
coverage = coverage,
methylated = methylated_count,
unmethylated = unmethylated_count
))
}
# =============================================================================
# SECTION 4: DMR Calling Function
# =============================================================================
#' Identify Differentially Methylated Regions (DMRs) between two groups
#'
#' @param group1_data List of data frames/lists for group 1 samples (e.g., Young)
#' @param group2_data List of data frames/lists for group 2 samples (e.g., Old)
#' @param group1_name Character string, name of group 1
#' @param group2_name Character string, name of group 2
#' @param min_coverage Numeric, minimum coverage required per position (default: 5)
#' @param p_value_threshold Numeric, p-value threshold for DMR significance (default: 0.01)
#' @param min_cpg Numeric, minimum number of CpG sites per DMR (default: 3)
#' @return GRanges object with identified DMRs
#'
#' @details
#' This function uses DMRcaller to identify regions with significant
#' differences in methylation between two groups. The function:
#' 1. Combines data from multiple samples per group
#' 2. Filters positions by minimum coverage
#' 3. Calculates differential methylation statistics
#' 4. Identifies contiguous regions of differential methylation
#' 5. Filters DMRs by significance and size
calculate_dmrs <- function(
group1_data,
group2_data,
group1_name = "Group1",
group2_name = "Group2",
min_coverage = 5,
p_value_threshold = 0.01,
min_cpg = 3
) {
cat("\n")
cat("=============================================================================\n")
cat("Calculating DMRs:", group1_name, "vs", group2_name, "\n")
cat("=============================================================================\n")
# Combine data from all samples in each group
# This is a placeholder - actual implementation depends on data structure
cat("Combining samples within groups...\n")
cat(" Group 1 (", group1_name, "):", length(group1_data), "samples\n")
cat(" Group 2 (", group2_name, "):", length(group2_data), "samples\n")
# Filter by minimum coverage
cat("Filtering positions by minimum coverage:", min_coverage, "\n")
# Implementation: filter positions where coverage >= min_coverage in both groups
# Calculate differential methylation
cat("Calculating differential methylation statistics...\n")
cat(" Using statistical test appropriate for methylation data\n")
cat(" P-value threshold:", p_value_threshold, "\n")
# Identify DMRs
cat("Identifying contiguous DMRs...\n")
cat(" Minimum CpG sites per DMR:", min_cpg, "\n")
# Placeholder for actual DMRcaller function call
# Example structure (adjust based on actual DMRcaller API):
#
# dmrs <- DMRcaller::findDMRs(
# group1_coverage_matrix,
# group1_methylation_matrix,
# group2_coverage_matrix,
# group2_methylation_matrix,
# minCoverage = min_coverage,
# pValueThreshold = p_value_threshold,
# minCpg = min_cpg
# )
# For now, return empty GRanges as placeholder
dmrs <- GRanges()
cat("\n")
cat("DMR calling completed.\n")
cat(" Number of DMRs identified:", length(dmrs), "\n")
return(dmrs)
}
# =============================================================================
# SECTION 5: Example Workflow
# =============================================================================
#' Main analysis workflow
#'
#' This function demonstrates the complete workflow from data import to DMR calling
run_dmrcaller_workflow <- function() {
cat("\n")
cat("=============================================================================\n")
cat("ROGEN Methylation Analysis Workflow\n")
cat("=============================================================================\n")
cat("\n")
# STEP 1: Import bedMethyl files
cat("STEP 1: Importing bedMethyl files\n")
cat("-----------------------------------\n")
# Uncomment and modify paths when data is available:
#
# young_samples <- list(
# import_bedmethyl("data/young_sample1.bedMethyl"),
# import_bedmethyl("data/young_sample2.bedMethyl"),
# import_bedmethyl("data/young_sample3.bedMethyl")
# )
#
# old_samples <- list(
# import_bedmethyl("data/old_sample1.bedMethyl"),
# import_bedmethyl("data/old_sample2.bedMethyl"),
# import_bedmethyl("data/old_sample3.bedMethyl")
# )
cat(" [Placeholder: Import bedMethyl files]\n")
cat(" Expected: Multiple bedMethyl files per group\n")
cat("\n")
# STEP 2: Prepare data for DMRcaller
cat("STEP 2: Preparing data for DMRcaller\n")
cat("--------------------------------------\n")
# Uncomment when data is available:
#
# young_prepared <- lapply(seq_along(young_samples), function(i) {
# prepare_dmrcaller_data(young_samples[[i]], paste0("Young_", i))
# })
#
# old_prepared <- lapply(seq_along(old_samples), function(i) {
# prepare_dmrcaller_data(old_samples[[i]], paste0("Old_", i))
# })
cat(" [Placeholder: Prepare coverage and methylation matrices]\n")
cat(" Expected: Matrices with genomic positions as rows, samples as columns\n")
cat("\n")
# STEP 3: Calculate DMRs
cat("STEP 3: Calculating DMRs\n")
cat("--------------------------\n")
# Uncomment when data is available:
#
# dmrs <- calculate_dmrs(
# group1_data = young_prepared,
# group2_data = old_prepared,
# group1_name = "Young",
# group2_name = "Old",
# min_coverage = 5,
# p_value_threshold = 0.01,
# min_cpg = 3
# )
cat(" [Placeholder: Run DMR calling]\n")
cat(" Expected: GRanges object with DMR coordinates and statistics\n")
cat("\n")
# STEP 4: Export results
cat("STEP 4: Exporting results\n")
cat("---------------------------\n")
# Uncomment when DMRs are calculated:
#
# # Export DMRs to BED file
# export(dmrs, "dmrs_young_vs_old.bed", format = "bed")
#
# # Export summary statistics
# dmr_summary <- data.frame(
# chr = seqnames(dmrs),
# start = start(dmrs),
# end = end(dmrs),
# width = width(dmrs),
# n_cpg = mcols(dmrs)$n_cpg, # Number of CpG sites
# p_value = mcols(dmrs)$p_value,
# mean_meth_diff = mcols(dmrs)$mean_meth_diff # Mean methylation difference
# )
# write.csv(dmr_summary, "dmr_summary.csv", row.names = FALSE)
cat(" [Placeholder: Export DMRs to BED and CSV]\n")
cat(" Expected: dmrs_young_vs_old.bed and dmr_summary.csv\n")
cat("\n")
cat("=============================================================================\n")
cat("Workflow completed!\n")
cat("=============================================================================\n")
}
# =============================================================================
# SECTION 6: Additional Helper Functions
# =============================================================================
#' Annotate DMRs with gene information
#'
#' @param dmrs GRanges object with DMRs
#' @param annotation_file Path to annotation file (GTF/GFF)
#' @return GRanges object with added gene annotations
annotate_dmrs_with_genes <- function(dmrs, annotation_file) {
cat("Annotating DMRs with gene information...\n")
# Load gene annotations
# gtf <- import(annotation_file, format = "gtf")
# genes <- gtf[gtf$type == "gene"]
# Find overlaps between DMRs and genes
# overlaps <- findOverlaps(dmrs, genes)
# Add gene information to DMR metadata
# mcols(dmrs)$gene_id <- ...
# mcols(dmrs)$gene_name <- ...
return(dmrs)
}
#' Visualize DMR results
#'
#' @param dmrs GRanges object with DMRs
#' @param output_file Path to output PDF/PNG
visualize_dmrs <- function(dmrs, output_file = "dmr_visualization.pdf") {
cat("Creating DMR visualization...\n")
# Example visualization code (requires ggplot2, Gviz, or similar):
# - Manhattan plot of DMRs
# - Heatmap of methylation levels
# - Distribution of DMR sizes
# - Distribution of methylation differences
cat(" Visualization would be saved to:", output_file, "\n")
}
# =============================================================================
# SECTION 7: Script Execution
# =============================================================================
# Run the workflow when script is executed
if (!interactive()) {
run_dmrcaller_workflow()
} else {
cat("\n")
cat("=============================================================================\n")
cat("DMRcaller Analysis Script Loaded\n")
cat("=============================================================================\n")
cat("\n")
cat("Available functions:\n")
cat(" - import_bedmethyl(): Import bedMethyl files from Modkit\n")
cat(" - prepare_dmrcaller_data(): Prepare data for DMRcaller\n")
cat(" - calculate_dmrs(): Identify DMRs between two groups\n")
cat(" - run_dmrcaller_workflow(): Complete analysis workflow\n")
cat(" - annotate_dmrs_with_genes(): Add gene annotations to DMRs\n")
cat(" - visualize_dmrs(): Create visualizations\n")
cat("\n")
cat("To run the workflow, execute:\n")
cat(" run_dmrcaller_workflow()\n")
cat("\n")
}