-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathDGE_analysis.R
More file actions
79 lines (65 loc) · 3.12 KB
/
DGE_analysis.R
File metadata and controls
79 lines (65 loc) · 3.12 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
# =========================================================================
# Title: Differential Gene Expression (DGE) Analysis Pipeline
# Purpose: Identify significant genes between LS and HS conditions
# Author: Daniil Gerassimov
# 1. Environment Setup ----------------------------------------------------
if (!require("pacman")) install.packages("pacman")
pacman::p_load(tidyverse, readxl, writexl, ggrepel)
# 2. Data Loading & Cleaning ----------------------------------------------
# Load data from Excel (handles both .xlsx and .xls)
raw_data <- read_excel("data/gene_exp1.xlsx")
# Function to fix comma-decimal issues common in localized Excel exports
clean_numeric <- function(x) {
if(is.character(x)) as.numeric(gsub(",", ".", x)) else x
}
# Define sample groups (Adjust names based on your columns)
ls_samples <- c("LS1L1", "LS1L2", "LS1L3")
hs_samples <- c("HS1L1", "HS1L2", "HS1L3")
# Apply cleaning and ensure numeric types
clean_data <- raw_data %>%
mutate(across(all_of(c(ls_samples, hs_samples)), clean_numeric))
# 3. Statistical Analysis -------------------------------------------------
analysis_results <- clean_data %>%
rowwise() %>%
mutate(
# Paired t-test
p_value = t.test(c_across(all_of(ls_samples)),
c_across(all_of(hs_samples)),
paired = TRUE)$p.value,
# Calculate means (using na.rm for safety)
mean_LS = mean(c_across(all_of(ls_samples)), na.rm = TRUE),
mean_HS = mean(c_across(all_of(hs_samples)), na.rm = TRUE),
# Log2 Fold Change calculation with pseudocount to avoid log(0)
LFC = log2((mean_HS + 0.01) / (mean_LS + 0.01))
) %>%
ungroup() %>%
# Categorize genes based on thresholds
mutate(status = case_when(
p_value < 0.05 & LFC > 1 ~ "Upregulated",
p_value < 0.05 & LFC < -1 ~ "Downregulated",
TRUE ~ "Not significant"
))
# 4. Publication-Quality Plotting -----------------------------------------
# Identify top 10 genes for labeling
top_10_genes <- analysis_results %>%
filter(status != "Not significant") %>%
arrange(p_value) %>%
head(10)
# 4.1 Volcano Plot
volcano_plot <- ggplot(analysis_results, aes(x = LFC, y = -log10(p_value), color = status)) +
geom_point(alpha = 0.4, size = 1.8) +
scale_color_manual(values = c("Upregulated" = "#CC0000",
"Downregulated" = "#006600",
"Not significant" = "grey70")) +
geom_text_repel(data = top_10_genes, aes(label = gene), color = "black", size = 3) +
geom_vline(xintercept = c(-1, 1), linetype = "dashed", alpha = 0.5) +
geom_hline(yintercept = -log10(0.05), linetype = "dashed", alpha = 0.5) +
theme_minimal() +
labs(title = "Differential Gene Expression Analysis",
subtitle = "Condition HS vs LS",
x = "Log2 Fold Change", y = "-Log10(p-value)")
# 5. Exporting Results ----------------------------------------------------
dir.create("results", showWarnings = FALSE)
write_xlsx(analysis_results, "results/DGE_Analysis_Report.xlsx")
ggsave("results/volcano_plot.png", plot = volcano_plot, width = 8, height = 6, dpi = 300)
print("Analysis Complete. Results saved in 'results/' folder.")