CERIS_PAG/CERIS_EP.r at main · DDLi-ISU/CERIS_PAG · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
### Block 1
# system("git clone https://github.com/amkusmec/CERIS_PAG/")


### Block 2
if (!require(colorspace)) install.packages("colorspace")
if (!require(rrBLUP)) install.packages("rrBLUP")

# Color scales for plotting
col_wdw <- 25;
col_palette <- diverge_hcl(col_wdw + 1, h = c(260, 0), c = 100, l = c(50, 90), power = 1)
gray_alpha <- rgb(128, 128, 128, alpha = 35, maxColorValue = 255)
poly_alpha <- rgb(238, 130, 238, alpha = 55.5, maxColorValue = 255)


### Don't copy this block
### Block 3
# cwd <- '/content/CERIS_PAG/' # This is the location of the files cloned into your Google Drive
cwd <- "~/CERIS_PAG/" # For local testing
r_files <- list.files(paste0(cwd, "R"), "*", full.names = TRUE)
for (f in r_files) source(f)


### Block 4
experiment <- "Sorghum"
trait <- "FTgdd"


### Block 5
exp_dir <- paste(cwd, experiment, '/', sep = '')

# Load the environment metadata file
# Column `PlantingDate` should be formatted "YYYY-MM-DD" for proper parsing
env_meta_file <- paste(exp_dir, 'Env_meta_table.txt', sep = '')
env_meta_info_0 <- read.table(env_meta_file, header = T, sep = "\t", stringsAsFactors = F)

# Load the phenotypic data
exp_traits_file <- paste(exp_dir, 'Traits_record.txt', sep = '')
exp_traits <- read.table(exp_traits_file, sep = "\t", header = T, stringsAsFactors = F, na.string = 'NA')


### Block 6
# Load the daily environmental data
all_env_codes <- unique(exp_traits$env_code)
env_cols <- rainbow_hcl(length(all_env_codes), c = 80, l = 60, start = 0, end = 300, fixup = TRUE, alpha = 0.75)

envParas_file <- paste0(exp_dir, length(all_env_codes), 'Envs_envParas.txt')
if ( !file.exists(envParas_file) ) {
        envParas <- Compile_Envirome_Matrix(exp_dir, all_env_codes)
} else {
        envParas <- read.table(envParas_file, sep = "\t", header = T, stringsAsFactors = F, na.string = "NA")
        if (!("DAP" %in% names(envParas))) {
                params <- setdiff(names(envParas), c("env_code", "date"))
                envParas <- split(envParas, envParas$env_code)
                envParas <- lapply(envParas, function(df) {
                        df$DAP <- 0:(nrow(df) - 1)
                        df[, c("env_code", "DAP", params)]
                })
                envParas <- do.call("rbind", envParas)
        }
}

# Names of the environmental parameters
Paras <- names(envParas)[-(1:2)]


### Block 7
# Standardize the format of the trait data
lInd <- which(colnames(exp_traits) == 'line_code')
eInd <- which(colnames(exp_traits) == 'env_code')
tInd <- which(colnames(exp_traits) == trait)

exp_trait <- exp_traits[, c(lInd, eInd, tInd)]
colnames(exp_trait)[3] <- 'Yobs'

# Average across replicates within environments to reduce to one observation
# per line per environment
exp_trait <- aggregate(Yobs ~ line_code + env_code, exp_trait, mean, na.rm = TRUE)
exp_trait <- exp_trait[!is.na(exp_trait$Yobs), ]

line_codes <- unique(exp_trait$line_code)

# Calculate the environmental mean phenotype
env_mean_trait_0 <- aggregate(Yobs ~ env_code, exp_trait, mean, na.rm = TRUE)
colnames(env_mean_trait_0)[2] <- 'meanY'
env_mean_trait <- merge(env_mean_trait_0, env_meta_info_0)
env_mean_trait <- env_mean_trait[order(env_mean_trait$meanY), ]

# Reformat the phenotypic data
env_codes <- env_mean_trait$env_code
line_by_env_df <- data.frame(line_code = line_codes)
for (e_i in 1:nrow(env_mean_trait)) {
        e <- env_codes[e_i]
        e_trait <- subset(exp_trait, exp_trait$env_code == e)
        nonNAs <- length(which(!is.na(e_trait[, 3])))
        colnames(e_trait)[3] <- e
        line_by_env_df <- merge(line_by_env_df, e_trait[, c(1, 3)], all.x = T)
}


### Block 8
# Plot two different orderings of the environments
layout(matrix(1:2, ncol = 2))

# First, ordered by latitude, longitude, and planting date
plot_geoOrder(env_mean_trait, env_meta_info_0, line_by_env_df, trait)
mtext('A', side = 3, at = 1, cex = 1.2)

# Second, ordered by the environmental mean phenotype
plot_envMeans(env_mean_trait, line_by_env_df, trait)
mtext('B', side = 3, at = min(env_mean_trait$meanY), cex = 1.2)

layout(matrix(1))


### Block 9
# JRA and plots
jra_res <- JRA_Model(line_by_env_df, env_mean_trait)
plot_JRAResults(env_mean_trait, line_by_env_df, jra_res, trait)


### Block 10
ceris_res <- CERIS(env_mean_trait, envParas, Paras)
plot_CERIS(ceris_res, Paras, max(envParas$DAP))


### Block 11
# Identify the best window and parameter
idxR <- match(paste0("R_", Paras), colnames(ceris_res))
maxR <- arrayInd(which.max(abs(ceris_res[, idxR])), .dim = dim(ceris_res[, idxR]))

kPara_Name <- Paras[maxR[1, 2]]
maxR_dap1 <- ceris_res[maxR[1, 1], 2]
maxR_dap2 <- ceris_res[maxR[1, 1], 3]

# Compile average covariate values for the best window and parameter
env_mean_trait$kPara <- as.numeric(NA)
for (e_i in 1:nrow(env_mean_trait)) {
  e <- env_mean_trait$env_code[e_i]
  env_mean_trait$kPara[e_i] <- mean(subset(envParas, env_code == e)
                                    [maxR_dap1:maxR_dap2, kPara_Name])
}

# Plot the best parameter vs. phenotype means
plot_traitMean_kPara(env_mean_trait, trait, kPara_Name, maxR_dap1, maxR_dap2)

# Calculate slopes and intercepts using the best window and parameter
res_para <- slopeIntercept(exp_trait, env_mean_trait)

# Plot the reaction norms
plot_slopeIntercept(merge(exp_trait, env_mean_trait, by = "env_code"),
                    res_para, trait, kPara_Name)


### Block 12
# JGRA
# Cross-validation parameters:
#  gFold      = number of cross-validation folds
#  gIteration = number of times to perform cross-validation
gFold <- 5
gIteration <- 1

SNPs_file <- paste0(exp_dir, "Genotype.txt")
if (file.exists(SNPs_file)) {
  SNPs <- read.table(SNPs_file, header = TRUE, sep = "\t")

  # Check the orientation of the SNP table
  if (!any(line_codes %in% SNPs[[1]])) {
    snp_codes <- SNPs[[1]]
    line_codes2 <- names(SNPs)[-1]
    SNPs <- t(as.matrix(SNPs[, -1]))
    dimnames(SNPs) <- list(line_codes2, snp_codes)
    rm(line_codes2, snp_codes); gc()
  } else {
    line_codes2 <- SNPs[[1]]
    SNPs <- as.matrix(SNPs[, -1])
    rownames(SNPs) <- line_codes2
    rm(line_codes2); gc()
  }

  res_1to2 <- oneTo2CV(env_mean_trait, exp_trait)
  res_1to3 <- oneTo3CV(gFold, gIteration, SNPs, res_para, env_mean_trait, exp_trait)
  res_1to4 <- oneTo4CV(gFold, gIteration, SNPs, env_mean_trait, exp_trait)

  plotCVResults(list(res_1to2, res_1to3, res_1to4), all_env_codes)
} else {
  cat("Genomic prediction requires SNPs. Please choose a different dataset.")
}


### Block 13
# Enviromic prediction (USDA wheat only)

eFold <- 10;
eIteration <- 2;
Enviromic_Prediction(eFold, eIteration, envParas, env_mean_trait_0, Paras, trait)