-
Notifications
You must be signed in to change notification settings - Fork 4
Expand file tree
/
Copy pathpredict_rrBLUP.R
More file actions
executable file
·135 lines (108 loc) · 3.94 KB
/
predict_rrBLUP.R
File metadata and controls
executable file
·135 lines (108 loc) · 3.94 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
#####################################
# Make phenotype predictions using mixed solve from rrBLUP
# Equivalent to using kin.blup on the G-matrix. See:
# https://pdfs.semanticscholar.org/5c52/d445ea806770331b65d1ac65afaa74a619c4.pdf
#
# Arguments: [1] X_file
# [2] Y_file
# [3] Features to keep
# [4] trait (col_name or all)
# [5] Hold out set
# [6] save sane (also used as identifier in rrBLUP_RESULTS.csv file)
# [7] optional: save directory
#
# Written by: Christina Azodi
# Original: 4.26.17
# Modified:
#####################################
library(rrBLUP)
library(data.table)
# Removes all existing variables from the workspace
rm(list=ls())
args = commandArgs(trailingOnly=TRUE)
start.time <- Sys.time()
# Read in arguments with 5 as the default PCs to include
if (length(args) < 7) {
stop("Need 7 arguments: X_file Y_file Feats trait CVFs_file cvJobNum save_name save directory", call.=FALSE)
}
X_file <- args[1]
Y_file <- args[2]
feat_file <- args[3]
trait <- args[4]
ho_file <- args[5]
save_name <- args[6]
save_dir <- args[7]
## load the phenotypes and PCs
print('Loading data...')
X <- fread(X_file)
X <- as.data.frame(X)
row.names(X) <- X$ID
X$ID <- NULL
print(X[1:5,1:5])
Y <- fread(Y_file)
Y <- as.data.frame(Y)
row.names(Y) <- Y$ID
Y$ID <- NULL
ho <- scan(ho_file, what='character')
train_instances <- setdiff(row.names(X), ho)
# Make sure Y is in the same order as X:
Y <- Y[match(rownames(X), rownames(Y)),]
feat_method <- 'none'
# Subset X if feat_file is not all
if (feat_file != 'all'){
print('Pulling features to use...')
FEAT <- scan(feat_file, what='character')
X <- X[FEAT]
feat_method <- tail(unlist(strsplit(feat_file, '/')), n=1)
}
feat_num <- dim(X)[2]
ho_name <- tail(unlist(strsplit(ho_file, '/')), n=1)
if (trait == 'all') {
print('Modeling all traits')
} else {
Y <- Y[trait]
}
# Make output directory
setwd(save_dir)
for(i in 1:length(Y)){
print('Building Model...')
test_x <- as.matrix(X[ho,])
test_y <- Y[names(Y)[i]][ho,]
train_x <- as.matrix(X[train_instances,])
train_y <- Y[names(Y)[i]][train_instances,]
mod <- mixed.solve(train_y, Z=train_x, K=NULL, method='ML', SE=F, return.Hinv=F)
# Predict test set (i.e. holdout)
print('Predicting test set...')
coef <- mod$u
effect_size <- as.matrix(coef)
yhat <- (test_x %*% effect_size)[,1]
yhat <- yhat + mod$beta
yhat_all <- (as.matrix(X) %*% effect_size)[,1]
yhat_all <- yhat_all + mod$beta
# Calculate scoring metrics
print('Calculating scoring metrics...')
pcc <- cor(yhat, test_y)
mse <- mean((test_y - yhat)^2)
# Save predicted Y (i.e. yhat)
print('Saving output...')
yhat_out <- paste('rrBLUP', save_name, names(Y)[i], 'yhat.csv', sep='_')
job_ID <- paste('rrBLUP', save_name, names(Y)[i], ho_name, sep='_')
yhat_df <- data.frame(yhat=yhat_all)
names(yhat_df) <- job_ID
row.names(yhat_df) <- row.names(Y)
yhat_t <- t(yhat_df)
write.table(yhat_t, yhat_out, append=T, sep=',', row.names=T, quote=F, col.names=!file.exists(yhat_out))
# Save coefficients
coef_out <- paste('rrBLUP', save_name, names(Y)[i], 'coef.csv', sep='_')
coef_df <- data.frame(u=coef)
names(coef_df) <- job_ID
coef_df <- t(coef_df)
write.table(coef_df, coef_out, sep=',', append=T, row.names=T, quote=F, col.names=!file.exists(coef_out))
# save results files
time.taken <- difftime(Sys.time(), start.time, units='sec')
res <- data.frame('rrBLUP', X_file, save_name, names(Y)[i], ho_name, feat_method, feat_num, pcc, pcc^2, mse, time.taken)
colnames(res) <- c('model', 'x_file','tag', 'y', 'holdout_set','feat_method','feat_num', 'PCC', 'r2','mse', 'run_time')
write.table(res, 'rrBLUP_RESULTS.csv', sep=',', append=T, row.names=F, quote=F, col.names=!file.exists('rrBLUP_RESULTS.csv'))
}
unlink('*.dat')
print('Done!')