GenomicSelection/predict_rrBLUP.R at master · ShiuLab/GenomicSelection · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
#####################################
# Make phenotype predictions using mixed solve from rrBLUP
# Equivalent to using kin.blup on the G-matrix. See:
# https://pdfs.semanticscholar.org/5c52/d445ea806770331b65d1ac65afaa74a619c4.pdf
#
# Arguments: [1]   X_file
#            [2]   Y_file
#            [3]   Features to keep
#            [4]   trait (col_name or all)
#            [5]   Hold out set
#            [6]   save sane (also used as identifier in rrBLUP_RESULTS.csv file)
#            [7]   optional: save directory
#
# Written by: Christina Azodi
# Original: 4.26.17
# Modified:
#####################################
library(rrBLUP)
library(data.table)


# Removes all existing variables from the workspace
rm(list=ls())
args = commandArgs(trailingOnly=TRUE)

start.time <- Sys.time()

# Read in arguments with 5 as the default PCs to include
if (length(args) < 7) {
  stop("Need 7 arguments: X_file Y_file Feats trait CVFs_file cvJobNum save_name save directory", call.=FALSE)
}

X_file <- args[1]
Y_file <- args[2]
feat_file <- args[3]
trait <- args[4]
ho_file <- args[5]
save_name <- args[6]
save_dir <- args[7]

## load the phenotypes and PCs
print('Loading data...')
X <- fread(X_file)
X <- as.data.frame(X)
row.names(X) <- X$ID
X$ID <- NULL

print(X[1:5,1:5])
Y <- fread(Y_file)
Y <- as.data.frame(Y)
row.names(Y) <- Y$ID
Y$ID <- NULL

ho <- scan(ho_file, what='character')
train_instances <- setdiff(row.names(X), ho)

# Make sure Y is in the same order as X:
Y <- Y[match(rownames(X), rownames(Y)),]
feat_method <- 'none'

# Subset X if feat_file is not all
if (feat_file != 'all'){
  print('Pulling features to use...')
  FEAT <- scan(feat_file, what='character')
  X <- X[FEAT]
  feat_method <- tail(unlist(strsplit(feat_file, '/')), n=1)
}

feat_num <- dim(X)[2]
ho_name <- tail(unlist(strsplit(ho_file, '/')), n=1)

if (trait == 'all') {
  print('Modeling all traits')
} else {
  Y <- Y[trait]
}

# Make output directory
setwd(save_dir)


for(i in 1:length(Y)){
  print('Building Model...')

  test_x <- as.matrix(X[ho,])
  test_y <- Y[names(Y)[i]][ho,]
  train_x <- as.matrix(X[train_instances,])
  train_y <- Y[names(Y)[i]][train_instances,]
  mod <- mixed.solve(train_y, Z=train_x, K=NULL, method='ML', SE=F, return.Hinv=F)

  # Predict test set (i.e. holdout)
  print('Predicting test set...')
  coef <- mod$u
  effect_size <- as.matrix(coef)
  yhat <- (test_x %*% effect_size)[,1]
  yhat <- yhat + mod$beta
  yhat_all <- (as.matrix(X) %*% effect_size)[,1]
  yhat_all <- yhat_all + mod$beta

  # Calculate scoring metrics
  print('Calculating scoring metrics...')
  pcc <- cor(yhat, test_y)
  mse <- mean((test_y - yhat)^2)


  # Save predicted Y (i.e. yhat)
  print('Saving output...')
  yhat_out <- paste('rrBLUP', save_name, names(Y)[i], 'yhat.csv', sep='_')
  job_ID <- paste('rrBLUP', save_name, names(Y)[i], ho_name, sep='_')
  yhat_df <- data.frame(yhat=yhat_all)
  names(yhat_df) <- job_ID
  row.names(yhat_df) <- row.names(Y)
  yhat_t <- t(yhat_df)
  write.table(yhat_t, yhat_out, append=T, sep=',', row.names=T, quote=F, col.names=!file.exists(yhat_out))

  # Save coefficients
  coef_out <- paste('rrBLUP', save_name, names(Y)[i], 'coef.csv', sep='_')
  coef_df <- data.frame(u=coef)
  names(coef_df) <- job_ID
  coef_df <- t(coef_df)
  write.table(coef_df, coef_out, sep=',', append=T, row.names=T, quote=F, col.names=!file.exists(coef_out))

  # save  results files
  time.taken <- difftime(Sys.time(), start.time, units='sec')

  res <- data.frame('rrBLUP', X_file, save_name, names(Y)[i], ho_name, feat_method, feat_num, pcc, pcc^2, mse, time.taken)
  colnames(res) <- c('model', 'x_file','tag', 'y', 'holdout_set','feat_method','feat_num', 'PCC', 'r2','mse', 'run_time')
  write.table(res, 'rrBLUP_RESULTS.csv', sep=',', append=T, row.names=F, quote=F, col.names=!file.exists('rrBLUP_RESULTS.csv'))
}

unlink('*.dat')


print('Done!')