-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathanalyse.R
More file actions
172 lines (146 loc) · 7.31 KB
/
analyse.R
File metadata and controls
172 lines (146 loc) · 7.31 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
library(ggplot2)
library(cvTools)
data_summary <- function(data){
player_count <- data %>%
group_by(playerID) %>%
summarize() %>%
nrow()
man_years <- data %>%
group_by(playerID, yearID) %>%
summarize() %>%
nrow()
award_count <- data %>%
group_by(playerID, yearID, awardID) %>%
summarize() %>%
filter(!is.na(awardID)) %>%
nrow()
award_winners <- data %>%
group_by(awardID) %>%
summarize(players = n_distinct(playerID))
players_by_pos <- data %>%
group_by(f_POS) %>%
summarize(players = n_distinct(playerID))
by_year <- data %>%
group_by(yearID) %>%
summarize(players = n_distinct(playerID),
teams = n_distinct(teamID))
save(data,
player_count,
man_years,
award_count,
award_winners,
players_by_pos,
by_year,
file = "data_summary.Rdata")
}
plot_all_box <- function(split_col, data){
par(mfrow=c(2,2))
for(col in 1:ncol(data)){
if(col != split_col){
temp <- select(data, split_col, col)
title <- paste(colnames(temp), collapse = " vs ")
xlab <- colnames(temp)[1]
ylab <- colnames(temp)[2]
colnames(temp) <- c("x", "y")
boxplot(y ~ x, data = temp,
main = title,
xlab = xlab,
ylab = ylab)
}
}
par(mfrow=c(2,2))
}
find_cut_off_logistic <- function(model, cut_offs, test_data, observed_col){
results_at_cut_off <- lapply(cut_offs, function(cut_off){
predicted <- predict(model, newdata = test_data, type = "response")
results <- data.frame(
true_positive = sum((predicted >= cut_off) & test_data[,observed_col]),
true_negative = sum(!(predicted >= cut_off) & !test_data[,observed_col]),
false_positive = sum((predicted >= cut_off) & !test_data[,observed_col]),
false_negative = sum(!(predicted >= cut_off) & test_data[,observed_col]))
return(results)
})
results_at_cut_off <- do.call("rbind", results_at_cut_off)
results_at_cut_off <- cbind(cut_offs, results_at_cut_off)
results_at_cut_off <- results_at_cut_off %>%
mutate(recall = true_positive / (true_positive + false_negative),
accuracy = (true_positive + true_negative) / (true_positive + true_negative + false_positive + false_negative),
precision = true_positive / (true_positive + false_positive),
specificity = true_negative / (true_negative + false_positive),
f_measure = (2 * precision * recall) / (recall + precision))
return(results_at_cut_off)
}
model_silver_slug <- function(data){
batting_cols <- which(grepl("b_", colnames(data)))
batting_data <- data %>%
group_by(playerID, yearID) %>%
mutate(row_num = row_number(-win_silver_slug)) %>%
filter(row_num == 1) %>%
ungroup() %>%
select(salary, batting_cols, win_silver_slug, playerID, yearID) %>%
filter(complete.cases(.))
model <- glm(win_silver_slug ~ .-playerID -yearID, data = batting_data, family = binomial())
model_backwards_selection <- step(model, direction = "backward", trace = F)
model_backwards_linear <- lm(win_silver_slug ~ b_G + b_G_batting + b_AB + b_H + b_HR + b_RBI + b_SB + b_BB + b_SO + b_IBB + b_HBP + b_SH + b_SF + b_home_runs_per_H + b_balls_per_AB + b_HBP_per_AB + b_games_batted_per_all_games, data = batting_data)
model_backwards_semifinal <- glm(win_silver_slug ~ b_H + b_HR + b_SB + b_BB + b_SO + b_IBB + b_HBP + b_SH + b_SF + b_home_runs_per_H + b_balls_per_AB + b_HBP_per_AB + b_games_batted_per_all_games, data = batting_data, family = binomial())
model_backwards_final <- glm(win_silver_slug ~ b_H + b_HR + b_SB + b_SO + b_IBB, data = batting_data, family = binomial())
model_backwards_final_lm <- lm(win_silver_slug ~ b_H + b_HR + b_SB + b_SO + b_IBB, data = batting_data)
k <- 5
r <- 5
folds <- cvFolds(nrow(batting_data), K = k, R = r)
cross_validated_results <- lapply(1:r, function(R){
lapply(1:k, function(K){
print(paste("R=", R, " K=", K, sep = ""))
train <- batting_data[folds$subsets[folds$which != K, R],]
test <- batting_data[folds$subsets[folds$which == K, R],]
model <- glm(win_silver_slug ~ b_H + b_HR + b_SB + b_SO + b_IBB, data = train, family = binomial())
cut_offs <- seq(0, 1, by = 0.01)
results_at_cut_off <- find_cut_off_logistic(model, cut_offs, train, which(colnames(train) == "win_silver_slug")[1])
best_cut_off <- results_at_cut_off %>%
filter(f_measure == max(f_measure, na.rm = T)) %>%
filter(precision == max(precision, na.rm = T))
cut_off <- best_cut_off$cut_off
predicted <- predict(model, newdata = test, type = "response")
results <- data.frame(
true_positive = sum((predicted >= cut_off) & test$win_silver_slug),
true_negative = sum(!(predicted >= cut_off) & !test$win_silver_slug),
false_positive = sum((predicted >= cut_off) & !test$win_silver_slug),
false_negative = sum(!(predicted >= cut_off) & test$win_silver_slug))
return(results)
}) %>% do.call("rbind", .)
}) %>% do.call("rbind", .) %>%
mutate(recall = true_positive / (true_positive + false_negative),
accuracy = (true_positive + true_negative) / (true_positive + true_negative + false_positive + false_negative),
precision = true_positive / (true_positive + false_positive),
specificity = true_negative / (true_negative + false_positive),
f_measure = (2 * precision * recall) / (recall + precision),
prior = (true_positive + false_negative) / (true_positive + true_negative + false_positive + false_negative))
cut_offs <- seq(0, 1, by = 0.01)
results_at_cut_off <- find_cut_off_logistic(model_backwards_final, cut_offs, batting_data, which(colnames(batting_data) == "win_silver_slug")[1])
results_at_cut_off <- results_at_cut_off[complete.cases(results_at_cut_off),]
best_cut_off <- results_at_cut_off %>%
filter(f_measure == max(f_measure, na.rm = T)) %>%
filter(precision == max(precision, na.rm = T))
plot_of_cut_offs <- ggplot(results_at_cut_off, aes(x = cut_offs)) +
geom_line(aes(y = recall, colour = "Recall")) +
geom_line(aes(y = accuracy, colour = "Accuracy")) +
geom_line(aes(y = precision, colour = "Precision")) +
geom_line(aes(y = specificity, colour = "Specificity")) +
geom_line(aes(y = f_measure, colour = "FMeasure")) +
scale_colour_manual("Cut Off Fit Metrics",values=c("Red","Green","Blue", "Orange", "Black")) +
ylab("Metric Values") +
xlab("Cut Offs") +
geom_vline(xintercept = best_cut_off$cut_offs[1]) +
ggtitle("Prediction Fit Metrics by Cut Off")
save(batting_data,
model,
model_backwards_selection,
model_backwards_linear,
model_backwards_final,
model_backwards_semifinal,
model_backwards_final_lm,
cross_validated_results,
best_cut_off,
plot_of_cut_offs,
file = "silver_slug_model.Rdata")
}