This repository was archived by the owner on Jul 20, 2021. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathprocess_data.R
More file actions
515 lines (433 loc) · 30.1 KB
/
process_data.R
File metadata and controls
515 lines (433 loc) · 30.1 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
library(rvest)
library(stringr)
library(plyr)
library(dplyr)
# Functions
# Report which of the inputs needed by the parse_* functions exist for one
# game directory.
#
# Args:
#   dir_base: path of a single game directory.
#
# Returns:
#   A one-row data frame holding dir_base plus one logical flag per input:
#   dir_inning (the 'inning' sub-directory), flag_boxscore, flag_players,
#   flag_inning_all and flag_inning_hit.
check_files <- function(dir_base) {
  # Test the exact paths the parsers open. A substring/regex match over a
  # recursive listing would also accept look-alike names (anything containing
  # 'boxscore' + any character + 'xml') or a copy in a nested directory, and
  # the parser would then fail on a path that does not exist.
  path_inning <- file.path(dir_base, 'inning')
  df_flags <- data_frame(
    dir_base = dir_base,
    # file.info() yields NA for a missing path; isTRUE() maps that to FALSE.
    dir_inning = isTRUE(file.info(path_inning)$isdir),
    flag_boxscore = file.exists(file.path(dir_base, 'boxscore.xml')),
    flag_players = file.exists(file.path(dir_base, 'players.xml')),
    flag_inning_all = file.exists(file.path(path_inning, 'inning_all.xml')),
    flag_inning_hit = file.exists(file.path(path_inning, 'inning_hit.xml'))
  )
  return(df_flags)
}
# Build the game-level record from <dir_base>/boxscore.xml.
#
# Args:
#   dir_base: path of a single game directory.
#
# Returns:
#   A one-row data frame with game_pk, game_id, stadium_id, date,
#   team_id_home and team_id_away, each read from an attribute of the
#   <boxscore> node.
parse_game <- function(dir_base) {
  boxscore_path <- paste0(dir_base, '/boxscore.xml')
  boxscore <- xml_node(x = xml(x = boxscore_path), xpath = '//boxscore')

  # Accessor for a single attribute of the <boxscore> node.
  box_attr <- function(attr_name) {
    xml_attr(x = boxscore, name = attr_name)
  }

  data_frame(
    game_pk = box_attr('game_pk'),
    game_id = box_attr('game_id'),
    stadium_id = box_attr('venue_id'),
    date = box_attr('date'),
    team_id_home = box_attr('home_id'),
    team_id_away = box_attr('away_id')
  )
}
# Build the per-team records (home, then away) from <dir_base>/boxscore.xml.
#
# Args:
#   dir_base: path of a single game directory.
#
# Returns:
#   A two-row data frame with game_pk, role ('home' or 'away'), team_id, wins
#   and losses. The values are read from the <boxscore> attributes
#   '<role>_id', '<role>_wins' and '<role>_loss'.
parse_game_teams <- function(dir_base) {
  dir_boxscore <- paste0(dir_base, '/boxscore.xml')
  root <- xml_node(x = xml(x = dir_boxscore), xpath = '//boxscore')
  game_pk <- xml_attr(x = root, name = 'game_pk')

  # One row for the given role; the attribute names share the role prefix.
  team_row <- function(role) {
    data_frame(
      game_pk = game_pk,
      role = role,
      team_id = xml_attr(x = root, name = paste0(role, '_id')),
      wins = xml_attr(x = root, name = paste0(role, '_wins')),
      losses = xml_attr(x = root, name = paste0(role, '_loss'))
    )
  }

  # bind_rows() is what the rest of this file uses; rbind_all() is deprecated.
  game_teams_df <- bind_rows(lapply(X = c('home', 'away'), FUN = team_row))
  return(game_teams_df)
}
# Build the umpire records for one game.
#
# Reads game_pk from <dir_base>/boxscore.xml and the children of the
# //game/umpires node from <dir_base>/players.xml.
#
# Args:
#   dir_base: path of a single game directory.
#
# Returns:
#   A data frame with one row per child of the umpires node: game_pk,
#   umpire_id, position, name_first and name_last.
parse_game_umpires <- function(dir_base) {
  boxscore <- xml_node(x = xml(x = paste0(dir_base, '/boxscore.xml')),
                       xpath = '//boxscore')
  umpires_node <- xml_node(x = xml(x = paste0(dir_base, '/players.xml')),
                           xpath = '//game/umpires')
  umpire_nodes <- xml_children(umpires_node)

  # Character vector holding one attribute across all umpire nodes.
  ump_attr <- function(attr_name) {
    as.character(lapply(X = umpire_nodes, FUN = xml_attr, name = attr_name))
  }

  data_frame(
    game_pk = xml_attr(x = boxscore, name = 'game_pk'),
    umpire_id = ump_attr('id'),
    position = ump_attr('position'),
    name_first = ump_attr('first'),
    name_last = ump_attr('last')
  )
}
# Build the coach records for one game.
#
# Reads game_pk from <dir_base>/boxscore.xml and the <coach> children of every
# <team> node under //game in <dir_base>/players.xml.
#
# Args:
#   dir_base: path of a single game directory.
#
# Returns:
#   A data frame with one row per coach: game_pk, team_type, coach_id,
#   position, name_first, name_last and number. An empty data frame is
#   returned when no team lists any coach.
parse_game_coaches <- function(dir_base) {
  root_boxscore <- xml_node(x = xml(x = paste0(dir_base, '/boxscore.xml')),
                            xpath = '//boxscore')
  game_pk <- xml_attr(x = root_boxscore, name = 'game_pk')

  root_game <- xml_node(x = xml(x = paste0(dir_base, '/players.xml')),
                        xpath = '//game')
  l_game <- xml_children(root_game)
  l_teams <- l_game[names(l_game) == 'team']

  # Character vector holding one attribute across a list of nodes.
  attrs_chr <- function(nodes, attr_name) {
    as.character(lapply(X = nodes, FUN = xml_attr, name = attr_name))
  }

  # Coach rows for one <team> node, or NULL when the team lists no coach.
  # Skipping the empty case avoids combining the scalar game_pk with
  # zero-length attribute vectors, which can fail on a length mismatch
  # (presumably the "missing coaches" bug this function used to carry a TODO
  # for).
  team_coaches <- function(team) {
    l_children <- xml_children(team)
    l_coaches <- l_children[names(l_children) == 'coach']
    if (length(l_coaches) == 0) {
      return(NULL)
    }
    data_frame(
      game_pk = game_pk,
      team_type = xml_attr(x = team, name = 'type'),
      coach_id = attrs_chr(l_coaches, 'id'),
      position = attrs_chr(l_coaches, 'position'),
      name_first = attrs_chr(l_coaches, 'first'),
      name_last = attrs_chr(l_coaches, 'last'),
      number = attrs_chr(l_coaches, 'num')
    )
  }

  # Iterate over however many <team> nodes are present instead of assuming
  # exactly two.
  l_team_dfs <- Filter(Negate(is.null), lapply(X = l_teams, FUN = team_coaches))
  if (length(l_team_dfs) == 0) {
    return(data_frame())
  }
  df_coaches <- bind_rows(l_team_dfs)
  return(df_coaches)
}
# Build the player records for one game.
#
# Reads game_pk from <dir_base>/boxscore.xml and the <player> children of
# every <team> node under //game in <dir_base>/players.xml.
#
# Args:
#   dir_base: path of a single game directory.
#
# Returns:
#   A data frame with one row per player: game_pk, team_id, player_id,
#   name_first, name_last, number, position, game_position, rl, bat_order,
#   bats, status, batting_avg, homeruns, rbi, p_wins, p_losses and p_era.
#   An empty data frame is returned when no team lists any player.
parse_game_players <- function(dir_base) {
  root_boxscore <- xml_node(x = xml(x = paste0(dir_base, '/boxscore.xml')),
                            xpath = '//boxscore')
  game_pk <- xml_attr(x = root_boxscore, name = 'game_pk')

  root_game <- xml_node(x = xml(x = paste0(dir_base, '/players.xml')),
                        xpath = '//game')
  l_game <- xml_children(root_game)
  l_teams <- l_game[names(l_game) == 'team']

  # Character vector holding one attribute across a list of nodes.
  attrs_chr <- function(nodes, attr_name) {
    as.character(lapply(X = nodes, FUN = xml_attr, name = attr_name))
  }

  # Player rows for one <team> node, or NULL when the team lists no player.
  # The empty case is skipped so the scalar game_pk is never combined with
  # zero-length attribute vectors.
  team_players <- function(team) {
    l_children <- xml_children(team)
    l_players <- l_children[names(l_children) == 'player']
    if (length(l_players) == 0) {
      return(NULL)
    }
    data_frame(
      game_pk = game_pk,
      team_id = attrs_chr(l_players, 'team_id'),
      player_id = attrs_chr(l_players, 'id'),
      name_first = attrs_chr(l_players, 'first'),
      name_last = attrs_chr(l_players, 'last'),
      number = attrs_chr(l_players, 'num'),
      position = attrs_chr(l_players, 'position'),
      game_position = attrs_chr(l_players, 'game_position'),
      rl = attrs_chr(l_players, 'rl'),
      bat_order = attrs_chr(l_players, 'bat_order'),
      bats = attrs_chr(l_players, 'bats'),
      status = attrs_chr(l_players, 'status'),
      batting_avg = attrs_chr(l_players, 'avg'),
      homeruns = attrs_chr(l_players, 'hr'),
      rbi = attrs_chr(l_players, 'rbi'),
      p_wins = attrs_chr(l_players, 'wins'),
      p_losses = attrs_chr(l_players, 'losses'),
      p_era = attrs_chr(l_players, 'era')
    )
  }

  # Iterate over however many <team> nodes are present instead of assuming
  # exactly two.
  l_team_dfs <- Filter(Negate(is.null), lapply(X = l_teams, FUN = team_players))
  if (length(l_team_dfs) == 0) {
    return(data_frame())
  }
  df_players <- bind_rows(l_team_dfs)
  return(df_players)
}
# Build the at-bat records for one game.
#
# Reads game_pk from <dir_base>/boxscore.xml and walks
# <dir_base>/inning/inning_all.xml: every child of //game is treated as an
# inning, every child of an inning as an inning part, and every <atbat> child
# of an inning part becomes one row.
#
# Args:
#   dir_base: path of a single game directory.
#
# Returns:
#   A data frame with one row per <atbat> node: game_pk, inning (the inning
#   node's 'num' attribute), inning_part (the tag of the inning-part node) and
#   the at-bat attributes. An empty data frame is returned when the file
#   holds no <atbat> node.
parse_atbats <- function(dir_base) {
  root_boxscore <- xml_node(x = xml(x = paste0(dir_base, '/boxscore.xml')),
                            xpath = '//boxscore')
  game_pk <- xml_attr(x = root_boxscore, name = 'game_pk')

  root_game <- xml_node(x = xml(x = paste0(dir_base, '/inning/inning_all.xml')),
                        xpath = '//game')

  # Character vector holding one attribute across a list of nodes.
  attrs_chr <- function(nodes, attr_name) {
    as.character(lapply(X = nodes, FUN = xml_attr, name = attr_name))
  }

  # Collect one data frame per inning part and bind once at the end, rather
  # than re-copying a growing data frame on every iteration.
  l_pieces <- list()
  for (inning in xml_children(root_game)) {
    i_num <- xml_attr(x = inning, name = 'num')
    for (inning_part in xml_children(inning)) {
      l_children <- xml_children(inning_part)
      l_atbats <- l_children[names(l_children) == 'atbat']
      if (length(l_atbats) == 0) {
        next
      }
      l_pieces[[length(l_pieces) + 1]] <- data_frame(
        game_pk = game_pk,
        inning = i_num,
        inning_part = xml_tag(inning_part),
        batter_id = attrs_chr(l_atbats, 'batter'),
        pitcher_id = attrs_chr(l_atbats, 'pitcher'),
        start_tfs = attrs_chr(l_atbats, 'start_tfs'),
        start_tfs_zulu = attrs_chr(l_atbats, 'start_tfs_zulu'),
        atbat_num = attrs_chr(l_atbats, 'num'),
        event_num = attrs_chr(l_atbats, 'event_num'),
        pitcher_throws = attrs_chr(l_atbats, 'p_throws'),
        batter_height = attrs_chr(l_atbats, 'b_height'),
        batter_stance = attrs_chr(l_atbats, 'stand'),
        runs_home = attrs_chr(l_atbats, 'home_team_runs'),
        runs_away = attrs_chr(l_atbats, 'away_team_runs'),
        outcome_event = attrs_chr(l_atbats, 'event'),
        outcome_balls = attrs_chr(l_atbats, 'b'),
        outcome_strikes = attrs_chr(l_atbats, 's'),
        outcome_outs = attrs_chr(l_atbats, 'o')
      )
    }
  }

  if (length(l_pieces) == 0) {
    return(data_frame())
  }
  atbat_df <- bind_rows(l_pieces)
  return(atbat_df)
}
# Build the action records for one game.
#
# Reads game_pk from <dir_base>/boxscore.xml and walks
# <dir_base>/inning/inning_all.xml: every child of //game is treated as an
# inning, every child of an inning as an inning part, and every <action>
# child of an inning part becomes one row.
#
# Args:
#   dir_base: path of a single game directory.
#
# Returns:
#   A data frame with one row per <action> node: game_pk, inning (the inning
#   node's 'num' attribute), inning_part (the tag of the inning-part node) and
#   the action attributes. An empty data frame is returned when the file
#   holds no <action> node.
parse_game_actions <- function(dir_base) {
  root_boxscore <- xml_node(x = xml(x = paste0(dir_base, '/boxscore.xml')),
                            xpath = '//boxscore')
  # Loop-invariant, so read once rather than per inning part.
  game_pk <- xml_attr(x = root_boxscore, name = 'game_pk')

  root_game <- xml_node(x = xml(x = paste0(dir_base, '/inning/inning_all.xml')),
                        xpath = '//game')

  # Character vector holding one attribute across a list of nodes.
  attrs_chr <- function(nodes, attr_name) {
    as.character(lapply(X = nodes, FUN = xml_attr, name = attr_name))
  }

  # Collect one data frame per inning part and bind once at the end, rather
  # than re-copying a growing data frame on every iteration.
  l_pieces <- list()
  for (inning in xml_children(root_game)) {
    i_num <- xml_attr(x = inning, name = 'num')
    for (inning_part in xml_children(inning)) {
      l_children <- xml_children(inning_part)
      l_actions <- l_children[names(l_children) == 'action']
      if (length(l_actions) == 0) {
        next
      }
      l_pieces[[length(l_pieces) + 1]] <- data_frame(
        game_pk = game_pk,
        inning = i_num,
        inning_part = xml_tag(inning_part),
        event = attrs_chr(l_actions, 'event'),
        event_num = attrs_chr(l_actions, 'event_num'),
        tfs = attrs_chr(l_actions, 'tfs'),
        tfs_zulu = attrs_chr(l_actions, 'tfs_zulu'),
        player_id = attrs_chr(l_actions, 'player'),
        pitch_num = attrs_chr(l_actions, 'pitch'),
        runs_home = attrs_chr(l_actions, 'home_team_runs'),
        runs_away = attrs_chr(l_actions, 'away_team_runs'),
        occ_balls = attrs_chr(l_actions, 'b'),
        occ_strikes = attrs_chr(l_actions, 's'),
        occ_outs = attrs_chr(l_actions, 'o')
      )
    }
  }

  if (length(l_pieces) == 0) {
    return(data_frame())
  }
  actions_df <- bind_rows(l_pieces)
  return(actions_df)
}
# Build the hit-chart records for one game.
#
# Reads game_pk from <dir_base>/boxscore.xml and the children of the
# //hitchart node from <dir_base>/inning/inning_hit.xml.
#
# Args:
#   dir_base: path of a single game directory.
#
# Returns:
#   A data frame with one row per child of the hitchart node: game_pk, inning,
#   description, pitcher_id, batter_id, team_type, hit_type, hit_x and hit_y.
#   An empty data frame is returned when the hitchart node has no children.
parse_game_hits <- function(dir_base) {
  boxscore <- xml_node(x = xml(x = paste0(dir_base, '/boxscore.xml')),
                       xpath = '//boxscore')
  hitchart <- xml_node(x = xml(x = paste0(dir_base, '/inning/inning_hit.xml')),
                       xpath = '//hitchart')
  hit_nodes <- xml_children(hitchart)

  # Nothing charted: hand back the empty frame straight away.
  if (!(length(hit_nodes) > 0)) {
    return(data_frame())
  }

  # Character vector holding one attribute across all hit nodes.
  hit_attr <- function(attr_name) {
    as.character(lapply(X = hit_nodes, FUN = xml_attr, name = attr_name))
  }

  data_frame(
    game_pk = xml_attr(x = boxscore, name = 'game_pk'),
    inning = hit_attr('inning'),
    description = hit_attr('des'),
    pitcher_id = hit_attr('pitcher'),
    batter_id = hit_attr('batter'),
    team_type = hit_attr('team'),
    hit_type = hit_attr('type'),
    hit_x = hit_attr('x'),
    hit_y = hit_attr('y')
  )
}
# Build the pitch records for one game.
#
# Reads game_pk from <dir_base>/boxscore.xml and walks
# <dir_base>/inning/inning_all.xml: children of //game -> their children ->
# <atbat> nodes -> <pitch> nodes. Each <pitch> becomes one row.
#
# Args:
#   dir_base: path of a single game directory.
#
# Returns:
#   A data frame with one row per <pitch> node. batter_id, pitcher_id,
#   atbat_num and event_num come from the enclosing <atbat>; every other
#   column except game_pk comes from the <pitch> itself (pitch_event_num is
#   the pitch's own 'event_num'). An empty data frame is returned when the
#   file holds no <pitch> node.
parse_pitches <- function(dir_base) {
  root_boxscore <- xml_node(x = xml(x = paste0(dir_base, '/boxscore.xml')),
                            xpath = '//boxscore')
  game_pk <- xml_attr(x = root_boxscore, name = 'game_pk')

  root_game <- xml_node(x = xml(x = paste0(dir_base, '/inning/inning_all.xml')),
                        xpath = '//game')

  # Character vector holding one attribute across a list of nodes.
  attrs_chr <- function(nodes, attr_name) {
    as.character(lapply(X = nodes, FUN = xml_attr, name = attr_name))
  }

  # Collect one data frame per at-bat and bind once at the end, rather than
  # re-copying a growing data frame on every iteration.
  l_pieces <- list()
  for (inning in xml_children(root_game)) {
    for (inning_part in xml_children(inning)) {
      l_children <- xml_children(inning_part)
      l_atbats <- l_children[names(l_children) == 'atbat']
      for (atbat in l_atbats) {
        l_atbat_children <- xml_children(atbat)
        l_pitches <- l_atbat_children[names(l_atbat_children) == 'pitch']
        if (length(l_pitches) == 0) {
          next
        }
        l_pieces[[length(l_pieces) + 1]] <- data_frame(
          game_pk = game_pk,
          play_guid = attrs_chr(l_pitches, 'play_guid'),
          # At-bat level values, recycled across the at-bat's pitches.
          batter_id = xml_attr(x = atbat, name = 'batter'),
          pitcher_id = xml_attr(x = atbat, name = 'pitcher'),
          atbat_num = xml_attr(x = atbat, name = 'num'),
          event_num = xml_attr(x = atbat, name = 'event_num'),
          pitch_des = attrs_chr(l_pitches, 'des'),
          pitch_id = attrs_chr(l_pitches, 'id'),
          type = attrs_chr(l_pitches, 'type'),
          tfs = attrs_chr(l_pitches, 'tfs'),
          tfs_zulu = attrs_chr(l_pitches, 'tfs_zulu'),
          x = attrs_chr(l_pitches, 'x'),
          y = attrs_chr(l_pitches, 'y'),
          pitch_event_num = attrs_chr(l_pitches, 'event_num'),
          sv_id = attrs_chr(l_pitches, 'sv_id'),
          speed_start = attrs_chr(l_pitches, 'start_speed'),
          speed_end = attrs_chr(l_pitches, 'end_speed'),
          sz_top = attrs_chr(l_pitches, 'sz_top'),
          sz_bot = attrs_chr(l_pitches, 'sz_bot'),
          pfx_x = attrs_chr(l_pitches, 'pfx_x'),
          pfx_z = attrs_chr(l_pitches, 'pfx_z'),
          px = attrs_chr(l_pitches, 'px'),
          pz = attrs_chr(l_pitches, 'pz'),
          x0 = attrs_chr(l_pitches, 'x0'),
          y0 = attrs_chr(l_pitches, 'y0'),
          z0 = attrs_chr(l_pitches, 'z0'),
          vx0 = attrs_chr(l_pitches, 'vx0'),
          vy0 = attrs_chr(l_pitches, 'vy0'),
          vz0 = attrs_chr(l_pitches, 'vz0'),
          ax = attrs_chr(l_pitches, 'ax'),
          ay = attrs_chr(l_pitches, 'ay'),
          az = attrs_chr(l_pitches, 'az'),
          break_y = attrs_chr(l_pitches, 'break_y'),
          break_angle = attrs_chr(l_pitches, 'break_angle'),
          break_length = attrs_chr(l_pitches, 'break_length'),
          pitch_type = attrs_chr(l_pitches, 'pitch_type'),
          type_confidence = attrs_chr(l_pitches, 'type_confidence'),
          zone = attrs_chr(l_pitches, 'zone'),
          nasty = attrs_chr(l_pitches, 'nasty'),
          spin_dir = attrs_chr(l_pitches, 'spin_dir'),
          spin_rate = attrs_chr(l_pitches, 'spin_rate')
        )
      }
    }
  }

  if (length(l_pieces) == 0) {
    return(data_frame())
  }
  pitches_df <- bind_rows(l_pieces)
  return(pitches_df)
}
# Build the runner records for one game.
#
# Reads game_pk from <dir_base>/boxscore.xml and walks
# <dir_base>/inning/inning_all.xml: children of //game -> their children ->
# <atbat> nodes -> <runner> nodes. Each <runner> becomes one row.
#
# Args:
#   dir_base: path of a single game directory.
#
# Returns:
#   A data frame with one row per <runner> node. batter_id, pitcher_id,
#   atbat_num and event_num come from the enclosing <atbat>; runner_id,
#   base_start, base_end, event, runner_event_num, flag_score, flag_rbi and
#   flag_earned come from the <runner> itself. An empty data frame is
#   returned when the file holds no <runner> node.
parse_game_runners <- function(dir_base) {
  root_boxscore <- xml_node(x = xml(x = paste0(dir_base, '/boxscore.xml')),
                            xpath = '//boxscore')
  game_pk <- xml_attr(x = root_boxscore, name = 'game_pk')

  root_game <- xml_node(x = xml(x = paste0(dir_base, '/inning/inning_all.xml')),
                        xpath = '//game')

  # Character vector holding one attribute across a list of nodes.
  attrs_chr <- function(nodes, attr_name) {
    as.character(lapply(X = nodes, FUN = xml_attr, name = attr_name))
  }

  # Collect one data frame per at-bat and bind once at the end, rather than
  # re-copying a growing data frame on every iteration.
  l_pieces <- list()
  for (inning in xml_children(root_game)) {
    for (inning_part in xml_children(inning)) {
      l_children <- xml_children(inning_part)
      l_atbats <- l_children[names(l_children) == 'atbat']
      for (atbat in l_atbats) {
        l_atbat_children <- xml_children(atbat)
        l_runners <- l_atbat_children[names(l_atbat_children) == 'runner']
        if (length(l_runners) == 0) {
          next
        }
        l_pieces[[length(l_pieces) + 1]] <- data_frame(
          game_pk = game_pk,
          # At-bat level values, recycled across the at-bat's runners.
          batter_id = xml_attr(x = atbat, name = 'batter'),
          pitcher_id = xml_attr(x = atbat, name = 'pitcher'),
          atbat_num = xml_attr(x = atbat, name = 'num'),
          event_num = xml_attr(x = atbat, name = 'event_num'),
          runner_id = attrs_chr(l_runners, 'id'),
          base_start = attrs_chr(l_runners, 'start'),
          base_end = attrs_chr(l_runners, 'end'),
          event = attrs_chr(l_runners, 'event'),
          # The runner's own event number needs a distinct name: two columns
          # called event_num cannot coexist in one data frame. Named after
          # the pitch_event_num convention used by parse_pitches.
          runner_event_num = attrs_chr(l_runners, 'event_num'),
          flag_score = attrs_chr(l_runners, 'score'),
          flag_rbi = attrs_chr(l_runners, 'rbi'),
          flag_earned = attrs_chr(l_runners, 'earned')
        )
      }
    }
  }

  if (length(l_pieces) == 0) {
    return(data_frame())
  }
  runners_df <- bind_rows(l_pieces)
  return(runners_df)
}
# Get Directory Vectors
dirs <- list.dirs(path = './month_10', recursive = TRUE, full.names = TRUE)
split_dirs <- str_split(string = dirs, pattern = '/')
# Keep only directories whose last path component contains 'gid'.
dirs_gid <- data_frame(dirs = dirs,
                       last_dir = as.character(lapply(X = split_dirs, FUN = tail, n = 1))) %>%
  filter(str_detect(string = last_dir, pattern = 'gid')) %>%
  select(-(last_dir))
# Input-file flags for every game directory, before any filtering.
dirs_checked <- bind_rows(alply(.data = dirs_gid$dirs, .margins = 1, .fun = check_files))
# Game directories lacking at least one required input; kept so skipped games
# can be inspected instead of vanishing silently.
dirs_missing <- dirs_checked %>%
  filter(!(dir_inning & flag_boxscore & flag_players & flag_inning_all & flag_inning_hit))
# Game directories with every required input; these are the ones parsed below.
dirs_gid <- dirs_checked %>%
  filter(dir_inning, flag_boxscore, flag_players, flag_inning_all, flag_inning_hit)
# Make Tables
# Apply one parse_* function to every validated game directory and stack the
# per-game results into a single data frame. bind_rows() is used because it is
# what the parsers in this file use; rbind_all() is deprecated.
build_table <- function(parser) {
  bind_rows(alply(.data = dirs_gid$dir_base, .margins = 1, .fun = parser, .progress = 'text'))
}
df_game <- build_table(parse_game)
df_game_teams <- build_table(parse_game_teams)
df_game_umpires <- build_table(parse_game_umpires)
df_game_coaches <- build_table(parse_game_coaches)
df_game_players <- build_table(parse_game_players)
df_atbats <- build_table(parse_atbats)
df_game_actions <- build_table(parse_game_actions)
df_game_hits <- build_table(parse_game_hits)
df_pitches <- build_table(parse_pitches)
df_game_runners <- build_table(parse_game_runners)
# Write Tables - Year Level
dir_out <- './data_parsed'
# write.table() fails if the target directory is missing; create it up front.
dir.create(path = dir_out, showWarnings = FALSE, recursive = TRUE)
# Write one table as an unquoted, pipe-delimited file named
# <dir_out>/<name>.csv.
write_pipe <- function(x, name) {
  write.table(x = x, file = paste0(dir_out, '/', name, '.csv'),
              sep = '|', row.names = FALSE, quote = FALSE)
}
write_pipe(df_game, 'game')
write_pipe(df_game_teams, 'game_teams')
write_pipe(df_game_umpires, 'game_umpires')
write_pipe(df_game_coaches, 'game_coaches')
write_pipe(df_game_players, 'game_players')
write_pipe(df_game_actions, 'game_actions')
write_pipe(df_game_hits, 'game_hits')
write_pipe(df_game_runners, 'game_runners')
# Write Tables - Month Level
# Both month-level tables carry the same <yyyymm> suffix and live in dir_out,
# because the multi-month combine step reads
# './data_parsed/atbats_<yyyymm>.csv' and './data_parsed/pitches_<yyyymm>.csv'.
# NOTE(review): '201310' is taken from the previous pitches file name; confirm
# it matches the month directory being parsed.
month_tag <- '201310'
write_pipe(df_atbats, paste0('atbats_', month_tag))
write_pipe(df_pitches, paste0('pitches_', month_tag))
# Combine Multi-Months
ym <- c(201402:201410)
# Year part(s) of the months being combined, used to name the output files so
# the name always agrees with ym.
ym_year <- paste(unique(ym %/% 100), collapse = '_')

# Read one pipe-delimited monthly extract './data_parsed/<prefix>_<month>.csv'.
# comment.char = '' keeps a '#' inside a field from truncating the line (the
# files are written unquoted), and stringsAsFactors = FALSE keeps text columns
# as character so months with different value sets bind cleanly.
read_month <- function(month, prefix) {
  i_file <- paste0('./data_parsed/', prefix, '_', as.character(month), '.csv')
  tbl_df(read.table(file = i_file, header = TRUE, sep = '|', quote = '',
                    comment.char = '', stringsAsFactors = FALSE))
}

# Stack the monthly extracts for one table prefix, binding once instead of
# growing a data frame inside a loop.
combine_months <- function(prefix, months) {
  bind_rows(lapply(X = months, FUN = read_month, prefix = prefix))
}

atbats <- combine_months(prefix = 'atbats', months = ym)
write.table(x = atbats, file = paste0('./data_parsed/atbats_', ym_year, '.csv'),
            sep = '|', row.names = FALSE, quote = FALSE)
pitches <- combine_months(prefix = 'pitches', months = ym)
write.table(x = pitches, file = paste0('./data_parsed/pitches_', ym_year, '.csv'),
            sep = '|', row.names = FALSE, quote = FALSE)