Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,15 +1,15 @@
Package: espnscrapeR
Type: Package
Title: Scrapes Or Collects NFL Data From ESPN
Version: 0.8.0
Version: 0.9.0
Author: Thomas Mock
Maintainer: Thomas Mock <j.thomasmock@gmail.com>
Description: Main use case is to collect ESPN QBR for NFL and college football.
Alternative functions include getting NFL standings and scraping NFL season-level stats.
License: MIT + file LICENSE
Encoding: UTF-8
LazyData: true
RoxygenNote: 7.2.1
RoxygenNote: 7.3.2
URL: https://github.com/jthomasmock/espnscrapeR
BugReports: https://github.com/jthomasmock/espnscrapeR/issues
Imports:
Expand Down
1 change: 1 addition & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -53,5 +53,6 @@ importFrom(scales,col_numeric)
importFrom(stringr,str_detect)
importFrom(stringr,str_remove)
importFrom(tibble,enframe)
importFrom(tidyr,pivot_longer)
importFrom(tidyr,separate)
importFrom(xml2,read_html)
154 changes: 96 additions & 58 deletions R/scrape_espn_win_rate.R
Original file line number Diff line number Diff line change
Expand Up @@ -3,20 +3,34 @@
#' @return Returns a tibble
#' @export
#' @import dplyr stringr
#' @importFrom dplyr %>%
#' @importFrom dplyr %>% bind_rows
#' @importFrom readr parse_number
#' @importFrom tidyr separate pivot_longer
#' @importFrom tidyr separate pivot_longer
#' @importFrom xml2 read_html
#' @importFrom purrr map2_dfr
#' @importFrom rvest html_node html_text
#' @importFrom tibble enframe
#' @importFrom stringr str_glue
#' @examples
#' # Get off and def pass/run win rates
#' scrape_espn_win_rate()

scrape_espn_win_rate <- function(season = 2023){
scrape_espn_win_rate <- function(season = 2024){

if(!(as.numeric(season) %in% c(2019:2023))) stop("Data available for 2019-2023")
if(!(as.numeric(season) %in% c(2019:2024))) stop("Data available for 2019-2024")

web_archive_prefix <- "https://web.archive.org/web/{dtstr}/{url}"

# The friday after each week. Used to get an archived page of the prior weeks results
archive_weeks <- list(
lapply((0:17), function(w) as.Date("2019-09-13")+(w*7)), # 2019
lapply((0:17), function(w) as.Date("2020-09-18")+(w*7)), # 2020
lapply((0:18), function(w) as.Date("2021-09-17")+(w*7)), # 2021
lapply((0:18), function(w) as.Date("2022-09-16")+(w*7)), # 2022
lapply((0:18), function(w) as.Date("2023-09-15")+(w*7)), # 2023
lapply((0:18), function(w) as.Date("2024-09-13")+(w*7)) # 2024

)

pbwr_2022 <- "https://www.espn.com/nfl/story/_/id/34536376/2022-nfl-pass-rushing-run-stopping-blocking-leaderboard-win-rate-rankings-top-players-teams"
pbwr_2021 <- "https://www.espn.com/nfl/story/_/id/32176833/2021-nfl-pass-rushing-run-stopping-blocking-leaderboard-win-rate-rankings"
Expand All @@ -35,71 +49,95 @@ scrape_espn_win_rate <- function(season = 2023){
"Pass Block Win Rate"
)

# 2023 specific code:
snapshot_dates <- archive_weeks[[season-2018]]

if(season == 2023){
url_2023 <- "https://www.espn.com/nfl/story/_/id/38356170/2023-nfl-pass-rush-run-stop-blocking-win-rate-rankings-top-players-teams"
# 2023+ specific code:

raw_html <- read_html(url_2023)
if(season >= 2023){

load_from <- switch (as.character(season),
"2023" = "https://www.espn.com/nfl/story/_/id/38356170/2023-nfl-pass-rush-run-stop-blocking-win-rate-rankings-top-players-teams",
"2024" = "https://www.espn.com/nfl/story/_/id/41040723/2024-nfl-win-rates-top-teams-players-rankings"
)

output <- lapply(seq_along(snapshot_dates), function(i) {

week <- snapshot_dates[[i]]
cache_url <- str_glue(web_archive_prefix, dtstr=format(week, "%Y%m%d"), url=load_from)
print(cache_url)
raw_html <- read_html(cache_url)

tab <- raw_html %>%
html_table() %>%
.[[9]] %>%
pivot_longer(cols = -1, names_to = "stat", values_to = "win_rate") %>%
mutate(
stat = case_when(
stat == "PRWR" ~ "Pass Rush Win Rate",
stat == "RSWR" ~ "Run Stop Win Rate",
stat == "PBWR" ~ "Pass Block Win Rate",
stat == "RBWR" ~ "Run Block Win Rate"
)) %>%
mutate(
# extract just the string that is before a '%'
win_rate = str_extract(win_rate, "^[^%]+"),
# convert to a number
win_pct = as.numeric(win_rate)
) %>%
mutate(date_updated = NA, season = season, week = i) %>%
arrange(stat, desc(win_pct)) %>%
group_by(stat) %>%
mutate(stat_rank = row_number()) %>%
ungroup() %>%
select(stat, stat_rank, team = Team, win_pct, date_updated, week, season)

return(tab)
})

return(bind_rows(output))

tab_23 <- raw_html %>%
html_table() %>%
.[[9]] %>%
pivot_longer(cols = -1, names_to = "stat", values_to = "win_rate") %>%
mutate(
stat = case_when(
stat == "PRWR" ~ "Pass Rush Win Rate",
stat == "RSWR" ~ "Run Stop Win Rate",
stat == "PBWR" ~ "Pass Block Win Rate",
stat == "RBWR" ~ "Run Block Win Rate"
)) %>%
mutate(
# extract just the string that is before a '%'
win_rate = str_extract(win_rate, "^[^%]+"),
# convert to a number
win_pct = as.numeric(win_rate)
) %>%
mutate(date_updated = NA, season = 2023) %>%
arrange(stat, desc(win_pct)) %>%
group_by(stat) %>%
mutate(stat_rank = row_number()) %>%
ungroup() %>%
select(stat, stat_rank, team = Team, win_pct, date_updated, season)

return(tab_23)
}

output <- lapply(seq_along(snapshot_dates), function(i) {

raw_html <- read_html(
case_when(
week <- snapshot_dates[[i]]
load_from <- case_when(
season == 2019 ~ pbwr_2019,
season == 2020 ~ pbwr_2020,
season == 2021 ~ pbwr_2021,
season == 2022 ~ pbwr_2022
)
)

cache_url <- str_glue(web_archive_prefix, dtstr=format(week, "%Y%m%d"), url=load_from)
print(cache_url)
raw_html <- read_html(cache_url)

date_updated <- raw_html %>%
html_node("#article-feed > article:nth-child(1) > div > div.article-body > div.article-meta > span > span") %>%
html_text()

raw_text <- raw_html %>%
html_nodes("#article-feed > article:nth-child(1) > div > div.article-body > p") %>%
html_text()

tibble::enframe(raw_text) %>%
filter(str_detect(value, "1. ")) %>%
mutate(name = if_else(season == 2019, list(stat_2019), list(stats_in))[[1]]) %>%
mutate(value = str_split(value, "\n")) %>%
unnest_longer(value) %>%
separate(value, into = c("rank", "team", "win_pct"), sep = "\\. |, ") %>%
mutate(
rank = as.integer(rank),
win_pct = str_remove(win_pct, "%"),
win_pct = as.double(win_pct),
date_updated = date_updated,
season = season,
week = i
) %>%
rename(stat = name, stat_rank = rank)

})

date_updated <- raw_html %>%
html_node("#article-feed > article:nth-child(1) > div > div.article-body > div.article-meta > span > span") %>%
html_text()

raw_text <- raw_html %>%
html_nodes("#article-feed > article:nth-child(1) > div > div.article-body > p") %>%
html_text()

tibble::enframe(raw_text) %>%
filter(str_detect(value, "1. ")) %>%
mutate(name = if_else(season == 2019, list(stat_2019), list(stats_in))[[1]]) %>%
mutate(value = str_split(value, "\n")) %>%
unnest_longer(value) %>%
separate(value, into = c("rank", "team", "win_pct"), sep = "\\. |, ") %>%
mutate(
rank = as.integer(rank),
win_pct = str_remove(win_pct, "%"),
win_pct = as.double(win_pct),
date_updated = date_updated,
season = season
) %>%
rename(stat = name, stat_rank = rank)
return(bind_rows(output))

}
1 change: 1 addition & 0 deletions espnscrapeR.Rproj
Original file line number Diff line number Diff line change
Expand Up @@ -18,3 +18,4 @@ StripTrailingWhitespace: Yes
BuildType: Package
PackageUseDevtools: Yes
PackageInstallArgs: --no-multiarch --with-keep.source
PackageRoxygenize: rd,collate,namespace
2 changes: 1 addition & 1 deletion man/scrape_espn_win_rate.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.