diff --git a/DESCRIPTION b/DESCRIPTION index 1ca5378..50f6cac 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Package: espnscrapeR Type: Package Title: Scrapes Or Collects NFL Data From ESPN -Version: 0.8.0 +Version: 0.9.0 Author: Thomas Mock Maintainer: Thomas Mock Description: Main use case is to collect ESPN QBR for NFL and college football. @@ -9,7 +9,7 @@ Description: Main use case is to collect ESPN QBR for NFL and college football. License: MIT + file LICENSE Encoding: UTF-8 LazyData: true -RoxygenNote: 7.2.1 +RoxygenNote: 7.3.2 URL: https://github.com/jthomasmock/espnscrapeR BugReports: https://github.com/jthomasmock/espnscrapeR/issues Imports: diff --git a/NAMESPACE b/NAMESPACE index 77d74a5..3b13652 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -53,5 +53,6 @@ importFrom(scales,col_numeric) importFrom(stringr,str_detect) importFrom(stringr,str_remove) importFrom(tibble,enframe) +importFrom(tidyr,pivot_longer) importFrom(tidyr,separate) importFrom(xml2,read_html) diff --git a/R/scrape_espn_win_rate.R b/R/scrape_espn_win_rate.R index fdf0472..080b9e7 100644 --- a/R/scrape_espn_win_rate.R +++ b/R/scrape_espn_win_rate.R @@ -3,20 +3,34 @@ #' @return Returns a tibble #' @export #' @import dplyr stringr -#' @importFrom dplyr %>% +#' @importFrom dplyr %>% bind_rows #' @importFrom readr parse_number -#' @importFrom tidyr separate pivot_longer +#' @importFrom tidyr separate pivot_longer #' @importFrom xml2 read_html #' @importFrom purrr map2_dfr #' @importFrom rvest html_node html_text #' @importFrom tibble enframe +#' @importFrom stringr str_glue #' @examples #' # Get off and def pass/run win rates #' scrape_espn_win_rate() -scrape_espn_win_rate <- function(season = 2023){ +scrape_espn_win_rate <- function(season = 2024){ - if(!(as.numeric(season) %in% c(2019:2023))) stop("Data available for 2019-2023") + if(!(as.numeric(season) %in% c(2019:2024))) stop("Data available for 2019-2024") + + web_archive_prefix <- "https://web.archive.org/web/{dtstr}/{url}" + + # The friday after each week. Used to get an archived page of the prior weeks results + archive_weeks <- list( + lapply((0:17), function(w) as.Date("2019-09-13")+(w*7)), # 2019 + lapply((0:17), function(w) as.Date("2020-09-18")+(w*7)), # 2020 + lapply((0:18), function(w) as.Date("2021-09-17")+(w*7)), # 2021 + lapply((0:18), function(w) as.Date("2022-09-16")+(w*7)), # 2022 + lapply((0:18), function(w) as.Date("2023-09-15")+(w*7)), # 2023 + lapply((0:18), function(w) as.Date("2024-09-13")+(w*7)) # 2024 + + ) pbwr_2022 <- "https://www.espn.com/nfl/story/_/id/34536376/2022-nfl-pass-rushing-run-stopping-blocking-leaderboard-win-rate-rankings-top-players-teams" pbwr_2021 <- "https://www.espn.com/nfl/story/_/id/32176833/2021-nfl-pass-rushing-run-stopping-blocking-leaderboard-win-rate-rankings" @@ -35,71 +49,95 @@ scrape_espn_win_rate <- function(season = 2023){ "Pass Block Win Rate" ) - # 2023 specific code: + snapshot_dates <- archive_weeks[[season-2018]] - if(season == 2023){ - url_2023 <- "https://www.espn.com/nfl/story/_/id/38356170/2023-nfl-pass-rush-run-stop-blocking-win-rate-rankings-top-players-teams" + # 2023+ specific code: -raw_html <- read_html(url_2023) + if(season >= 2023){ + + load_from <- switch (as.character(season), + "2023" = "https://www.espn.com/nfl/story/_/id/38356170/2023-nfl-pass-rush-run-stop-blocking-win-rate-rankings-top-players-teams", + "2024" = "https://www.espn.com/nfl/story/_/id/41040723/2024-nfl-win-rates-top-teams-players-rankings" + ) + + output <- lapply(seq_along(snapshot_dates), function(i) { + + week <- snapshot_dates[[i]] + cache_url <- str_glue(web_archive_prefix, dtstr=format(week, "%Y%m%d"), url=load_from) + print(cache_url) + raw_html <- read_html(cache_url) + + tab <- raw_html %>% + html_table() %>% + .[[9]] %>% + pivot_longer(cols = -1, names_to = "stat", values_to = "win_rate") %>% + mutate( + stat = case_when( + stat == "PRWR" ~ "Pass Rush Win Rate", + stat == "RSWR" ~ "Run Stop Win Rate", + stat == "PBWR" ~ "Pass Block Win Rate", + stat == "RBWR" ~ "Run Block Win Rate" + )) %>% + mutate( + # extract just the string that is before a '%' + win_rate = str_extract(win_rate, "^[^%]+"), + # convert to a number + win_pct = as.numeric(win_rate) + ) %>% + mutate(date_updated = NA, season = season, week = i) %>% + arrange(stat, desc(win_pct)) %>% + group_by(stat) %>% + mutate(stat_rank = row_number()) %>% + ungroup() %>% + select(stat, stat_rank, team = Team, win_pct, date_updated, week, season) + + return(tab) + }) + + return(bind_rows(output)) -tab_23 <- raw_html %>% - html_table() %>% - .[[9]] %>% - pivot_longer(cols = -1, names_to = "stat", values_to = "win_rate") %>% - mutate( - stat = case_when( - stat == "PRWR" ~ "Pass Rush Win Rate", - stat == "RSWR" ~ "Run Stop Win Rate", - stat == "PBWR" ~ "Pass Block Win Rate", - stat == "RBWR" ~ "Run Block Win Rate" - )) %>% - mutate( - # extract just the string that is before a '%' - win_rate = str_extract(win_rate, "^[^%]+"), - # convert to a number - win_pct = as.numeric(win_rate) - ) %>% - mutate(date_updated = NA, season = 2023) %>% - arrange(stat, desc(win_pct)) %>% - group_by(stat) %>% - mutate(stat_rank = row_number()) %>% - ungroup() %>% - select(stat, stat_rank, team = Team, win_pct, date_updated, season) - - return(tab_23) } + output <- lapply(seq_along(snapshot_dates), function(i) { - raw_html <- read_html( - case_when( + week <- snapshot_dates[[i]] + load_from <- case_when( season == 2019 ~ pbwr_2019, season == 2020 ~ pbwr_2020, season == 2021 ~ pbwr_2021, season == 2022 ~ pbwr_2022 - ) ) + + cache_url <- str_glue(web_archive_prefix, dtstr=format(week, "%Y%m%d"), url=load_from) + print(cache_url) + raw_html <- read_html(cache_url) + + date_updated <- raw_html %>% + html_node("#article-feed > article:nth-child(1) > div > div.article-body > div.article-meta > span > span") %>% + html_text() + + raw_text <- raw_html %>% + html_nodes("#article-feed > article:nth-child(1) > div > div.article-body > p") %>% + html_text() + + tibble::enframe(raw_text) %>% + filter(str_detect(value, "1. ")) %>% + mutate(name = if_else(season == 2019, list(stat_2019), list(stats_in))[[1]]) %>% + mutate(value = str_split(value, "\n")) %>% + unnest_longer(value) %>% + separate(value, into = c("rank", "team", "win_pct"), sep = "\\. |, ") %>% + mutate( + rank = as.integer(rank), + win_pct = str_remove(win_pct, "%"), + win_pct = as.double(win_pct), + date_updated = date_updated, + season = season, + week = i + ) %>% + rename(stat = name, stat_rank = rank) + + }) - date_updated <- raw_html %>% - html_node("#article-feed > article:nth-child(1) > div > div.article-body > div.article-meta > span > span") %>% - html_text() - - raw_text <- raw_html %>% - html_nodes("#article-feed > article:nth-child(1) > div > div.article-body > p") %>% - html_text() - - tibble::enframe(raw_text) %>% - filter(str_detect(value, "1. ")) %>% - mutate(name = if_else(season == 2019, list(stat_2019), list(stats_in))[[1]]) %>% - mutate(value = str_split(value, "\n")) %>% - unnest_longer(value) %>% - separate(value, into = c("rank", "team", "win_pct"), sep = "\\. |, ") %>% - mutate( - rank = as.integer(rank), - win_pct = str_remove(win_pct, "%"), - win_pct = as.double(win_pct), - date_updated = date_updated, - season = season - ) %>% - rename(stat = name, stat_rank = rank) + return(bind_rows(output)) } diff --git a/espnscrapeR.Rproj b/espnscrapeR.Rproj index 497f8bf..270314b 100644 --- a/espnscrapeR.Rproj +++ b/espnscrapeR.Rproj @@ -18,3 +18,4 @@ StripTrailingWhitespace: Yes BuildType: Package PackageUseDevtools: Yes PackageInstallArgs: --no-multiarch --with-keep.source +PackageRoxygenize: rd,collate,namespace diff --git a/man/scrape_espn_win_rate.Rd b/man/scrape_espn_win_rate.Rd index 11b35fc..0163498 100644 --- a/man/scrape_espn_win_rate.Rd +++ b/man/scrape_espn_win_rate.Rd @@ -4,7 +4,7 @@ \alias{scrape_espn_win_rate} \title{Scrape ESPN Pass/Run Block/Rush Win Rates ratings for a specific season from ESPN's site} \usage{ -scrape_espn_win_rate(season = 2022) +scrape_espn_win_rate(season = 2024) } \value{ Returns a tibble