From f93fa088aade8059c492f71766117ad72d45200f Mon Sep 17 00:00:00 2001 From: rachel-carroll Date: Sun, 2 Feb 2025 21:09:59 -0500 Subject: [PATCH 1/8] remove functions dependent on wru --- R/performance_analysis.R | 245 ---------------------------- R/surname_utils.R | 298 ----------------------------------- R/wru_predict_race_wrapper.R | 210 ------------------------ 3 files changed, 753 deletions(-) delete mode 100644 R/performance_analysis.R delete mode 100644 R/surname_utils.R delete mode 100644 R/wru_predict_race_wrapper.R diff --git a/R/performance_analysis.R b/R/performance_analysis.R deleted file mode 100644 index ad9503af..00000000 --- a/R/performance_analysis.R +++ /dev/null @@ -1,245 +0,0 @@ -#' Performs a performance analysis using a voter file, census shape, and -#' district shape. -#' -#' @param voter_file A dataframe containing the voter file. -#' @param district_shape The shapefiles for the new districts or precincts to -#' consider. -#' @param census_shape The shapefiles for the Census blocks or tracts for which -#' the voter file will be geocoded against. -#' @param census_data A dataframe containing the Census tracts or blocks in the -#' region for the voter file. -#' @param join_district_shape A logical denoting whether the voter file already -#' has the district identity per voter. If TRUE, then a column names for the -#' district must be provided. If FALSE, then a distrct shape must be provided -#' in order to perform a spatial join. -#' @param join_census_shape A logical denoting whether the voter file already -#' has the Census block, tract, and county information. If TRUE, then column -#' names for these items must be provided. If FALSE, then a Census shape must -#' be provided in order to perform a spatial join. -#' @param state The state in which the functionality analysis is performed, as -#' a two character string. -#' @param voter_id A string denoting the column name for the voter ID. -#' @param surname A string denoting the column name for the surname. -#' @param district A string denoting the column name for the district. -#' @param census_state_col The column in the Census data that indicates state. -#' If the voter file already has Census information, this should denote the -#' column in the voter file containing the state FIPS code. -#' @param census_county_col The column in the Census data that indicates county. -#' If the voter file already has Census information, this should denote the -#' column in the voter file containing the county FIPS code. -#' @param census_tract_col The column in the Census data that indicates tract. -#' If the voter file already has Census information, this should denote the -#' column in the voter file containing the tract FIPS code. -#' @param census_block_col The column in the Census data that indicates block. -#' If the voter file already has Census information, this should denote the -#' column in the voter file containing the block FIPS code. -#' @param crs A string denoting the PROJ4 string for projecting maps. -#' @param coords The columns for the coordinates. -#' @param census_geo The geographic level at which to perform BISG. -#' @param use_surname Whether to use the surname in calculating race -#' probabilities. Passed to WRU. -#' @param surname_only Whether to only use the surname in calculating race -#' probabilities. Passed to WRU. -#' @param surname_year Which Census year to use for surname matching. Passed to -#' WRU. -#' @param use_age Whether to use the age in the BISG calculation. Passed to WRU. -#' @param use_sex Whether to use the sex in the BISG calculation. Passed to WRU. -#' @param normalize If TRUE, normalizes the district percentages. -#' @param verbose If TRUE, will output diagnostic strings. -#' @return The processed voter file and a summary of district turnout across -#' racial groups. -#' -#' @export performance_analysis -#' @importFrom dplyr filter group_by_at inner_join rename select summarise -#' @importFrom tidyselect all_of -performance_analysis <- function(voter_file, - district_shape, - census_shape, - census_data, - join_census_shape = TRUE, - join_district_shape = TRUE, - state = NULL, - voter_id = "voter_id", - surname = "last_name", - district = "district", - census_state_col = "STATEFP10", - census_county_col = "COUNTYFP10", - census_tract_col = "TRACTCE10", - census_block_col = "BLOCKCE10", - crs = NULL, - coords = c("lon", "lat"), - census_geo = "block", - use_surname = TRUE, - surname_only = FALSE, - surname_year = 2010, - use_age = FALSE, - use_sex = FALSE, - normalize = TRUE, - verbose = FALSE) { - if (verbose) { - n_voters <- nrow(voter_file) - message(paste( - "Voter file has", n_voters, "rows.\nBeginning", - "functionality analysis..." - )) - } - - # De-duplicate voter file - voter_file <- dedupe_voter_file( - voter_file = voter_file, - voter_id = voter_id - ) - if (verbose) { - n_voters_new <- nrow(voter_file) - message(paste( - "De-duplicating removed", n_voters - n_voters_new, "voters.", - "Voter file now has", n_voters_new, "rows.\nMerging", - "voter file with census shape files." - )) - n_voters <- n_voters_new - } - - # Merge the voter file to census shape file, then pick out the right columns - if (join_census_shape) { - voter_file_w_census <- merge_voter_file_to_shape( - voter_file = voter_file, - shape_file = census_shape, - crs = crs, - coords = coords, - voter_id = voter_id - ) - # Filter out voters that didn't match on the block - voter_file_w_census <- dplyr::filter( - voter_file_w_census, - !is.na(.data[[census_block_col]]) - ) - - if (verbose) { - n_voters_new <- nrow(voter_file_w_census) - message(paste( - "Matching by Census block removed", n_voters - n_voters_new, - "voters. Voter file now has", n_voters_new, - "rows.\nMerging voter file with district shape files." - )) - n_voters <- n_voters_new - } - } else { - # If we don't need to merge to Census shape file, then rename variable - if (verbose) { - message(paste("Voter file already matched to Census shapefile.")) - } - voter_file_w_census <- voter_file - } - - if (join_district_shape) { - # Merge the voter file with the district shape file - voter_file_w_district <- merge_voter_file_to_shape( - voter_file = voter_file_w_census, - shape_file = district_shape, - crs = crs, - coords = coords, - voter_id = voter_id - ) - # Filter out voters that didn't match on a district - voter_file_w_district <- dplyr::filter( - voter_file_w_district, - !is.na(.data[[district]]) - ) - if (verbose) { - n_voters_new <- nrow(voter_file_w_district) - message(paste( - "Matching by district removed", n_voters - n_voters_new, - "voters. Voter file now has", n_voters_new, - "rows.\nApplying BISG." - )) - n_voters <- n_voters_new - } - } else { - if (verbose) { - # If we don't need to merge to district shape file, then rename variable - message(paste("Voter file already matched to district shape.")) - voter_file_w_district <- voter_file - } - } - - # Select the final set of columns needed for BISG - voter_file_final <- voter_file_w_district %>% - dplyr::rename( - c( - "st" = tidyselect::all_of(census_state_col), - "county" = tidyselect::all_of(census_county_col), - "tract" = tidyselect::all_of(census_tract_col), - "block" = tidyselect::all_of(census_block_col) - ) - ) %>% - dplyr::select( - tidyselect::all_of(c( - voter_id, - surname, - district, - "st", - "county", - "tract", - "block" - )) - ) - - # Apply BISG to the voter file to get race predictions - voter_file_final_w_race <- wru_predict_race_wrapper( - voter_file = as.data.frame(voter_file_final), - census_data = census_data, - voter_id = voter_id, - surname = surname, - state = state, - county = "county", - tract = "tract", - block = "block", - census_geo = census_geo, - use_surname = use_surname, - surname_only = surname_only, - surname_year = surname_year, - use_age = use_age, - use_sex = use_sex, - return_surname_flag = TRUE, - return_geocode_flag = TRUE, - verbose = TRUE - ) - # Add matching flags to voter file - if (verbose) { - n_surname_match <- sum(voter_file_final_w_race$matched_surname) - n_geocode_match <- sum(voter_file_final_w_race$matched_geocode) - message(paste0( - paste("BISG didn't match", n_voters - n_surname_match, "surnames.\n"), - paste("BISG didn't match", n_voters - n_geocode_match, "geocodes.") - )) - } - - # Aggregate percentages across districts - results <- precinct_agg_combine( - voter_file = voter_file_final_w_race, - group_col = district, - race_cols = NULL, - include_total = FALSE - ) - - # If necessary, normalize counts - if (normalize) { - races <- c( - "pred.whi_prop", - "pred.bla_prop", - "pred.his_prop", - "pred.asi_prop", - "pred.oth_prop" - ) - sums <- rowSums(results[, races]) - results[, races] <- results[, races] / sums - } - - if (verbose) { - message("Performance analysis complete.") - } - return(list( - voter_file = voter_file_final_w_race, - results = results - )) -} diff --git a/R/surname_utils.R b/R/surname_utils.R deleted file mode 100644 index b25cdc5d..00000000 --- a/R/surname_utils.R +++ /dev/null @@ -1,298 +0,0 @@ -#' Counts the number of words per row in the column of a dataframe. -#' -#' A "word" is defined as a string of alphabetical characters separated by -#' either spaces or dashes (but not other special characters). -#' -#' @param voter_file The voter file, with each row consisting of a voter. -#' @param surname_col A string denoting the surname column. -#' @param regex A string denoting the regular expression to use for querying the -#' the word count. -#' @return A vector of word counts. -#' -#' @export get_word_count -#' @importFrom stringr str_count -get_word_count <- function(voter_file, - surname_col = "last_name", - regex = "[ -]+") { - word_count <- 1 + stringr::str_count(voter_file[[surname_col]], regex) - return(word_count) -} - - -#' Gets special characters in a column of names. -#' -#' Returns a unique list of special characters found in a column of a dataframe. -#' By default, these characters consist of any that are not upper- or lower-case -#' letters. This preference can be overwritten by providing a new regular -#' expression. -#' -#' @param voter_file The voter file, with each row consisting of a voter. -#' @param surname_col A string denoting the surname column. -#' @param regex A string denoting the regular expression to use for identifying -#' non-special characters (by default, alphabetic characters). -#' @return A vector of unique special characters found in the names. -#' -#' @export get_unique_special_characters -#' @importFrom stringr str_c str_replace_all str_split -get_unique_special_characters <- function(voter_file, - surname_col = "last_name", - regex = "[A-Za-z]") { - # Replace all alphabetic characters with empty strings - characters <- stringr::str_replace_all(voter_file[[surname_col]], regex, "") - # Combine all strings together - characters <- stringr::str_c(characters, collapse = "") - # Split up by individual character, taking unique ones - characters <- stringr::str_split(characters, pattern = "")[[1]] - characters <- sort(unique(characters)) - return(characters) -} - - -#' Gets surnames containing special characters. -#' -#' Returns a subsetted voter file whose rows consist of voters that have -#' special characters in their last name. -#' -#' @param voter_file The voter file, with each row consisting of a voter. -#' @param surname_col A string denoting the surname column. -#' @param regex A string denoting the regular expression to use for querying the -#' the special characters. -#' @return A dataframe of voters whose surname has special characters. -#' -#' @export get_special_character_surnames -#' @importFrom stringr str_detect -get_special_character_surnames <- function(voter_file, - surname_col = "last_name", - regex = "[^A-Za-z]") { - special_characters <- stringr::str_detect(voter_file[[surname_col]], regex) - return(voter_file[special_characters, ]) -} - - -#' Strips special characters from a voter file. -#' -#' Given a voter file and a column, returns a voter file with special characters -#' stripped stripped from that column. -#' -#' @param voter_file The voter file, with each row consisting of a voter. -#' @param surname_col A string denoting the surname column. -#' @param regex A string denoting the regular expression to use for denoting the -#' the special characters. -#' @param replace The replacement string for special characters. -#' @return A dataframe of voters whose surname column is stripped of special -#' characters. -#' -#' @export strip_special_characters -#' @importFrom stringr str_replace_all -strip_special_characters <- function(voter_file, - surname_col = "last_name", - regex = "[^A-Za-z]+", - replace = " ") { - # Replace special characters with empty spaces - voter_file[[surname_col]] <- stringr::str_replace_all( - voter_file[[surname_col]], - regex, - replace - ) - return(voter_file) -} - - -#' Gets multi-barreled surnames from a voter file. -#' -#' A multi-barreled surname is one containing a dash or a space. This function -#' finds all multi-barreled surnames in a voter file. -#' -#' @param voter_file The voter file, with each row consisting of a voter. -#' @param surname_col A string denoting the surname column. -#' @param regex A string denoting the regular expression to use for denoting the -#' the special characters. -#' @return A dataframe of voters whose surnames are multi-barreled. -#' -#' @export get_multi_barreled_surnames -#' @importFrom dplyr filter -#' @importFrom stringr str_detect -get_multi_barreled_surnames <- function(voter_file, - surname_col = "last_name", - regex = "[ -]+") { - multi_barreled <- dplyr::filter( - voter_file, - stringr::str_detect(voter_file[[surname_col]], regex) - ) - return(multi_barreled) -} - - -#' Determines which surnames match to the Census list. -#' -#' @param voter_file The voter file, with each row consisting of a voter. -#' @param surname_col A string denoting the surname column. -#' @param strip_special Whether to strip special characters before matching in -#' the surname database. -#' @return A vector of logicals denoting a match or not. -#' -#' @export surname_match -#' @importFrom utils getFromNamespace -surname_match <- function(voter_file, - surname_col = "last_name", - strip_special = FALSE) { - if (strip_special) { - voter_file <- eiCompare::strip_special_characters( - voter_file = voter_file, - surname_col = surname_col, - replace = "" - ) - } - - # Determine if there's a surname match - surname_match <- voter_file[[surname_col]] %in% wru::surnames2010$surname - return(surname_match) -} - - -#' Briefly summarizes the surnames in a voter file. -#' -#' @param voter_file The voter file, with each row consisting of a voter. -#' @param surname_col A string denoting the surname column. -#' @return No return value, called for side effects (message) -#' @export surname_summary -surname_summary <- function(voter_file, surname_col) { - # Print number of voters - n_voters <- nrow(voter_file) - message(paste("Voter file has", n_voters, "voters.")) - - # Print number of voters with no last name - n_nan_surnames <- sum(is.na(voter_file[[surname_col]])) - message(paste("Voter files has", n_nan_surnames, "voter(s) with no surname.")) - - # Print number of voters with special characters - special_character_surnames <- eiCompare::get_special_character_surnames( - voter_file = voter_file, - surname_col = surname_col - ) - n_special_character_surnames <- nrow(special_character_surnames) - message(paste( - "Voter file has", n_special_character_surnames, - "voters containing special characters." - )) - - # Print number of matches by default - n_organic_matches <- sum(eiCompare::surname_match( - voter_file = voter_file, - surname_col = surname_col, - strip_special = FALSE - )) - message(paste( - "Voter file has", n_organic_matches, - "voters with surnames matching the database by default." - )) - - # Print number of matches after stripping special characters - n_stripped_matches <- sum(eiCompare::surname_match( - voter_file = voter_file, - surname_col = surname_col, - strip_special = TRUE - )) - message(paste( - "Voter file has", n_stripped_matches, - "voters with surnames matching the database after removing", - "special characters." - )) -} - - - -#' Predicts, for one row in a voter file, the probability of a voter having a -#' certain race by averaging over each "barrel" of the surname. -#' -#' @param voter_file The voter file, with each row consisting of a voter. -#' @param surname_col A string denoting the surname column. -#' @param surname_only Whether to obtain probabilities for surnames only. -#' @param census_data A data frame containing Census data corresponding to the -#' geographic information for units in the voter file. -#' @param census_geo The census level at which to apply BISG. Passed to WRU. -#' @param surname_year Which Census year to use for surname matching. Passed to -#' WRU. -#' @param use_age Whether to use the age in the BISG calculation. Passed to WRU. -#' @param use_sex Whether to use the sex in the BISG calculation. Passed to WRU. -#' @param state A string denoting the state for which the data is queried. -#' @param county A string denoting the column containing the county FIPS code. -#' @param tract A string denoting the column containing the tract FIPS code. -#' @param block A string denoting the column containing the block FIPS code. -#' @param pattern What pattern to split surnames on. By default, surnames are -#' split on a space(s), which assumes hyphens have already been removed. -#' @param remove_patterns A list of strings which will be removed from the -#' list of barrels. -#' @return A vector of probabilities for each surname. -#' -#' @export predict_race_multi_barreled -#' @importFrom utils getFromNamespace -predict_race_multi_barreled <- function(voter_file, - surname_col = "last_name", - surname_only = TRUE, - census_data = NULL, - census_geo = "block", - surname_year = 2010, - use_age = FALSE, - use_sex = FALSE, - state = NULL, - county = NULL, - tract = NULL, - block = NULL, - pattern = "[ -]+", - remove_patterns = NULL) { - # Split up multi-barreled surnames - surnames <- stringr::str_split( - voter_file[[surname_col]], - pattern = pattern - )[[1]] - # Remove specific barrels - if (!is.null(remove_patterns)) { - surnames <- surnames[!(surnames %in% remove_patterns)] - } - - # Get merge_surnames function out from wru - merge_surnames_copy <- utils::getFromNamespace("merge_surnames", "wru") - - # Use surname only - if (surname_only) { - new_voter_file <- data.frame(surname = surnames) - # Calculate probabilities using surnames only - probabilities <- suppressWarnings( - merge_surnames_copy( - voter.file = new_voter_file, - surname.year = 2010, - clean.surname = FALSE, - impute.missing = TRUE - ) - ) - probabilities <- as.numeric(colMeans(probabilities[, c(-1, -2)])) - } else { - # Transfer geolocations - new_voter_file <- data.frame( - surname = surnames, - state = state, - county = voter_file[[county]], - tract = voter_file[[tract]], - block = voter_file[[block]] - ) - - # Predict race using full BISG - invisible(capture.output( - bisg <- suppressWarnings( - wru::predict_race( - voter.file = new_voter_file, - census.surname = TRUE, - surname.only = FALSE, - surname.year = surname_year, - census.geo = census_geo, - census.data = census_data, - age = use_age, - sex = use_sex, - ) - ) - )) - probabilities <- as.numeric(colMeans(bisg[, c(6:10)])) - } - return(probabilities) -} diff --git a/R/wru_predict_race_wrapper.R b/R/wru_predict_race_wrapper.R deleted file mode 100644 index 65e82337..00000000 --- a/R/wru_predict_race_wrapper.R +++ /dev/null @@ -1,210 +0,0 @@ -#' Prepares a voter file for the WRU predict_race function, and then predicts -#' race. -#' -#' This function assumes that the Census data is provided to the function. It -#' does not provide the capability of downloading the Census data, since this -#' is a time intensive process. -#' -#' @param voter_file The voter file, containing columns with a surname and -#' potentially geographic information. -#' @param census_data A data frame containing Census data corresponding to the -#' geographic information for units in the voter file. -#' @param voter_id A string denoting the column containing voter ID. Default is -#' NULL, if there is no voter ID in the file. In this case, a voter ID will be -#' assigned. -#' @param surname A string denoting the column containing the surname. -#' @param state A string denoting the column containing the state FIPS code. -#' @param county A string denoting the column containing the county FIPS code. -#' @param tract A string denoting the column containing the tract FIPS code. -#' @param block A string denoting the column containing the block FIPS code. -#' @param census_geo The census level at which to apply BISG. Passed to WRU. -#' @param use_surname Whether to use the surname in calculating race -#' probabilities. Passed to WRU. -#' @param surname_only Whether to only use the surname in calculating race -#' probabilities. Passed to WRU. -#' @param surname_year Which Census year to use for surname matching. Passed to -#' WRU. -#' @param use_age Whether to use the age in the BISG calculation. Passed to WRU. -#' @param use_sex Whether to use the sex in the BISG calculation. Passed to WRU. -#' @param return_surname_flag If TRUE, returns a flag indicating whether the -#' surnames matched. -#' @param return_geocode_flag If TRUE, returns a flag indicating whether the -#' first level of geocode matched. -#' @param verbose A flag indicating whether to print out status messages. -#' @return The voter file component extracted from the provided data frame, with -#' additional surname/geocode flags, as well as a data frame race prediction. -#' -#' @references Imai and Khanna (2016) "Improving Ecological Inference by -#' Predicting Individual Ethnicity from Voter Registration Records" -#' -#' @export wru_predict_race_wrapper -#' @import wru -#' @importFrom dplyr relocate -#' @importFrom utils getFromNamespace -wru_predict_race_wrapper <- function(voter_file, - census_data, - voter_id = NULL, - surname = "last_name", - state = NULL, - county = NULL, - tract = NULL, - block = NULL, - census_geo = NULL, - use_surname = TRUE, - surname_only = FALSE, - surname_year = 2010, - use_age = FALSE, - use_sex = FALSE, - return_surname_flag = FALSE, - return_geocode_flag = FALSE, - verbose = FALSE) { - # Tidy up voter file according to WRU's specifications - wru_voter_file <- tidy_voter_file_wru( - voter_file = voter_file, - voter_id = voter_id, - surname = surname, - state = state, - county = county, - tract = tract, - block = block - ) - - # Temporary check to force use_sex and use_age into FALSE - if (use_age) { - warning("age is currently disabled in wru... forcing use_age to be FALSE") - use_age <- FALSE - } - if (use_sex) { - warning("sex is currently disabled in wru... forcing use_sex to be FALSE") - use_sex <- FALSE - } - - # Get merge_surnames function out from wru - merge_surnames_copy <- utils::getFromNamespace("merge_surnames", "wru") - - # If necessary, check which surnames matched - if (return_surname_flag) { - if (verbose) { - message("Matching surnames.") - } - merged_surnames <- suppressWarnings( - merge_surnames_copy( - voter.file = wru_voter_file, - surname.year = surname_year, - clean.surname = TRUE, - impute.missing = TRUE - ) - ) - # Get matched surname flag - voter_file$matched_surname <- !(merged_surnames$surname.match == "") - } - - # Predict race using BISG via WRU - if (verbose) { - message("Performing BISG to obtain race probabilities.") - } - invisible(capture.output( - bisg <- suppressWarnings( - wru::predict_race( - voter.file = wru_voter_file, - census.surname = use_surname, - surname.only = surname_only, - surname.year = surname_year, - census.geo = census_geo, - census.data = census_data, - age = use_age, - sex = use_sex, - ) - ) - )) - # Re-order race predictions to match voter file - bisg <- bisg[match(wru_voter_file$voterid, bisg$voterid), ] - # Find out which geographic units didn't match - no_geocode_match <- is.na(bisg$pred.whi) - - # For voters that didn't match geocode, use the next highest level - if (sum(no_geocode_match) > 1) { - if (verbose) { - message( - "Some voters failed geocode matching. Matching at a higher level." - ) - } - no_match_voters <- wru_voter_file[no_geocode_match, ] - # Re-run BISG using a new geographic unit - if (census_geo == "block") { - new_geo <- "tract" - } else if (census_geo == "tract") { - new_geo <- "county" - } - # Re-run the BISG only on voters that didn't match - invisible(capture.output( - bisg_no_match <- suppressWarnings( - wru::predict_race( - voter.file = no_match_voters, - census.surname = use_surname, - surname.only = surname_only, - surname.year = surname_year, - census.geo = new_geo, - census.data = census_data, - age = use_age, - sex = use_sex, - ) - ) - )) - # Re-order race predictions to match voter file - bisg_no_match <- bisg_no_match[match( - no_match_voters$voterid, - bisg_no_match$voterid - ), ] - # Merge the new probabilities back into the old dataframe - bisg[no_geocode_match, ] <- bisg_no_match - - matched_geocode <- !no_geocode_match - } else { - matched_geocode <- TRUE - } - - # Store geocode match flag - if (return_geocode_flag & !surname_only) { - voter_file$matched_geocode <- matched_geocode - } - - # Final check if there are no matches - no_match_final <- is.na(bisg$pred.whi) - if (sum(no_match_final) > 1) { - if (verbose) { - message(paste( - "Some surnames did not match at higher geographic level.", - "Using surname only for these cases." - )) - } - # Use probabilities from surnames only for those that don't match - invisible(capture.output( - no_match_surnames <- suppressWarnings( - merge_surnames_copy( - voter.file = wru_voter_file[no_match_final, ], - surname.year = surname_year, - clean.surname = TRUE, - impute.missing = TRUE - ) - ) - )) - # Merge back into BISG estimates - p_cols <- c("p_whi", "p_bla", "p_his", "p_asi", "p_oth") - pred_cols <- c("pred.whi", "pred.bla", "pred.his", "pred.asi", "pred.oth") - bisg[no_match_final, ][, pred_cols] <- no_match_surnames[, p_cols] - } - - # Finalize voter file - if (is.null(voter_id)) { - voter_file$voter_id <- bisg$voterid - voter_file <- dplyr::relocate(voter_file, voter_id, .before = 1) - } - pred_cols <- c("pred.whi", "pred.bla", "pred.his", "pred.asi", "pred.oth") - voter_file[, pred_cols] <- bisg[, pred_cols] - - if (verbose) { - message("BISG complete.") - } - return(voter_file) -} From e924a62e1a518f5b75c867357aa695ca4b3bdf14 Mon Sep 17 00:00:00 2001 From: rachel-carroll Date: Sun, 2 Feb 2025 21:10:29 -0500 Subject: [PATCH 2/8] remove wru dependency ref in description and readme --- DESCRIPTION | 2 +- README.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index dd259612..56310f80 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -40,7 +40,7 @@ Authors@R: URL: https://github.com/RPVote/eiCompare Description: Provides a comprehensive suite of tools for estimating the candidate preferences of racial/ethnic voting blocs in elections. Includes functions for predicting voter race/ethnicity and conducting ecological inference. Race/ethnicity prediction builds on race prediction developed by Imai et al. (2016) . Ecological inference methods are based on King (1997) , ; King et. al. (2004) , . License: GPL-3 -Depends: R (>= 3.5.0), eiPack, ei, wru (>= 1.0.0) +Depends: R (>= 3.5.0), eiPack, ei Imports: bayestestR, coda, data.table, doSNOW, dplyr, foreach, ggplot2, graphics, magrittr, mcmcse, methods, overlapping, purrr, rlang, sf, stringr, tidyr,tidyselect diff --git a/README.md b/README.md index 34579148..6353efe3 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,7 @@ [![R build status](https://github.com/RPVote/eiCompare/workflows/R-CMD-check/badge.svg)](https://github.com/RPVote/eiCompare/actions?workflow=R-CMD-check) [![Style status](https://github.com/RPVote/eiCompare/workflows/Styler/badge.svg)](https://github.com/RPVote/eiCompare/actions?workflow=Styler) -`eiCompare` is an R package built to help practitioners and academics quantify racially polarized voting (RPV) with ease and confidence. It builds on top of several existing packages, augmenting their utility for measuring racially polarized voting in elections. Underlying packages include `ei`, `eiPack`, and `wru`. +`eiCompare` is an R package built to help practitioners and academics quantify racially polarized voting (RPV) with ease and confidence. It builds on top of several existing packages, augmenting their utility for measuring racially polarized voting in elections. Underlying packages include `ei` and `eiPack`. `eiCompare` was built with several types of users in mind: From 7573bf68a362c35742926087b34b70d7a716d3aa Mon Sep 17 00:00:00 2001 From: rachel-carroll Date: Sun, 2 Feb 2025 21:12:32 -0500 Subject: [PATCH 3/8] remove test `wru_predict_race_wrapper` --- .../testthat/test_wru_predict_race_wrapper.R | 66 ------------------- 1 file changed, 66 deletions(-) delete mode 100644 tests/testthat/test_wru_predict_race_wrapper.R diff --git a/tests/testthat/test_wru_predict_race_wrapper.R b/tests/testthat/test_wru_predict_race_wrapper.R deleted file mode 100644 index 98137b67..00000000 --- a/tests/testthat/test_wru_predict_race_wrapper.R +++ /dev/null @@ -1,66 +0,0 @@ -context("Testing WRU predict race wrapper function.") - -test_that("WRU wrapper correctly calculates probabilities.", { - # Create voter file - voter_file <- data.frame( - voter_id = c("1", "2"), - surname = c("JOHNSON", "HERNANDEZ"), - precinct = c("23", "34"), - state = "NY", - county = c("087", "087"), - tract = c("010101", "010101"), - block = c("1001", "1016") - ) - - # Load Rockland county Census information - data(rockland_census) - rockland_census$NY$year <- 2010 - # Run predict race wrapper function - bisg <- wru_predict_race_wrapper( - voter_file = voter_file, - census_data = rockland_census, - voter_id = "voter_id", - surname = "surname", - state = "NY", - county = "county", - tract = "tract", - block = "block", - census_geo = "block", - use_surname = TRUE, - surname_only = FALSE, - surname_year = 2010, - use_age = FALSE, - use_sex = FALSE, - return_surname_flag = TRUE, - return_geocode_flag = TRUE, - verbose = FALSE - ) - expect_true(all(!is.na(bisg))) - expect_true(all(bisg$merged_surname)) - expect_true(all(bisg$merged_geocode)) - expect_true("precinct" %in% names(bisg)) - - - # Run predict race wrapper function - bisg <- wru_predict_race_wrapper( - voter_file = voter_file, - census_data = rockland_census, - voter_id = "voter_id", - surname = "surname", - state = "NY", - county = "county", - tract = "tract", - block = "block", - census_geo = "block", - use_surname = TRUE, - surname_only = TRUE, - surname_year = 2010, - use_age = FALSE, - use_sex = FALSE, - return_surname_flag = TRUE, - return_geocode_flag = TRUE, - verbose = FALSE - ) - expect_true(all(!is.na(bisg))) - testthat::expect_false(any(names(bisg) == "merged_geocode")) -}) From 48efd31554f76ce360c820a6633b498c4fb05842 Mon Sep 17 00:00:00 2001 From: rachel-carroll Date: Sun, 2 Feb 2025 21:49:14 -0500 Subject: [PATCH 4/8] Revert "remove functions dependent on wru" This reverts commit f93fa088aade8059c492f71766117ad72d45200f. --- R/performance_analysis.R | 245 ++++++++++++++++++++++++++++ R/surname_utils.R | 298 +++++++++++++++++++++++++++++++++++ R/wru_predict_race_wrapper.R | 210 ++++++++++++++++++++++++ 3 files changed, 753 insertions(+) create mode 100644 R/performance_analysis.R create mode 100644 R/surname_utils.R create mode 100644 R/wru_predict_race_wrapper.R diff --git a/R/performance_analysis.R b/R/performance_analysis.R new file mode 100644 index 00000000..ad9503af --- /dev/null +++ b/R/performance_analysis.R @@ -0,0 +1,245 @@ +#' Performs a performance analysis using a voter file, census shape, and +#' district shape. +#' +#' @param voter_file A dataframe containing the voter file. +#' @param district_shape The shapefiles for the new districts or precincts to +#' consider. +#' @param census_shape The shapefiles for the Census blocks or tracts for which +#' the voter file will be geocoded against. +#' @param census_data A dataframe containing the Census tracts or blocks in the +#' region for the voter file. +#' @param join_district_shape A logical denoting whether the voter file already +#' has the district identity per voter. If TRUE, then a column names for the +#' district must be provided. If FALSE, then a distrct shape must be provided +#' in order to perform a spatial join. +#' @param join_census_shape A logical denoting whether the voter file already +#' has the Census block, tract, and county information. If TRUE, then column +#' names for these items must be provided. If FALSE, then a Census shape must +#' be provided in order to perform a spatial join. +#' @param state The state in which the functionality analysis is performed, as +#' a two character string. +#' @param voter_id A string denoting the column name for the voter ID. +#' @param surname A string denoting the column name for the surname. +#' @param district A string denoting the column name for the district. +#' @param census_state_col The column in the Census data that indicates state. +#' If the voter file already has Census information, this should denote the +#' column in the voter file containing the state FIPS code. +#' @param census_county_col The column in the Census data that indicates county. +#' If the voter file already has Census information, this should denote the +#' column in the voter file containing the county FIPS code. +#' @param census_tract_col The column in the Census data that indicates tract. +#' If the voter file already has Census information, this should denote the +#' column in the voter file containing the tract FIPS code. +#' @param census_block_col The column in the Census data that indicates block. +#' If the voter file already has Census information, this should denote the +#' column in the voter file containing the block FIPS code. +#' @param crs A string denoting the PROJ4 string for projecting maps. +#' @param coords The columns for the coordinates. +#' @param census_geo The geographic level at which to perform BISG. +#' @param use_surname Whether to use the surname in calculating race +#' probabilities. Passed to WRU. +#' @param surname_only Whether to only use the surname in calculating race +#' probabilities. Passed to WRU. +#' @param surname_year Which Census year to use for surname matching. Passed to +#' WRU. +#' @param use_age Whether to use the age in the BISG calculation. Passed to WRU. +#' @param use_sex Whether to use the sex in the BISG calculation. Passed to WRU. +#' @param normalize If TRUE, normalizes the district percentages. +#' @param verbose If TRUE, will output diagnostic strings. +#' @return The processed voter file and a summary of district turnout across +#' racial groups. +#' +#' @export performance_analysis +#' @importFrom dplyr filter group_by_at inner_join rename select summarise +#' @importFrom tidyselect all_of +performance_analysis <- function(voter_file, + district_shape, + census_shape, + census_data, + join_census_shape = TRUE, + join_district_shape = TRUE, + state = NULL, + voter_id = "voter_id", + surname = "last_name", + district = "district", + census_state_col = "STATEFP10", + census_county_col = "COUNTYFP10", + census_tract_col = "TRACTCE10", + census_block_col = "BLOCKCE10", + crs = NULL, + coords = c("lon", "lat"), + census_geo = "block", + use_surname = TRUE, + surname_only = FALSE, + surname_year = 2010, + use_age = FALSE, + use_sex = FALSE, + normalize = TRUE, + verbose = FALSE) { + if (verbose) { + n_voters <- nrow(voter_file) + message(paste( + "Voter file has", n_voters, "rows.\nBeginning", + "functionality analysis..." + )) + } + + # De-duplicate voter file + voter_file <- dedupe_voter_file( + voter_file = voter_file, + voter_id = voter_id + ) + if (verbose) { + n_voters_new <- nrow(voter_file) + message(paste( + "De-duplicating removed", n_voters - n_voters_new, "voters.", + "Voter file now has", n_voters_new, "rows.\nMerging", + "voter file with census shape files." + )) + n_voters <- n_voters_new + } + + # Merge the voter file to census shape file, then pick out the right columns + if (join_census_shape) { + voter_file_w_census <- merge_voter_file_to_shape( + voter_file = voter_file, + shape_file = census_shape, + crs = crs, + coords = coords, + voter_id = voter_id + ) + # Filter out voters that didn't match on the block + voter_file_w_census <- dplyr::filter( + voter_file_w_census, + !is.na(.data[[census_block_col]]) + ) + + if (verbose) { + n_voters_new <- nrow(voter_file_w_census) + message(paste( + "Matching by Census block removed", n_voters - n_voters_new, + "voters. Voter file now has", n_voters_new, + "rows.\nMerging voter file with district shape files." + )) + n_voters <- n_voters_new + } + } else { + # If we don't need to merge to Census shape file, then rename variable + if (verbose) { + message(paste("Voter file already matched to Census shapefile.")) + } + voter_file_w_census <- voter_file + } + + if (join_district_shape) { + # Merge the voter file with the district shape file + voter_file_w_district <- merge_voter_file_to_shape( + voter_file = voter_file_w_census, + shape_file = district_shape, + crs = crs, + coords = coords, + voter_id = voter_id + ) + # Filter out voters that didn't match on a district + voter_file_w_district <- dplyr::filter( + voter_file_w_district, + !is.na(.data[[district]]) + ) + if (verbose) { + n_voters_new <- nrow(voter_file_w_district) + message(paste( + "Matching by district removed", n_voters - n_voters_new, + "voters. Voter file now has", n_voters_new, + "rows.\nApplying BISG." + )) + n_voters <- n_voters_new + } + } else { + if (verbose) { + # If we don't need to merge to district shape file, then rename variable + message(paste("Voter file already matched to district shape.")) + voter_file_w_district <- voter_file + } + } + + # Select the final set of columns needed for BISG + voter_file_final <- voter_file_w_district %>% + dplyr::rename( + c( + "st" = tidyselect::all_of(census_state_col), + "county" = tidyselect::all_of(census_county_col), + "tract" = tidyselect::all_of(census_tract_col), + "block" = tidyselect::all_of(census_block_col) + ) + ) %>% + dplyr::select( + tidyselect::all_of(c( + voter_id, + surname, + district, + "st", + "county", + "tract", + "block" + )) + ) + + # Apply BISG to the voter file to get race predictions + voter_file_final_w_race <- wru_predict_race_wrapper( + voter_file = as.data.frame(voter_file_final), + census_data = census_data, + voter_id = voter_id, + surname = surname, + state = state, + county = "county", + tract = "tract", + block = "block", + census_geo = census_geo, + use_surname = use_surname, + surname_only = surname_only, + surname_year = surname_year, + use_age = use_age, + use_sex = use_sex, + return_surname_flag = TRUE, + return_geocode_flag = TRUE, + verbose = TRUE + ) + # Add matching flags to voter file + if (verbose) { + n_surname_match <- sum(voter_file_final_w_race$matched_surname) + n_geocode_match <- sum(voter_file_final_w_race$matched_geocode) + message(paste0( + paste("BISG didn't match", n_voters - n_surname_match, "surnames.\n"), + paste("BISG didn't match", n_voters - n_geocode_match, "geocodes.") + )) + } + + # Aggregate percentages across districts + results <- precinct_agg_combine( + voter_file = voter_file_final_w_race, + group_col = district, + race_cols = NULL, + include_total = FALSE + ) + + # If necessary, normalize counts + if (normalize) { + races <- c( + "pred.whi_prop", + "pred.bla_prop", + "pred.his_prop", + "pred.asi_prop", + "pred.oth_prop" + ) + sums <- rowSums(results[, races]) + results[, races] <- results[, races] / sums + } + + if (verbose) { + message("Performance analysis complete.") + } + return(list( + voter_file = voter_file_final_w_race, + results = results + )) +} diff --git a/R/surname_utils.R b/R/surname_utils.R new file mode 100644 index 00000000..b25cdc5d --- /dev/null +++ b/R/surname_utils.R @@ -0,0 +1,298 @@ +#' Counts the number of words per row in the column of a dataframe. +#' +#' A "word" is defined as a string of alphabetical characters separated by +#' either spaces or dashes (but not other special characters). +#' +#' @param voter_file The voter file, with each row consisting of a voter. +#' @param surname_col A string denoting the surname column. +#' @param regex A string denoting the regular expression to use for querying the +#' the word count. +#' @return A vector of word counts. +#' +#' @export get_word_count +#' @importFrom stringr str_count +get_word_count <- function(voter_file, + surname_col = "last_name", + regex = "[ -]+") { + word_count <- 1 + stringr::str_count(voter_file[[surname_col]], regex) + return(word_count) +} + + +#' Gets special characters in a column of names. +#' +#' Returns a unique list of special characters found in a column of a dataframe. +#' By default, these characters consist of any that are not upper- or lower-case +#' letters. This preference can be overwritten by providing a new regular +#' expression. +#' +#' @param voter_file The voter file, with each row consisting of a voter. +#' @param surname_col A string denoting the surname column. +#' @param regex A string denoting the regular expression to use for identifying +#' non-special characters (by default, alphabetic characters). +#' @return A vector of unique special characters found in the names. +#' +#' @export get_unique_special_characters +#' @importFrom stringr str_c str_replace_all str_split +get_unique_special_characters <- function(voter_file, + surname_col = "last_name", + regex = "[A-Za-z]") { + # Replace all alphabetic characters with empty strings + characters <- stringr::str_replace_all(voter_file[[surname_col]], regex, "") + # Combine all strings together + characters <- stringr::str_c(characters, collapse = "") + # Split up by individual character, taking unique ones + characters <- stringr::str_split(characters, pattern = "")[[1]] + characters <- sort(unique(characters)) + return(characters) +} + + +#' Gets surnames containing special characters. +#' +#' Returns a subsetted voter file whose rows consist of voters that have +#' special characters in their last name. +#' +#' @param voter_file The voter file, with each row consisting of a voter. +#' @param surname_col A string denoting the surname column. +#' @param regex A string denoting the regular expression to use for querying the +#' the special characters. +#' @return A dataframe of voters whose surname has special characters. +#' +#' @export get_special_character_surnames +#' @importFrom stringr str_detect +get_special_character_surnames <- function(voter_file, + surname_col = "last_name", + regex = "[^A-Za-z]") { + special_characters <- stringr::str_detect(voter_file[[surname_col]], regex) + return(voter_file[special_characters, ]) +} + + +#' Strips special characters from a voter file. +#' +#' Given a voter file and a column, returns a voter file with special characters +#' stripped stripped from that column. +#' +#' @param voter_file The voter file, with each row consisting of a voter. +#' @param surname_col A string denoting the surname column. +#' @param regex A string denoting the regular expression to use for denoting the +#' the special characters. +#' @param replace The replacement string for special characters. +#' @return A dataframe of voters whose surname column is stripped of special +#' characters. +#' +#' @export strip_special_characters +#' @importFrom stringr str_replace_all +strip_special_characters <- function(voter_file, + surname_col = "last_name", + regex = "[^A-Za-z]+", + replace = " ") { + # Replace special characters with empty spaces + voter_file[[surname_col]] <- stringr::str_replace_all( + voter_file[[surname_col]], + regex, + replace + ) + return(voter_file) +} + + +#' Gets multi-barreled surnames from a voter file. +#' +#' A multi-barreled surname is one containing a dash or a space. This function +#' finds all multi-barreled surnames in a voter file. +#' +#' @param voter_file The voter file, with each row consisting of a voter. +#' @param surname_col A string denoting the surname column. +#' @param regex A string denoting the regular expression to use for denoting the +#' the special characters. +#' @return A dataframe of voters whose surnames are multi-barreled. +#' +#' @export get_multi_barreled_surnames +#' @importFrom dplyr filter +#' @importFrom stringr str_detect +get_multi_barreled_surnames <- function(voter_file, + surname_col = "last_name", + regex = "[ -]+") { + multi_barreled <- dplyr::filter( + voter_file, + stringr::str_detect(voter_file[[surname_col]], regex) + ) + return(multi_barreled) +} + + +#' Determines which surnames match to the Census list. +#' +#' @param voter_file The voter file, with each row consisting of a voter. +#' @param surname_col A string denoting the surname column. +#' @param strip_special Whether to strip special characters before matching in +#' the surname database. +#' @return A vector of logicals denoting a match or not. +#' +#' @export surname_match +#' @importFrom utils getFromNamespace +surname_match <- function(voter_file, + surname_col = "last_name", + strip_special = FALSE) { + if (strip_special) { + voter_file <- eiCompare::strip_special_characters( + voter_file = voter_file, + surname_col = surname_col, + replace = "" + ) + } + + # Determine if there's a surname match + surname_match <- voter_file[[surname_col]] %in% wru::surnames2010$surname + return(surname_match) +} + + +#' Briefly summarizes the surnames in a voter file. +#' +#' @param voter_file The voter file, with each row consisting of a voter. +#' @param surname_col A string denoting the surname column. +#' @return No return value, called for side effects (message) +#' @export surname_summary +surname_summary <- function(voter_file, surname_col) { + # Print number of voters + n_voters <- nrow(voter_file) + message(paste("Voter file has", n_voters, "voters.")) + + # Print number of voters with no last name + n_nan_surnames <- sum(is.na(voter_file[[surname_col]])) + message(paste("Voter files has", n_nan_surnames, "voter(s) with no surname.")) + + # Print number of voters with special characters + special_character_surnames <- eiCompare::get_special_character_surnames( + voter_file = voter_file, + surname_col = surname_col + ) + n_special_character_surnames <- nrow(special_character_surnames) + message(paste( + "Voter file has", n_special_character_surnames, + "voters containing special characters." + )) + + # Print number of matches by default + n_organic_matches <- sum(eiCompare::surname_match( + voter_file = voter_file, + surname_col = surname_col, + strip_special = FALSE + )) + message(paste( + "Voter file has", n_organic_matches, + "voters with surnames matching the database by default." + )) + + # Print number of matches after stripping special characters + n_stripped_matches <- sum(eiCompare::surname_match( + voter_file = voter_file, + surname_col = surname_col, + strip_special = TRUE + )) + message(paste( + "Voter file has", n_stripped_matches, + "voters with surnames matching the database after removing", + "special characters." + )) +} + + + +#' Predicts, for one row in a voter file, the probability of a voter having a +#' certain race by averaging over each "barrel" of the surname. +#' +#' @param voter_file The voter file, with each row consisting of a voter. +#' @param surname_col A string denoting the surname column. +#' @param surname_only Whether to obtain probabilities for surnames only. +#' @param census_data A data frame containing Census data corresponding to the +#' geographic information for units in the voter file. +#' @param census_geo The census level at which to apply BISG. Passed to WRU. +#' @param surname_year Which Census year to use for surname matching. Passed to +#' WRU. +#' @param use_age Whether to use the age in the BISG calculation. Passed to WRU. +#' @param use_sex Whether to use the sex in the BISG calculation. Passed to WRU. +#' @param state A string denoting the state for which the data is queried. +#' @param county A string denoting the column containing the county FIPS code. +#' @param tract A string denoting the column containing the tract FIPS code. +#' @param block A string denoting the column containing the block FIPS code. +#' @param pattern What pattern to split surnames on. By default, surnames are +#' split on a space(s), which assumes hyphens have already been removed. +#' @param remove_patterns A list of strings which will be removed from the +#' list of barrels. +#' @return A vector of probabilities for each surname. +#' +#' @export predict_race_multi_barreled +#' @importFrom utils getFromNamespace +predict_race_multi_barreled <- function(voter_file, + surname_col = "last_name", + surname_only = TRUE, + census_data = NULL, + census_geo = "block", + surname_year = 2010, + use_age = FALSE, + use_sex = FALSE, + state = NULL, + county = NULL, + tract = NULL, + block = NULL, + pattern = "[ -]+", + remove_patterns = NULL) { + # Split up multi-barreled surnames + surnames <- stringr::str_split( + voter_file[[surname_col]], + pattern = pattern + )[[1]] + # Remove specific barrels + if (!is.null(remove_patterns)) { + surnames <- surnames[!(surnames %in% remove_patterns)] + } + + # Get merge_surnames function out from wru + merge_surnames_copy <- utils::getFromNamespace("merge_surnames", "wru") + + # Use surname only + if (surname_only) { + new_voter_file <- data.frame(surname = surnames) + # Calculate probabilities using surnames only + probabilities <- suppressWarnings( + merge_surnames_copy( + voter.file = new_voter_file, + surname.year = 2010, + clean.surname = FALSE, + impute.missing = TRUE + ) + ) + probabilities <- as.numeric(colMeans(probabilities[, c(-1, -2)])) + } else { + # Transfer geolocations + new_voter_file <- data.frame( + surname = surnames, + state = state, + county = voter_file[[county]], + tract = voter_file[[tract]], + block = voter_file[[block]] + ) + + # Predict race using full BISG + invisible(capture.output( + bisg <- suppressWarnings( + wru::predict_race( + voter.file = new_voter_file, + census.surname = TRUE, + surname.only = FALSE, + surname.year = surname_year, + census.geo = census_geo, + census.data = census_data, + age = use_age, + sex = use_sex, + ) + ) + )) + probabilities <- as.numeric(colMeans(bisg[, c(6:10)])) + } + return(probabilities) +} diff --git a/R/wru_predict_race_wrapper.R b/R/wru_predict_race_wrapper.R new file mode 100644 index 00000000..65e82337 --- /dev/null +++ b/R/wru_predict_race_wrapper.R @@ -0,0 +1,210 @@ +#' Prepares a voter file for the WRU predict_race function, and then predicts +#' race. +#' +#' This function assumes that the Census data is provided to the function. It +#' does not provide the capability of downloading the Census data, since this +#' is a time intensive process. +#' +#' @param voter_file The voter file, containing columns with a surname and +#' potentially geographic information. +#' @param census_data A data frame containing Census data corresponding to the +#' geographic information for units in the voter file. +#' @param voter_id A string denoting the column containing voter ID. Default is +#' NULL, if there is no voter ID in the file. In this case, a voter ID will be +#' assigned. +#' @param surname A string denoting the column containing the surname. +#' @param state A string denoting the column containing the state FIPS code. +#' @param county A string denoting the column containing the county FIPS code. +#' @param tract A string denoting the column containing the tract FIPS code. +#' @param block A string denoting the column containing the block FIPS code. +#' @param census_geo The census level at which to apply BISG. Passed to WRU. +#' @param use_surname Whether to use the surname in calculating race +#' probabilities. Passed to WRU. +#' @param surname_only Whether to only use the surname in calculating race +#' probabilities. Passed to WRU. +#' @param surname_year Which Census year to use for surname matching. Passed to +#' WRU. +#' @param use_age Whether to use the age in the BISG calculation. Passed to WRU. +#' @param use_sex Whether to use the sex in the BISG calculation. Passed to WRU. +#' @param return_surname_flag If TRUE, returns a flag indicating whether the +#' surnames matched. +#' @param return_geocode_flag If TRUE, returns a flag indicating whether the +#' first level of geocode matched. +#' @param verbose A flag indicating whether to print out status messages. +#' @return The voter file component extracted from the provided data frame, with +#' additional surname/geocode flags, as well as a data frame race prediction. +#' +#' @references Imai and Khanna (2016) "Improving Ecological Inference by +#' Predicting Individual Ethnicity from Voter Registration Records" +#' +#' @export wru_predict_race_wrapper +#' @import wru +#' @importFrom dplyr relocate +#' @importFrom utils getFromNamespace +wru_predict_race_wrapper <- function(voter_file, + census_data, + voter_id = NULL, + surname = "last_name", + state = NULL, + county = NULL, + tract = NULL, + block = NULL, + census_geo = NULL, + use_surname = TRUE, + surname_only = FALSE, + surname_year = 2010, + use_age = FALSE, + use_sex = FALSE, + return_surname_flag = FALSE, + return_geocode_flag = FALSE, + verbose = FALSE) { + # Tidy up voter file according to WRU's specifications + wru_voter_file <- tidy_voter_file_wru( + voter_file = voter_file, + voter_id = voter_id, + surname = surname, + state = state, + county = county, + tract = tract, + block = block + ) + + # Temporary check to force use_sex and use_age into FALSE + if (use_age) { + warning("age is currently disabled in wru... forcing use_age to be FALSE") + use_age <- FALSE + } + if (use_sex) { + warning("sex is currently disabled in wru... forcing use_sex to be FALSE") + use_sex <- FALSE + } + + # Get merge_surnames function out from wru + merge_surnames_copy <- utils::getFromNamespace("merge_surnames", "wru") + + # If necessary, check which surnames matched + if (return_surname_flag) { + if (verbose) { + message("Matching surnames.") + } + merged_surnames <- suppressWarnings( + merge_surnames_copy( + voter.file = wru_voter_file, + surname.year = surname_year, + clean.surname = TRUE, + impute.missing = TRUE + ) + ) + # Get matched surname flag + voter_file$matched_surname <- !(merged_surnames$surname.match == "") + } + + # Predict race using BISG via WRU + if (verbose) { + message("Performing BISG to obtain race probabilities.") + } + invisible(capture.output( + bisg <- suppressWarnings( + wru::predict_race( + voter.file = wru_voter_file, + census.surname = use_surname, + surname.only = surname_only, + surname.year = surname_year, + census.geo = census_geo, + census.data = census_data, + age = use_age, + sex = use_sex, + ) + ) + )) + # Re-order race predictions to match voter file + bisg <- bisg[match(wru_voter_file$voterid, bisg$voterid), ] + # Find out which geographic units didn't match + no_geocode_match <- is.na(bisg$pred.whi) + + # For voters that didn't match geocode, use the next highest level + if (sum(no_geocode_match) > 1) { + if (verbose) { + message( + "Some voters failed geocode matching. Matching at a higher level." + ) + } + no_match_voters <- wru_voter_file[no_geocode_match, ] + # Re-run BISG using a new geographic unit + if (census_geo == "block") { + new_geo <- "tract" + } else if (census_geo == "tract") { + new_geo <- "county" + } + # Re-run the BISG only on voters that didn't match + invisible(capture.output( + bisg_no_match <- suppressWarnings( + wru::predict_race( + voter.file = no_match_voters, + census.surname = use_surname, + surname.only = surname_only, + surname.year = surname_year, + census.geo = new_geo, + census.data = census_data, + age = use_age, + sex = use_sex, + ) + ) + )) + # Re-order race predictions to match voter file + bisg_no_match <- bisg_no_match[match( + no_match_voters$voterid, + bisg_no_match$voterid + ), ] + # Merge the new probabilities back into the old dataframe + bisg[no_geocode_match, ] <- bisg_no_match + + matched_geocode <- !no_geocode_match + } else { + matched_geocode <- TRUE + } + + # Store geocode match flag + if (return_geocode_flag & !surname_only) { + voter_file$matched_geocode <- matched_geocode + } + + # Final check if there are no matches + no_match_final <- is.na(bisg$pred.whi) + if (sum(no_match_final) > 1) { + if (verbose) { + message(paste( + "Some surnames did not match at higher geographic level.", + "Using surname only for these cases." + )) + } + # Use probabilities from surnames only for those that don't match + invisible(capture.output( + no_match_surnames <- suppressWarnings( + merge_surnames_copy( + voter.file = wru_voter_file[no_match_final, ], + surname.year = surname_year, + clean.surname = TRUE, + impute.missing = TRUE + ) + ) + )) + # Merge back into BISG estimates + p_cols <- c("p_whi", "p_bla", "p_his", "p_asi", "p_oth") + pred_cols <- c("pred.whi", "pred.bla", "pred.his", "pred.asi", "pred.oth") + bisg[no_match_final, ][, pred_cols] <- no_match_surnames[, p_cols] + } + + # Finalize voter file + if (is.null(voter_id)) { + voter_file$voter_id <- bisg$voterid + voter_file <- dplyr::relocate(voter_file, voter_id, .before = 1) + } + pred_cols <- c("pred.whi", "pred.bla", "pred.his", "pred.asi", "pred.oth") + voter_file[, pred_cols] <- bisg[, pred_cols] + + if (verbose) { + message("BISG complete.") + } + return(voter_file) +} From 44c5d0974a3e37a33563d2201450412618658cbd Mon Sep 17 00:00:00 2001 From: rachel-carroll Date: Sun, 2 Feb 2025 21:52:11 -0500 Subject: [PATCH 5/8] bring back non wru surname_utils functions --- R/performance_analysis.R | 245 ----------------------------------- R/surname_utils.R | 125 ------------------ R/wru_predict_race_wrapper.R | 210 ------------------------------ 3 files changed, 580 deletions(-) delete mode 100644 R/performance_analysis.R delete mode 100644 R/wru_predict_race_wrapper.R diff --git a/R/performance_analysis.R b/R/performance_analysis.R deleted file mode 100644 index ad9503af..00000000 --- a/R/performance_analysis.R +++ /dev/null @@ -1,245 +0,0 @@ -#' Performs a performance analysis using a voter file, census shape, and -#' district shape. -#' -#' @param voter_file A dataframe containing the voter file. -#' @param district_shape The shapefiles for the new districts or precincts to -#' consider. -#' @param census_shape The shapefiles for the Census blocks or tracts for which -#' the voter file will be geocoded against. -#' @param census_data A dataframe containing the Census tracts or blocks in the -#' region for the voter file. -#' @param join_district_shape A logical denoting whether the voter file already -#' has the district identity per voter. If TRUE, then a column names for the -#' district must be provided. If FALSE, then a distrct shape must be provided -#' in order to perform a spatial join. -#' @param join_census_shape A logical denoting whether the voter file already -#' has the Census block, tract, and county information. If TRUE, then column -#' names for these items must be provided. If FALSE, then a Census shape must -#' be provided in order to perform a spatial join. -#' @param state The state in which the functionality analysis is performed, as -#' a two character string. -#' @param voter_id A string denoting the column name for the voter ID. -#' @param surname A string denoting the column name for the surname. -#' @param district A string denoting the column name for the district. -#' @param census_state_col The column in the Census data that indicates state. -#' If the voter file already has Census information, this should denote the -#' column in the voter file containing the state FIPS code. -#' @param census_county_col The column in the Census data that indicates county. -#' If the voter file already has Census information, this should denote the -#' column in the voter file containing the county FIPS code. -#' @param census_tract_col The column in the Census data that indicates tract. -#' If the voter file already has Census information, this should denote the -#' column in the voter file containing the tract FIPS code. -#' @param census_block_col The column in the Census data that indicates block. -#' If the voter file already has Census information, this should denote the -#' column in the voter file containing the block FIPS code. -#' @param crs A string denoting the PROJ4 string for projecting maps. -#' @param coords The columns for the coordinates. -#' @param census_geo The geographic level at which to perform BISG. -#' @param use_surname Whether to use the surname in calculating race -#' probabilities. Passed to WRU. -#' @param surname_only Whether to only use the surname in calculating race -#' probabilities. Passed to WRU. -#' @param surname_year Which Census year to use for surname matching. Passed to -#' WRU. -#' @param use_age Whether to use the age in the BISG calculation. Passed to WRU. -#' @param use_sex Whether to use the sex in the BISG calculation. Passed to WRU. -#' @param normalize If TRUE, normalizes the district percentages. -#' @param verbose If TRUE, will output diagnostic strings. -#' @return The processed voter file and a summary of district turnout across -#' racial groups. -#' -#' @export performance_analysis -#' @importFrom dplyr filter group_by_at inner_join rename select summarise -#' @importFrom tidyselect all_of -performance_analysis <- function(voter_file, - district_shape, - census_shape, - census_data, - join_census_shape = TRUE, - join_district_shape = TRUE, - state = NULL, - voter_id = "voter_id", - surname = "last_name", - district = "district", - census_state_col = "STATEFP10", - census_county_col = "COUNTYFP10", - census_tract_col = "TRACTCE10", - census_block_col = "BLOCKCE10", - crs = NULL, - coords = c("lon", "lat"), - census_geo = "block", - use_surname = TRUE, - surname_only = FALSE, - surname_year = 2010, - use_age = FALSE, - use_sex = FALSE, - normalize = TRUE, - verbose = FALSE) { - if (verbose) { - n_voters <- nrow(voter_file) - message(paste( - "Voter file has", n_voters, "rows.\nBeginning", - "functionality analysis..." - )) - } - - # De-duplicate voter file - voter_file <- dedupe_voter_file( - voter_file = voter_file, - voter_id = voter_id - ) - if (verbose) { - n_voters_new <- nrow(voter_file) - message(paste( - "De-duplicating removed", n_voters - n_voters_new, "voters.", - "Voter file now has", n_voters_new, "rows.\nMerging", - "voter file with census shape files." - )) - n_voters <- n_voters_new - } - - # Merge the voter file to census shape file, then pick out the right columns - if (join_census_shape) { - voter_file_w_census <- merge_voter_file_to_shape( - voter_file = voter_file, - shape_file = census_shape, - crs = crs, - coords = coords, - voter_id = voter_id - ) - # Filter out voters that didn't match on the block - voter_file_w_census <- dplyr::filter( - voter_file_w_census, - !is.na(.data[[census_block_col]]) - ) - - if (verbose) { - n_voters_new <- nrow(voter_file_w_census) - message(paste( - "Matching by Census block removed", n_voters - n_voters_new, - "voters. Voter file now has", n_voters_new, - "rows.\nMerging voter file with district shape files." - )) - n_voters <- n_voters_new - } - } else { - # If we don't need to merge to Census shape file, then rename variable - if (verbose) { - message(paste("Voter file already matched to Census shapefile.")) - } - voter_file_w_census <- voter_file - } - - if (join_district_shape) { - # Merge the voter file with the district shape file - voter_file_w_district <- merge_voter_file_to_shape( - voter_file = voter_file_w_census, - shape_file = district_shape, - crs = crs, - coords = coords, - voter_id = voter_id - ) - # Filter out voters that didn't match on a district - voter_file_w_district <- dplyr::filter( - voter_file_w_district, - !is.na(.data[[district]]) - ) - if (verbose) { - n_voters_new <- nrow(voter_file_w_district) - message(paste( - "Matching by district removed", n_voters - n_voters_new, - "voters. Voter file now has", n_voters_new, - "rows.\nApplying BISG." - )) - n_voters <- n_voters_new - } - } else { - if (verbose) { - # If we don't need to merge to district shape file, then rename variable - message(paste("Voter file already matched to district shape.")) - voter_file_w_district <- voter_file - } - } - - # Select the final set of columns needed for BISG - voter_file_final <- voter_file_w_district %>% - dplyr::rename( - c( - "st" = tidyselect::all_of(census_state_col), - "county" = tidyselect::all_of(census_county_col), - "tract" = tidyselect::all_of(census_tract_col), - "block" = tidyselect::all_of(census_block_col) - ) - ) %>% - dplyr::select( - tidyselect::all_of(c( - voter_id, - surname, - district, - "st", - "county", - "tract", - "block" - )) - ) - - # Apply BISG to the voter file to get race predictions - voter_file_final_w_race <- wru_predict_race_wrapper( - voter_file = as.data.frame(voter_file_final), - census_data = census_data, - voter_id = voter_id, - surname = surname, - state = state, - county = "county", - tract = "tract", - block = "block", - census_geo = census_geo, - use_surname = use_surname, - surname_only = surname_only, - surname_year = surname_year, - use_age = use_age, - use_sex = use_sex, - return_surname_flag = TRUE, - return_geocode_flag = TRUE, - verbose = TRUE - ) - # Add matching flags to voter file - if (verbose) { - n_surname_match <- sum(voter_file_final_w_race$matched_surname) - n_geocode_match <- sum(voter_file_final_w_race$matched_geocode) - message(paste0( - paste("BISG didn't match", n_voters - n_surname_match, "surnames.\n"), - paste("BISG didn't match", n_voters - n_geocode_match, "geocodes.") - )) - } - - # Aggregate percentages across districts - results <- precinct_agg_combine( - voter_file = voter_file_final_w_race, - group_col = district, - race_cols = NULL, - include_total = FALSE - ) - - # If necessary, normalize counts - if (normalize) { - races <- c( - "pred.whi_prop", - "pred.bla_prop", - "pred.his_prop", - "pred.asi_prop", - "pred.oth_prop" - ) - sums <- rowSums(results[, races]) - results[, races] <- results[, races] / sums - } - - if (verbose) { - message("Performance analysis complete.") - } - return(list( - voter_file = voter_file_final_w_race, - results = results - )) -} diff --git a/R/surname_utils.R b/R/surname_utils.R index b25cdc5d..9b4da9ad 100644 --- a/R/surname_utils.R +++ b/R/surname_utils.R @@ -122,34 +122,6 @@ get_multi_barreled_surnames <- function(voter_file, return(multi_barreled) } - -#' Determines which surnames match to the Census list. -#' -#' @param voter_file The voter file, with each row consisting of a voter. -#' @param surname_col A string denoting the surname column. -#' @param strip_special Whether to strip special characters before matching in -#' the surname database. -#' @return A vector of logicals denoting a match or not. -#' -#' @export surname_match -#' @importFrom utils getFromNamespace -surname_match <- function(voter_file, - surname_col = "last_name", - strip_special = FALSE) { - if (strip_special) { - voter_file <- eiCompare::strip_special_characters( - voter_file = voter_file, - surname_col = surname_col, - replace = "" - ) - } - - # Determine if there's a surname match - surname_match <- voter_file[[surname_col]] %in% wru::surnames2010$surname - return(surname_match) -} - - #' Briefly summarizes the surnames in a voter file. #' #' @param voter_file The voter file, with each row consisting of a voter. @@ -199,100 +171,3 @@ surname_summary <- function(voter_file, surname_col) { "special characters." )) } - - - -#' Predicts, for one row in a voter file, the probability of a voter having a -#' certain race by averaging over each "barrel" of the surname. -#' -#' @param voter_file The voter file, with each row consisting of a voter. -#' @param surname_col A string denoting the surname column. -#' @param surname_only Whether to obtain probabilities for surnames only. -#' @param census_data A data frame containing Census data corresponding to the -#' geographic information for units in the voter file. -#' @param census_geo The census level at which to apply BISG. Passed to WRU. -#' @param surname_year Which Census year to use for surname matching. Passed to -#' WRU. -#' @param use_age Whether to use the age in the BISG calculation. Passed to WRU. -#' @param use_sex Whether to use the sex in the BISG calculation. Passed to WRU. -#' @param state A string denoting the state for which the data is queried. -#' @param county A string denoting the column containing the county FIPS code. -#' @param tract A string denoting the column containing the tract FIPS code. -#' @param block A string denoting the column containing the block FIPS code. -#' @param pattern What pattern to split surnames on. By default, surnames are -#' split on a space(s), which assumes hyphens have already been removed. -#' @param remove_patterns A list of strings which will be removed from the -#' list of barrels. -#' @return A vector of probabilities for each surname. -#' -#' @export predict_race_multi_barreled -#' @importFrom utils getFromNamespace -predict_race_multi_barreled <- function(voter_file, - surname_col = "last_name", - surname_only = TRUE, - census_data = NULL, - census_geo = "block", - surname_year = 2010, - use_age = FALSE, - use_sex = FALSE, - state = NULL, - county = NULL, - tract = NULL, - block = NULL, - pattern = "[ -]+", - remove_patterns = NULL) { - # Split up multi-barreled surnames - surnames <- stringr::str_split( - voter_file[[surname_col]], - pattern = pattern - )[[1]] - # Remove specific barrels - if (!is.null(remove_patterns)) { - surnames <- surnames[!(surnames %in% remove_patterns)] - } - - # Get merge_surnames function out from wru - merge_surnames_copy <- utils::getFromNamespace("merge_surnames", "wru") - - # Use surname only - if (surname_only) { - new_voter_file <- data.frame(surname = surnames) - # Calculate probabilities using surnames only - probabilities <- suppressWarnings( - merge_surnames_copy( - voter.file = new_voter_file, - surname.year = 2010, - clean.surname = FALSE, - impute.missing = TRUE - ) - ) - probabilities <- as.numeric(colMeans(probabilities[, c(-1, -2)])) - } else { - # Transfer geolocations - new_voter_file <- data.frame( - surname = surnames, - state = state, - county = voter_file[[county]], - tract = voter_file[[tract]], - block = voter_file[[block]] - ) - - # Predict race using full BISG - invisible(capture.output( - bisg <- suppressWarnings( - wru::predict_race( - voter.file = new_voter_file, - census.surname = TRUE, - surname.only = FALSE, - surname.year = surname_year, - census.geo = census_geo, - census.data = census_data, - age = use_age, - sex = use_sex, - ) - ) - )) - probabilities <- as.numeric(colMeans(bisg[, c(6:10)])) - } - return(probabilities) -} diff --git a/R/wru_predict_race_wrapper.R b/R/wru_predict_race_wrapper.R deleted file mode 100644 index 65e82337..00000000 --- a/R/wru_predict_race_wrapper.R +++ /dev/null @@ -1,210 +0,0 @@ -#' Prepares a voter file for the WRU predict_race function, and then predicts -#' race. -#' -#' This function assumes that the Census data is provided to the function. It -#' does not provide the capability of downloading the Census data, since this -#' is a time intensive process. -#' -#' @param voter_file The voter file, containing columns with a surname and -#' potentially geographic information. -#' @param census_data A data frame containing Census data corresponding to the -#' geographic information for units in the voter file. -#' @param voter_id A string denoting the column containing voter ID. Default is -#' NULL, if there is no voter ID in the file. In this case, a voter ID will be -#' assigned. -#' @param surname A string denoting the column containing the surname. -#' @param state A string denoting the column containing the state FIPS code. -#' @param county A string denoting the column containing the county FIPS code. -#' @param tract A string denoting the column containing the tract FIPS code. -#' @param block A string denoting the column containing the block FIPS code. -#' @param census_geo The census level at which to apply BISG. Passed to WRU. -#' @param use_surname Whether to use the surname in calculating race -#' probabilities. Passed to WRU. -#' @param surname_only Whether to only use the surname in calculating race -#' probabilities. Passed to WRU. -#' @param surname_year Which Census year to use for surname matching. Passed to -#' WRU. -#' @param use_age Whether to use the age in the BISG calculation. Passed to WRU. -#' @param use_sex Whether to use the sex in the BISG calculation. Passed to WRU. -#' @param return_surname_flag If TRUE, returns a flag indicating whether the -#' surnames matched. -#' @param return_geocode_flag If TRUE, returns a flag indicating whether the -#' first level of geocode matched. -#' @param verbose A flag indicating whether to print out status messages. -#' @return The voter file component extracted from the provided data frame, with -#' additional surname/geocode flags, as well as a data frame race prediction. -#' -#' @references Imai and Khanna (2016) "Improving Ecological Inference by -#' Predicting Individual Ethnicity from Voter Registration Records" -#' -#' @export wru_predict_race_wrapper -#' @import wru -#' @importFrom dplyr relocate -#' @importFrom utils getFromNamespace -wru_predict_race_wrapper <- function(voter_file, - census_data, - voter_id = NULL, - surname = "last_name", - state = NULL, - county = NULL, - tract = NULL, - block = NULL, - census_geo = NULL, - use_surname = TRUE, - surname_only = FALSE, - surname_year = 2010, - use_age = FALSE, - use_sex = FALSE, - return_surname_flag = FALSE, - return_geocode_flag = FALSE, - verbose = FALSE) { - # Tidy up voter file according to WRU's specifications - wru_voter_file <- tidy_voter_file_wru( - voter_file = voter_file, - voter_id = voter_id, - surname = surname, - state = state, - county = county, - tract = tract, - block = block - ) - - # Temporary check to force use_sex and use_age into FALSE - if (use_age) { - warning("age is currently disabled in wru... forcing use_age to be FALSE") - use_age <- FALSE - } - if (use_sex) { - warning("sex is currently disabled in wru... forcing use_sex to be FALSE") - use_sex <- FALSE - } - - # Get merge_surnames function out from wru - merge_surnames_copy <- utils::getFromNamespace("merge_surnames", "wru") - - # If necessary, check which surnames matched - if (return_surname_flag) { - if (verbose) { - message("Matching surnames.") - } - merged_surnames <- suppressWarnings( - merge_surnames_copy( - voter.file = wru_voter_file, - surname.year = surname_year, - clean.surname = TRUE, - impute.missing = TRUE - ) - ) - # Get matched surname flag - voter_file$matched_surname <- !(merged_surnames$surname.match == "") - } - - # Predict race using BISG via WRU - if (verbose) { - message("Performing BISG to obtain race probabilities.") - } - invisible(capture.output( - bisg <- suppressWarnings( - wru::predict_race( - voter.file = wru_voter_file, - census.surname = use_surname, - surname.only = surname_only, - surname.year = surname_year, - census.geo = census_geo, - census.data = census_data, - age = use_age, - sex = use_sex, - ) - ) - )) - # Re-order race predictions to match voter file - bisg <- bisg[match(wru_voter_file$voterid, bisg$voterid), ] - # Find out which geographic units didn't match - no_geocode_match <- is.na(bisg$pred.whi) - - # For voters that didn't match geocode, use the next highest level - if (sum(no_geocode_match) > 1) { - if (verbose) { - message( - "Some voters failed geocode matching. Matching at a higher level." - ) - } - no_match_voters <- wru_voter_file[no_geocode_match, ] - # Re-run BISG using a new geographic unit - if (census_geo == "block") { - new_geo <- "tract" - } else if (census_geo == "tract") { - new_geo <- "county" - } - # Re-run the BISG only on voters that didn't match - invisible(capture.output( - bisg_no_match <- suppressWarnings( - wru::predict_race( - voter.file = no_match_voters, - census.surname = use_surname, - surname.only = surname_only, - surname.year = surname_year, - census.geo = new_geo, - census.data = census_data, - age = use_age, - sex = use_sex, - ) - ) - )) - # Re-order race predictions to match voter file - bisg_no_match <- bisg_no_match[match( - no_match_voters$voterid, - bisg_no_match$voterid - ), ] - # Merge the new probabilities back into the old dataframe - bisg[no_geocode_match, ] <- bisg_no_match - - matched_geocode <- !no_geocode_match - } else { - matched_geocode <- TRUE - } - - # Store geocode match flag - if (return_geocode_flag & !surname_only) { - voter_file$matched_geocode <- matched_geocode - } - - # Final check if there are no matches - no_match_final <- is.na(bisg$pred.whi) - if (sum(no_match_final) > 1) { - if (verbose) { - message(paste( - "Some surnames did not match at higher geographic level.", - "Using surname only for these cases." - )) - } - # Use probabilities from surnames only for those that don't match - invisible(capture.output( - no_match_surnames <- suppressWarnings( - merge_surnames_copy( - voter.file = wru_voter_file[no_match_final, ], - surname.year = surname_year, - clean.surname = TRUE, - impute.missing = TRUE - ) - ) - )) - # Merge back into BISG estimates - p_cols <- c("p_whi", "p_bla", "p_his", "p_asi", "p_oth") - pred_cols <- c("pred.whi", "pred.bla", "pred.his", "pred.asi", "pred.oth") - bisg[no_match_final, ][, pred_cols] <- no_match_surnames[, p_cols] - } - - # Finalize voter file - if (is.null(voter_id)) { - voter_file$voter_id <- bisg$voterid - voter_file <- dplyr::relocate(voter_file, voter_id, .before = 1) - } - pred_cols <- c("pred.whi", "pred.bla", "pred.his", "pred.asi", "pred.oth") - voter_file[, pred_cols] <- bisg[, pred_cols] - - if (verbose) { - message("BISG complete.") - } - return(voter_file) -} From 5d7293f9c57af6db2b17ee885d7ca569aa01e60b Mon Sep 17 00:00:00 2001 From: rachel-carroll Date: Sun, 2 Feb 2025 22:10:34 -0500 Subject: [PATCH 6/8] remove dependency on wru in surname_utils functionality and tests --- R/surname_utils.R | 23 ----------------------- tests/testthat/test_surname_utils.R | 16 ---------------- 2 files changed, 39 deletions(-) diff --git a/R/surname_utils.R b/R/surname_utils.R index 9b4da9ad..9a945bb7 100644 --- a/R/surname_utils.R +++ b/R/surname_utils.R @@ -147,27 +147,4 @@ surname_summary <- function(voter_file, surname_col) { "Voter file has", n_special_character_surnames, "voters containing special characters." )) - - # Print number of matches by default - n_organic_matches <- sum(eiCompare::surname_match( - voter_file = voter_file, - surname_col = surname_col, - strip_special = FALSE - )) - message(paste( - "Voter file has", n_organic_matches, - "voters with surnames matching the database by default." - )) - - # Print number of matches after stripping special characters - n_stripped_matches <- sum(eiCompare::surname_match( - voter_file = voter_file, - surname_col = surname_col, - strip_special = TRUE - )) - message(paste( - "Voter file has", n_stripped_matches, - "voters with surnames matching the database after removing", - "special characters." - )) } diff --git a/tests/testthat/test_surname_utils.R b/tests/testthat/test_surname_utils.R index b312bebc..f8a067a8 100644 --- a/tests/testthat/test_surname_utils.R +++ b/tests/testthat/test_surname_utils.R @@ -74,22 +74,6 @@ test_that("Surname utility functions work correctly.", { ) testthat::expect_equal(multi_barreled_surnames, expected_multi_surnames) - # Test surname match function - matches_no_special <- surname_match( - voter_file = voter_file, - surname_col = "surname", - strip_special = TRUE - ) - matches_w_special <- surname_match( - voter_file = voter_file, - surname_col = "surname", - strip_special = FALSE - ) - expected_no_special <- c(TRUE, TRUE, FALSE, FALSE, FALSE, FALSE, FALSE) - expected_w_special <- c(TRUE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE) - testthat::expect_equal(matches_no_special, expected_no_special) - testthat::expect_equal(matches_w_special, expected_w_special) - # Test surname summary function expect_error( surname_summary(voter_file = voter_file, surname_col = "surname"), From 4922cdbdd626282a613b71b14340875804c44bf5 Mon Sep 17 00:00:00 2001 From: rachel-carroll Date: Sun, 2 Feb 2025 22:11:02 -0500 Subject: [PATCH 7/8] namespace and cocumentation update --- NAMESPACE | 12 ---- man/performance_analysis.Rd | 112 ----------------------------- man/predict_race_multi_barreled.Rd | 64 ----------------- man/surname_match.Rd | 22 ------ man/wru_predict_race_wrapper.Rd | 84 ---------------------- 5 files changed, 294 deletions(-) delete mode 100644 man/performance_analysis.Rd delete mode 100644 man/predict_race_multi_barreled.Rd delete mode 100644 man/surname_match.Rd delete mode 100644 man/wru_predict_race_wrapper.Rd diff --git a/NAMESPACE b/NAMESPACE index a0f8cace..283b26e6 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -33,10 +33,8 @@ export(mean_and_ci) export(merge_voter_file_to_shape) export(od_plot_create) export(overlay_density_plot) -export(performance_analysis) export(plot_bivariate) export(precinct_agg_combine) -export(predict_race_multi_barreled) export(race_cand_cors) export(race_check_2_3) export(resolve_missing_vals) @@ -45,14 +43,11 @@ export(stdize_votes) export(stdize_votes_all) export(strip_special_characters) export(sum_over_cols) -export(surname_match) export(surname_summary) export(tidy_voter_file_wru) -export(wru_predict_race_wrapper) import(ei) import(eiPack) import(ggplot2) -import(wru) importFrom(bayestestR,ci) importFrom(coda,as.mcmc) importFrom(coda,gelman.plot) @@ -60,11 +55,6 @@ importFrom(coda,mcmc.list) importFrom(doSNOW,registerDoSNOW) importFrom(dplyr,filter) importFrom(dplyr,group_by_at) -importFrom(dplyr,inner_join) -importFrom(dplyr,relocate) -importFrom(dplyr,rename) -importFrom(dplyr,select) -importFrom(dplyr,summarise) importFrom(dplyr,summarise_at) importFrom(foreach,"%do%") importFrom(foreach,"%dopar%") @@ -108,7 +98,5 @@ importFrom(stringr,str_trim) importFrom(tidyr,pivot_longer) importFrom(tidyr,replace_na) importFrom(tidyr,separate) -importFrom(tidyselect,all_of) importFrom(utils,capture.output) -importFrom(utils,getFromNamespace) importFrom(utils,setTxtProgressBar) diff --git a/man/performance_analysis.Rd b/man/performance_analysis.Rd deleted file mode 100644 index f713f25a..00000000 --- a/man/performance_analysis.Rd +++ /dev/null @@ -1,112 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/performance_analysis.R -\name{performance_analysis} -\alias{performance_analysis} -\title{Performs a performance analysis using a voter file, census shape, and -district shape.} -\usage{ -performance_analysis( - voter_file, - district_shape, - census_shape, - census_data, - join_census_shape = TRUE, - join_district_shape = TRUE, - state = NULL, - voter_id = "voter_id", - surname = "last_name", - district = "district", - census_state_col = "STATEFP10", - census_county_col = "COUNTYFP10", - census_tract_col = "TRACTCE10", - census_block_col = "BLOCKCE10", - crs = NULL, - coords = c("lon", "lat"), - census_geo = "block", - use_surname = TRUE, - surname_only = FALSE, - surname_year = 2010, - use_age = FALSE, - use_sex = FALSE, - normalize = TRUE, - verbose = FALSE -) -} -\arguments{ -\item{voter_file}{A dataframe containing the voter file.} - -\item{district_shape}{The shapefiles for the new districts or precincts to -consider.} - -\item{census_shape}{The shapefiles for the Census blocks or tracts for which -the voter file will be geocoded against.} - -\item{census_data}{A dataframe containing the Census tracts or blocks in the -region for the voter file.} - -\item{join_census_shape}{A logical denoting whether the voter file already -has the Census block, tract, and county information. If TRUE, then column -names for these items must be provided. If FALSE, then a Census shape must -be provided in order to perform a spatial join.} - -\item{join_district_shape}{A logical denoting whether the voter file already -has the district identity per voter. If TRUE, then a column names for the -district must be provided. If FALSE, then a distrct shape must be provided -in order to perform a spatial join.} - -\item{state}{The state in which the functionality analysis is performed, as -a two character string.} - -\item{voter_id}{A string denoting the column name for the voter ID.} - -\item{surname}{A string denoting the column name for the surname.} - -\item{district}{A string denoting the column name for the district.} - -\item{census_state_col}{The column in the Census data that indicates state. -If the voter file already has Census information, this should denote the -column in the voter file containing the state FIPS code.} - -\item{census_county_col}{The column in the Census data that indicates county. -If the voter file already has Census information, this should denote the -column in the voter file containing the county FIPS code.} - -\item{census_tract_col}{The column in the Census data that indicates tract. -If the voter file already has Census information, this should denote the -column in the voter file containing the tract FIPS code.} - -\item{census_block_col}{The column in the Census data that indicates block. -If the voter file already has Census information, this should denote the -column in the voter file containing the block FIPS code.} - -\item{crs}{A string denoting the PROJ4 string for projecting maps.} - -\item{coords}{The columns for the coordinates.} - -\item{census_geo}{The geographic level at which to perform BISG.} - -\item{use_surname}{Whether to use the surname in calculating race -probabilities. Passed to WRU.} - -\item{surname_only}{Whether to only use the surname in calculating race -probabilities. Passed to WRU.} - -\item{surname_year}{Which Census year to use for surname matching. Passed to -WRU.} - -\item{use_age}{Whether to use the age in the BISG calculation. Passed to WRU.} - -\item{use_sex}{Whether to use the sex in the BISG calculation. Passed to WRU.} - -\item{normalize}{If TRUE, normalizes the district percentages.} - -\item{verbose}{If TRUE, will output diagnostic strings.} -} -\value{ -The processed voter file and a summary of district turnout across - racial groups. -} -\description{ -Performs a performance analysis using a voter file, census shape, and -district shape. -} diff --git a/man/predict_race_multi_barreled.Rd b/man/predict_race_multi_barreled.Rd deleted file mode 100644 index 8443f5d7..00000000 --- a/man/predict_race_multi_barreled.Rd +++ /dev/null @@ -1,64 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/surname_utils.R -\name{predict_race_multi_barreled} -\alias{predict_race_multi_barreled} -\title{Predicts, for one row in a voter file, the probability of a voter having a -certain race by averaging over each "barrel" of the surname.} -\usage{ -predict_race_multi_barreled( - voter_file, - surname_col = "last_name", - surname_only = TRUE, - census_data = NULL, - census_geo = "block", - surname_year = 2010, - use_age = FALSE, - use_sex = FALSE, - state = NULL, - county = NULL, - tract = NULL, - block = NULL, - pattern = "[ -]+", - remove_patterns = NULL -) -} -\arguments{ -\item{voter_file}{The voter file, with each row consisting of a voter.} - -\item{surname_col}{A string denoting the surname column.} - -\item{surname_only}{Whether to obtain probabilities for surnames only.} - -\item{census_data}{A data frame containing Census data corresponding to the -geographic information for units in the voter file.} - -\item{census_geo}{The census level at which to apply BISG. Passed to WRU.} - -\item{surname_year}{Which Census year to use for surname matching. Passed to -WRU.} - -\item{use_age}{Whether to use the age in the BISG calculation. Passed to WRU.} - -\item{use_sex}{Whether to use the sex in the BISG calculation. Passed to WRU.} - -\item{state}{A string denoting the state for which the data is queried.} - -\item{county}{A string denoting the column containing the county FIPS code.} - -\item{tract}{A string denoting the column containing the tract FIPS code.} - -\item{block}{A string denoting the column containing the block FIPS code.} - -\item{pattern}{What pattern to split surnames on. By default, surnames are -split on a space(s), which assumes hyphens have already been removed.} - -\item{remove_patterns}{A list of strings which will be removed from the -list of barrels.} -} -\value{ -A vector of probabilities for each surname. -} -\description{ -Predicts, for one row in a voter file, the probability of a voter having a -certain race by averaging over each "barrel" of the surname. -} diff --git a/man/surname_match.Rd b/man/surname_match.Rd deleted file mode 100644 index 28713ee0..00000000 --- a/man/surname_match.Rd +++ /dev/null @@ -1,22 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/surname_utils.R -\name{surname_match} -\alias{surname_match} -\title{Determines which surnames match to the Census list.} -\usage{ -surname_match(voter_file, surname_col = "last_name", strip_special = FALSE) -} -\arguments{ -\item{voter_file}{The voter file, with each row consisting of a voter.} - -\item{surname_col}{A string denoting the surname column.} - -\item{strip_special}{Whether to strip special characters before matching in -the surname database.} -} -\value{ -A vector of logicals denoting a match or not. -} -\description{ -Determines which surnames match to the Census list. -} diff --git a/man/wru_predict_race_wrapper.Rd b/man/wru_predict_race_wrapper.Rd deleted file mode 100644 index b4ad2930..00000000 --- a/man/wru_predict_race_wrapper.Rd +++ /dev/null @@ -1,84 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/wru_predict_race_wrapper.R -\name{wru_predict_race_wrapper} -\alias{wru_predict_race_wrapper} -\title{Prepares a voter file for the WRU predict_race function, and then predicts -race.} -\usage{ -wru_predict_race_wrapper( - voter_file, - census_data, - voter_id = NULL, - surname = "last_name", - state = NULL, - county = NULL, - tract = NULL, - block = NULL, - census_geo = NULL, - use_surname = TRUE, - surname_only = FALSE, - surname_year = 2010, - use_age = FALSE, - use_sex = FALSE, - return_surname_flag = FALSE, - return_geocode_flag = FALSE, - verbose = FALSE -) -} -\arguments{ -\item{voter_file}{The voter file, containing columns with a surname and -potentially geographic information.} - -\item{census_data}{A data frame containing Census data corresponding to the -geographic information for units in the voter file.} - -\item{voter_id}{A string denoting the column containing voter ID. Default is -NULL, if there is no voter ID in the file. In this case, a voter ID will be -assigned.} - -\item{surname}{A string denoting the column containing the surname.} - -\item{state}{A string denoting the column containing the state FIPS code.} - -\item{county}{A string denoting the column containing the county FIPS code.} - -\item{tract}{A string denoting the column containing the tract FIPS code.} - -\item{block}{A string denoting the column containing the block FIPS code.} - -\item{census_geo}{The census level at which to apply BISG. Passed to WRU.} - -\item{use_surname}{Whether to use the surname in calculating race -probabilities. Passed to WRU.} - -\item{surname_only}{Whether to only use the surname in calculating race -probabilities. Passed to WRU.} - -\item{surname_year}{Which Census year to use for surname matching. Passed to -WRU.} - -\item{use_age}{Whether to use the age in the BISG calculation. Passed to WRU.} - -\item{use_sex}{Whether to use the sex in the BISG calculation. Passed to WRU.} - -\item{return_surname_flag}{If TRUE, returns a flag indicating whether the -surnames matched.} - -\item{return_geocode_flag}{If TRUE, returns a flag indicating whether the -first level of geocode matched.} - -\item{verbose}{A flag indicating whether to print out status messages.} -} -\value{ -The voter file component extracted from the provided data frame, with -additional surname/geocode flags, as well as a data frame race prediction. -} -\description{ -This function assumes that the Census data is provided to the function. It -does not provide the capability of downloading the Census data, since this -is a time intensive process. -} -\references{ -Imai and Khanna (2016) "Improving Ecological Inference by -Predicting Individual Ethnicity from Voter Registration Records" -} From 3c523e52e5f7ce63b68e6b5edc47e773aead6354 Mon Sep 17 00:00:00 2001 From: rachel-carroll Date: Sun, 2 Feb 2025 22:13:39 -0500 Subject: [PATCH 8/8] r cmd chk note fixes --- DESCRIPTION | 2 +- R/overlay_density_plot.R | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 56310f80..41bc628c 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -43,7 +43,7 @@ License: GPL-3 Depends: R (>= 3.5.0), eiPack, ei Imports: bayestestR, coda, data.table, doSNOW, dplyr, foreach, ggplot2, graphics, magrittr, mcmcse, methods, - overlapping, purrr, rlang, sf, stringr, tidyr,tidyselect + overlapping, purrr, rlang, sf, stringr, tidyr NeedsCompilation: no Suggests: knitr, plyr, rmarkdown, reshape2, RColorBrewer, RJSONIO, testthat, tigris diff --git a/R/overlay_density_plot.R b/R/overlay_density_plot.R index b9134214..ac63ab79 100644 --- a/R/overlay_density_plot.R +++ b/R/overlay_density_plot.R @@ -103,7 +103,7 @@ overlay_density_plot <- function(agg_betas, results_table, race_cols, cand_cols, } - out <- inner_join(rt_sub, out, by = "Candidate") + out <- dplyr::inner_join(rt_sub, out, by = "Candidate") out$sd_minus <- out$mean_size - out$sd_size out$sd_plus <- out$mean_size + out$sd_size