diff --git a/DESCRIPTION b/DESCRIPTION index 46f14cd..a2da2ea 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -40,10 +40,10 @@ Authors@R: URL: https://github.com/RPVote/eiCompare Description: Provides a comprehensive suite of tools for estimating the candidate preferences of racial/ethnic voting blocs in elections. Includes functions for predicting voter race/ethnicity and conducting ecological inference. Race/ethnicity prediction builds on race prediction developed by Imai et al. (2016) . Ecological inference methods are based on King (1997) , ; King et. al. (2004) , . License: GPL-3 -Depends: R (>= 3.5.0), eiPack, ei, wru (>= 1.0.0) +Depends: R (>= 3.5.0), eiPack, ei Imports: bayestestR, coda, data.table, doSNOW, dplyr, foreach, ggplot2, graphics, magrittr, mcmcse, methods, - overlapping, purrr, rlang, sf, stringr, tidyr, tidyselect, viridis + overlapping, purrr, rlang, sf, stringr, tidyr, viridis NeedsCompilation: no Suggests: knitr, plyr, rmarkdown, reshape2, RColorBrewer, RJSONIO, testthat, tigris diff --git a/NAMESPACE b/NAMESPACE index 0539b3e..fcddcf0 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -33,10 +33,8 @@ export(mean_and_ci) export(merge_voter_file_to_shape) export(od_plot_create) export(overlay_density_plot) -export(performance_analysis) export(plot_bivariate) export(precinct_agg_combine) -export(predict_race_multi_barreled) export(race_cand_cors) export(race_check_2_3) export(resolve_missing_vals) @@ -47,14 +45,11 @@ export(stdize_votes) export(stdize_votes_all) export(strip_special_characters) export(sum_over_cols) -export(surname_match) export(surname_summary) export(tidy_voter_file_wru) -export(wru_predict_race_wrapper) import(ei) import(eiPack) import(ggplot2) -import(wru) importFrom(bayestestR,ci) importFrom(coda,as.mcmc) importFrom(coda,gelman.plot) @@ -62,11 +57,6 @@ importFrom(coda,mcmc.list) importFrom(doSNOW,registerDoSNOW) importFrom(dplyr,filter) importFrom(dplyr,group_by_at) -importFrom(dplyr,inner_join) -importFrom(dplyr,relocate) -importFrom(dplyr,rename) -importFrom(dplyr,select) -importFrom(dplyr,summarise) importFrom(dplyr,summarise_at) importFrom(foreach,"%do%") importFrom(foreach,"%dopar%") @@ -110,7 +100,5 @@ importFrom(stringr,str_trim) importFrom(tidyr,pivot_longer) importFrom(tidyr,replace_na) importFrom(tidyr,separate) -importFrom(tidyselect,all_of) importFrom(utils,capture.output) -importFrom(utils,getFromNamespace) importFrom(utils,setTxtProgressBar) diff --git a/R/overlay_density_plot.R b/R/overlay_density_plot.R index b913421..ac63ab7 100644 --- a/R/overlay_density_plot.R +++ b/R/overlay_density_plot.R @@ -103,7 +103,7 @@ overlay_density_plot <- function(agg_betas, results_table, race_cols, cand_cols, } - out <- inner_join(rt_sub, out, by = "Candidate") + out <- dplyr::inner_join(rt_sub, out, by = "Candidate") out$sd_minus <- out$mean_size - out$sd_size out$sd_plus <- out$mean_size + out$sd_size diff --git a/R/performance_analysis.R b/R/performance_analysis.R deleted file mode 100644 index ad9503a..0000000 --- a/R/performance_analysis.R +++ /dev/null @@ -1,245 +0,0 @@ -#' Performs a performance analysis using a voter file, census shape, and -#' district shape. -#' -#' @param voter_file A dataframe containing the voter file. -#' @param district_shape The shapefiles for the new districts or precincts to -#' consider. -#' @param census_shape The shapefiles for the Census blocks or tracts for which -#' the voter file will be geocoded against. -#' @param census_data A dataframe containing the Census tracts or blocks in the -#' region for the voter file. -#' @param join_district_shape A logical denoting whether the voter file already -#' has the district identity per voter. If TRUE, then a column names for the -#' district must be provided. If FALSE, then a distrct shape must be provided -#' in order to perform a spatial join. -#' @param join_census_shape A logical denoting whether the voter file already -#' has the Census block, tract, and county information. If TRUE, then column -#' names for these items must be provided. If FALSE, then a Census shape must -#' be provided in order to perform a spatial join. -#' @param state The state in which the functionality analysis is performed, as -#' a two character string. -#' @param voter_id A string denoting the column name for the voter ID. -#' @param surname A string denoting the column name for the surname. -#' @param district A string denoting the column name for the district. -#' @param census_state_col The column in the Census data that indicates state. -#' If the voter file already has Census information, this should denote the -#' column in the voter file containing the state FIPS code. -#' @param census_county_col The column in the Census data that indicates county. -#' If the voter file already has Census information, this should denote the -#' column in the voter file containing the county FIPS code. -#' @param census_tract_col The column in the Census data that indicates tract. -#' If the voter file already has Census information, this should denote the -#' column in the voter file containing the tract FIPS code. -#' @param census_block_col The column in the Census data that indicates block. -#' If the voter file already has Census information, this should denote the -#' column in the voter file containing the block FIPS code. -#' @param crs A string denoting the PROJ4 string for projecting maps. -#' @param coords The columns for the coordinates. -#' @param census_geo The geographic level at which to perform BISG. -#' @param use_surname Whether to use the surname in calculating race -#' probabilities. Passed to WRU. -#' @param surname_only Whether to only use the surname in calculating race -#' probabilities. Passed to WRU. -#' @param surname_year Which Census year to use for surname matching. Passed to -#' WRU. -#' @param use_age Whether to use the age in the BISG calculation. Passed to WRU. -#' @param use_sex Whether to use the sex in the BISG calculation. Passed to WRU. -#' @param normalize If TRUE, normalizes the district percentages. -#' @param verbose If TRUE, will output diagnostic strings. -#' @return The processed voter file and a summary of district turnout across -#' racial groups. -#' -#' @export performance_analysis -#' @importFrom dplyr filter group_by_at inner_join rename select summarise -#' @importFrom tidyselect all_of -performance_analysis <- function(voter_file, - district_shape, - census_shape, - census_data, - join_census_shape = TRUE, - join_district_shape = TRUE, - state = NULL, - voter_id = "voter_id", - surname = "last_name", - district = "district", - census_state_col = "STATEFP10", - census_county_col = "COUNTYFP10", - census_tract_col = "TRACTCE10", - census_block_col = "BLOCKCE10", - crs = NULL, - coords = c("lon", "lat"), - census_geo = "block", - use_surname = TRUE, - surname_only = FALSE, - surname_year = 2010, - use_age = FALSE, - use_sex = FALSE, - normalize = TRUE, - verbose = FALSE) { - if (verbose) { - n_voters <- nrow(voter_file) - message(paste( - "Voter file has", n_voters, "rows.\nBeginning", - "functionality analysis..." - )) - } - - # De-duplicate voter file - voter_file <- dedupe_voter_file( - voter_file = voter_file, - voter_id = voter_id - ) - if (verbose) { - n_voters_new <- nrow(voter_file) - message(paste( - "De-duplicating removed", n_voters - n_voters_new, "voters.", - "Voter file now has", n_voters_new, "rows.\nMerging", - "voter file with census shape files." - )) - n_voters <- n_voters_new - } - - # Merge the voter file to census shape file, then pick out the right columns - if (join_census_shape) { - voter_file_w_census <- merge_voter_file_to_shape( - voter_file = voter_file, - shape_file = census_shape, - crs = crs, - coords = coords, - voter_id = voter_id - ) - # Filter out voters that didn't match on the block - voter_file_w_census <- dplyr::filter( - voter_file_w_census, - !is.na(.data[[census_block_col]]) - ) - - if (verbose) { - n_voters_new <- nrow(voter_file_w_census) - message(paste( - "Matching by Census block removed", n_voters - n_voters_new, - "voters. Voter file now has", n_voters_new, - "rows.\nMerging voter file with district shape files." - )) - n_voters <- n_voters_new - } - } else { - # If we don't need to merge to Census shape file, then rename variable - if (verbose) { - message(paste("Voter file already matched to Census shapefile.")) - } - voter_file_w_census <- voter_file - } - - if (join_district_shape) { - # Merge the voter file with the district shape file - voter_file_w_district <- merge_voter_file_to_shape( - voter_file = voter_file_w_census, - shape_file = district_shape, - crs = crs, - coords = coords, - voter_id = voter_id - ) - # Filter out voters that didn't match on a district - voter_file_w_district <- dplyr::filter( - voter_file_w_district, - !is.na(.data[[district]]) - ) - if (verbose) { - n_voters_new <- nrow(voter_file_w_district) - message(paste( - "Matching by district removed", n_voters - n_voters_new, - "voters. Voter file now has", n_voters_new, - "rows.\nApplying BISG." - )) - n_voters <- n_voters_new - } - } else { - if (verbose) { - # If we don't need to merge to district shape file, then rename variable - message(paste("Voter file already matched to district shape.")) - voter_file_w_district <- voter_file - } - } - - # Select the final set of columns needed for BISG - voter_file_final <- voter_file_w_district %>% - dplyr::rename( - c( - "st" = tidyselect::all_of(census_state_col), - "county" = tidyselect::all_of(census_county_col), - "tract" = tidyselect::all_of(census_tract_col), - "block" = tidyselect::all_of(census_block_col) - ) - ) %>% - dplyr::select( - tidyselect::all_of(c( - voter_id, - surname, - district, - "st", - "county", - "tract", - "block" - )) - ) - - # Apply BISG to the voter file to get race predictions - voter_file_final_w_race <- wru_predict_race_wrapper( - voter_file = as.data.frame(voter_file_final), - census_data = census_data, - voter_id = voter_id, - surname = surname, - state = state, - county = "county", - tract = "tract", - block = "block", - census_geo = census_geo, - use_surname = use_surname, - surname_only = surname_only, - surname_year = surname_year, - use_age = use_age, - use_sex = use_sex, - return_surname_flag = TRUE, - return_geocode_flag = TRUE, - verbose = TRUE - ) - # Add matching flags to voter file - if (verbose) { - n_surname_match <- sum(voter_file_final_w_race$matched_surname) - n_geocode_match <- sum(voter_file_final_w_race$matched_geocode) - message(paste0( - paste("BISG didn't match", n_voters - n_surname_match, "surnames.\n"), - paste("BISG didn't match", n_voters - n_geocode_match, "geocodes.") - )) - } - - # Aggregate percentages across districts - results <- precinct_agg_combine( - voter_file = voter_file_final_w_race, - group_col = district, - race_cols = NULL, - include_total = FALSE - ) - - # If necessary, normalize counts - if (normalize) { - races <- c( - "pred.whi_prop", - "pred.bla_prop", - "pred.his_prop", - "pred.asi_prop", - "pred.oth_prop" - ) - sums <- rowSums(results[, races]) - results[, races] <- results[, races] / sums - } - - if (verbose) { - message("Performance analysis complete.") - } - return(list( - voter_file = voter_file_final_w_race, - results = results - )) -} diff --git a/R/surname_utils.R b/R/surname_utils.R index b25cdc5..9a945bb 100644 --- a/R/surname_utils.R +++ b/R/surname_utils.R @@ -122,34 +122,6 @@ get_multi_barreled_surnames <- function(voter_file, return(multi_barreled) } - -#' Determines which surnames match to the Census list. -#' -#' @param voter_file The voter file, with each row consisting of a voter. -#' @param surname_col A string denoting the surname column. -#' @param strip_special Whether to strip special characters before matching in -#' the surname database. -#' @return A vector of logicals denoting a match or not. -#' -#' @export surname_match -#' @importFrom utils getFromNamespace -surname_match <- function(voter_file, - surname_col = "last_name", - strip_special = FALSE) { - if (strip_special) { - voter_file <- eiCompare::strip_special_characters( - voter_file = voter_file, - surname_col = surname_col, - replace = "" - ) - } - - # Determine if there's a surname match - surname_match <- voter_file[[surname_col]] %in% wru::surnames2010$surname - return(surname_match) -} - - #' Briefly summarizes the surnames in a voter file. #' #' @param voter_file The voter file, with each row consisting of a voter. @@ -175,124 +147,4 @@ surname_summary <- function(voter_file, surname_col) { "Voter file has", n_special_character_surnames, "voters containing special characters." )) - - # Print number of matches by default - n_organic_matches <- sum(eiCompare::surname_match( - voter_file = voter_file, - surname_col = surname_col, - strip_special = FALSE - )) - message(paste( - "Voter file has", n_organic_matches, - "voters with surnames matching the database by default." - )) - - # Print number of matches after stripping special characters - n_stripped_matches <- sum(eiCompare::surname_match( - voter_file = voter_file, - surname_col = surname_col, - strip_special = TRUE - )) - message(paste( - "Voter file has", n_stripped_matches, - "voters with surnames matching the database after removing", - "special characters." - )) -} - - - -#' Predicts, for one row in a voter file, the probability of a voter having a -#' certain race by averaging over each "barrel" of the surname. -#' -#' @param voter_file The voter file, with each row consisting of a voter. -#' @param surname_col A string denoting the surname column. -#' @param surname_only Whether to obtain probabilities for surnames only. -#' @param census_data A data frame containing Census data corresponding to the -#' geographic information for units in the voter file. -#' @param census_geo The census level at which to apply BISG. Passed to WRU. -#' @param surname_year Which Census year to use for surname matching. Passed to -#' WRU. -#' @param use_age Whether to use the age in the BISG calculation. Passed to WRU. -#' @param use_sex Whether to use the sex in the BISG calculation. Passed to WRU. -#' @param state A string denoting the state for which the data is queried. -#' @param county A string denoting the column containing the county FIPS code. -#' @param tract A string denoting the column containing the tract FIPS code. -#' @param block A string denoting the column containing the block FIPS code. -#' @param pattern What pattern to split surnames on. By default, surnames are -#' split on a space(s), which assumes hyphens have already been removed. -#' @param remove_patterns A list of strings which will be removed from the -#' list of barrels. -#' @return A vector of probabilities for each surname. -#' -#' @export predict_race_multi_barreled -#' @importFrom utils getFromNamespace -predict_race_multi_barreled <- function(voter_file, - surname_col = "last_name", - surname_only = TRUE, - census_data = NULL, - census_geo = "block", - surname_year = 2010, - use_age = FALSE, - use_sex = FALSE, - state = NULL, - county = NULL, - tract = NULL, - block = NULL, - pattern = "[ -]+", - remove_patterns = NULL) { - # Split up multi-barreled surnames - surnames <- stringr::str_split( - voter_file[[surname_col]], - pattern = pattern - )[[1]] - # Remove specific barrels - if (!is.null(remove_patterns)) { - surnames <- surnames[!(surnames %in% remove_patterns)] - } - - # Get merge_surnames function out from wru - merge_surnames_copy <- utils::getFromNamespace("merge_surnames", "wru") - - # Use surname only - if (surname_only) { - new_voter_file <- data.frame(surname = surnames) - # Calculate probabilities using surnames only - probabilities <- suppressWarnings( - merge_surnames_copy( - voter.file = new_voter_file, - surname.year = 2010, - clean.surname = FALSE, - impute.missing = TRUE - ) - ) - probabilities <- as.numeric(colMeans(probabilities[, c(-1, -2)])) - } else { - # Transfer geolocations - new_voter_file <- data.frame( - surname = surnames, - state = state, - county = voter_file[[county]], - tract = voter_file[[tract]], - block = voter_file[[block]] - ) - - # Predict race using full BISG - invisible(capture.output( - bisg <- suppressWarnings( - wru::predict_race( - voter.file = new_voter_file, - census.surname = TRUE, - surname.only = FALSE, - surname.year = surname_year, - census.geo = census_geo, - census.data = census_data, - age = use_age, - sex = use_sex, - ) - ) - )) - probabilities <- as.numeric(colMeans(bisg[, c(6:10)])) - } - return(probabilities) } diff --git a/R/wru_predict_race_wrapper.R b/R/wru_predict_race_wrapper.R deleted file mode 100644 index 65e8233..0000000 --- a/R/wru_predict_race_wrapper.R +++ /dev/null @@ -1,210 +0,0 @@ -#' Prepares a voter file for the WRU predict_race function, and then predicts -#' race. -#' -#' This function assumes that the Census data is provided to the function. It -#' does not provide the capability of downloading the Census data, since this -#' is a time intensive process. -#' -#' @param voter_file The voter file, containing columns with a surname and -#' potentially geographic information. -#' @param census_data A data frame containing Census data corresponding to the -#' geographic information for units in the voter file. -#' @param voter_id A string denoting the column containing voter ID. Default is -#' NULL, if there is no voter ID in the file. In this case, a voter ID will be -#' assigned. -#' @param surname A string denoting the column containing the surname. -#' @param state A string denoting the column containing the state FIPS code. -#' @param county A string denoting the column containing the county FIPS code. -#' @param tract A string denoting the column containing the tract FIPS code. -#' @param block A string denoting the column containing the block FIPS code. -#' @param census_geo The census level at which to apply BISG. Passed to WRU. -#' @param use_surname Whether to use the surname in calculating race -#' probabilities. Passed to WRU. -#' @param surname_only Whether to only use the surname in calculating race -#' probabilities. Passed to WRU. -#' @param surname_year Which Census year to use for surname matching. Passed to -#' WRU. -#' @param use_age Whether to use the age in the BISG calculation. Passed to WRU. -#' @param use_sex Whether to use the sex in the BISG calculation. Passed to WRU. -#' @param return_surname_flag If TRUE, returns a flag indicating whether the -#' surnames matched. -#' @param return_geocode_flag If TRUE, returns a flag indicating whether the -#' first level of geocode matched. -#' @param verbose A flag indicating whether to print out status messages. -#' @return The voter file component extracted from the provided data frame, with -#' additional surname/geocode flags, as well as a data frame race prediction. -#' -#' @references Imai and Khanna (2016) "Improving Ecological Inference by -#' Predicting Individual Ethnicity from Voter Registration Records" -#' -#' @export wru_predict_race_wrapper -#' @import wru -#' @importFrom dplyr relocate -#' @importFrom utils getFromNamespace -wru_predict_race_wrapper <- function(voter_file, - census_data, - voter_id = NULL, - surname = "last_name", - state = NULL, - county = NULL, - tract = NULL, - block = NULL, - census_geo = NULL, - use_surname = TRUE, - surname_only = FALSE, - surname_year = 2010, - use_age = FALSE, - use_sex = FALSE, - return_surname_flag = FALSE, - return_geocode_flag = FALSE, - verbose = FALSE) { - # Tidy up voter file according to WRU's specifications - wru_voter_file <- tidy_voter_file_wru( - voter_file = voter_file, - voter_id = voter_id, - surname = surname, - state = state, - county = county, - tract = tract, - block = block - ) - - # Temporary check to force use_sex and use_age into FALSE - if (use_age) { - warning("age is currently disabled in wru... forcing use_age to be FALSE") - use_age <- FALSE - } - if (use_sex) { - warning("sex is currently disabled in wru... forcing use_sex to be FALSE") - use_sex <- FALSE - } - - # Get merge_surnames function out from wru - merge_surnames_copy <- utils::getFromNamespace("merge_surnames", "wru") - - # If necessary, check which surnames matched - if (return_surname_flag) { - if (verbose) { - message("Matching surnames.") - } - merged_surnames <- suppressWarnings( - merge_surnames_copy( - voter.file = wru_voter_file, - surname.year = surname_year, - clean.surname = TRUE, - impute.missing = TRUE - ) - ) - # Get matched surname flag - voter_file$matched_surname <- !(merged_surnames$surname.match == "") - } - - # Predict race using BISG via WRU - if (verbose) { - message("Performing BISG to obtain race probabilities.") - } - invisible(capture.output( - bisg <- suppressWarnings( - wru::predict_race( - voter.file = wru_voter_file, - census.surname = use_surname, - surname.only = surname_only, - surname.year = surname_year, - census.geo = census_geo, - census.data = census_data, - age = use_age, - sex = use_sex, - ) - ) - )) - # Re-order race predictions to match voter file - bisg <- bisg[match(wru_voter_file$voterid, bisg$voterid), ] - # Find out which geographic units didn't match - no_geocode_match <- is.na(bisg$pred.whi) - - # For voters that didn't match geocode, use the next highest level - if (sum(no_geocode_match) > 1) { - if (verbose) { - message( - "Some voters failed geocode matching. Matching at a higher level." - ) - } - no_match_voters <- wru_voter_file[no_geocode_match, ] - # Re-run BISG using a new geographic unit - if (census_geo == "block") { - new_geo <- "tract" - } else if (census_geo == "tract") { - new_geo <- "county" - } - # Re-run the BISG only on voters that didn't match - invisible(capture.output( - bisg_no_match <- suppressWarnings( - wru::predict_race( - voter.file = no_match_voters, - census.surname = use_surname, - surname.only = surname_only, - surname.year = surname_year, - census.geo = new_geo, - census.data = census_data, - age = use_age, - sex = use_sex, - ) - ) - )) - # Re-order race predictions to match voter file - bisg_no_match <- bisg_no_match[match( - no_match_voters$voterid, - bisg_no_match$voterid - ), ] - # Merge the new probabilities back into the old dataframe - bisg[no_geocode_match, ] <- bisg_no_match - - matched_geocode <- !no_geocode_match - } else { - matched_geocode <- TRUE - } - - # Store geocode match flag - if (return_geocode_flag & !surname_only) { - voter_file$matched_geocode <- matched_geocode - } - - # Final check if there are no matches - no_match_final <- is.na(bisg$pred.whi) - if (sum(no_match_final) > 1) { - if (verbose) { - message(paste( - "Some surnames did not match at higher geographic level.", - "Using surname only for these cases." - )) - } - # Use probabilities from surnames only for those that don't match - invisible(capture.output( - no_match_surnames <- suppressWarnings( - merge_surnames_copy( - voter.file = wru_voter_file[no_match_final, ], - surname.year = surname_year, - clean.surname = TRUE, - impute.missing = TRUE - ) - ) - )) - # Merge back into BISG estimates - p_cols <- c("p_whi", "p_bla", "p_his", "p_asi", "p_oth") - pred_cols <- c("pred.whi", "pred.bla", "pred.his", "pred.asi", "pred.oth") - bisg[no_match_final, ][, pred_cols] <- no_match_surnames[, p_cols] - } - - # Finalize voter file - if (is.null(voter_id)) { - voter_file$voter_id <- bisg$voterid - voter_file <- dplyr::relocate(voter_file, voter_id, .before = 1) - } - pred_cols <- c("pred.whi", "pred.bla", "pred.his", "pred.asi", "pred.oth") - voter_file[, pred_cols] <- bisg[, pred_cols] - - if (verbose) { - message("BISG complete.") - } - return(voter_file) -} diff --git a/README.md b/README.md index 3457914..6353efe 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,7 @@ [![R build status](https://github.com/RPVote/eiCompare/workflows/R-CMD-check/badge.svg)](https://github.com/RPVote/eiCompare/actions?workflow=R-CMD-check) [![Style status](https://github.com/RPVote/eiCompare/workflows/Styler/badge.svg)](https://github.com/RPVote/eiCompare/actions?workflow=Styler) -`eiCompare` is an R package built to help practitioners and academics quantify racially polarized voting (RPV) with ease and confidence. It builds on top of several existing packages, augmenting their utility for measuring racially polarized voting in elections. Underlying packages include `ei`, `eiPack`, and `wru`. +`eiCompare` is an R package built to help practitioners and academics quantify racially polarized voting (RPV) with ease and confidence. It builds on top of several existing packages, augmenting their utility for measuring racially polarized voting in elections. Underlying packages include `ei` and `eiPack`. `eiCompare` was built with several types of users in mind: diff --git a/man/performance_analysis.Rd b/man/performance_analysis.Rd deleted file mode 100644 index f713f25..0000000 --- a/man/performance_analysis.Rd +++ /dev/null @@ -1,112 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/performance_analysis.R -\name{performance_analysis} -\alias{performance_analysis} -\title{Performs a performance analysis using a voter file, census shape, and -district shape.} -\usage{ -performance_analysis( - voter_file, - district_shape, - census_shape, - census_data, - join_census_shape = TRUE, - join_district_shape = TRUE, - state = NULL, - voter_id = "voter_id", - surname = "last_name", - district = "district", - census_state_col = "STATEFP10", - census_county_col = "COUNTYFP10", - census_tract_col = "TRACTCE10", - census_block_col = "BLOCKCE10", - crs = NULL, - coords = c("lon", "lat"), - census_geo = "block", - use_surname = TRUE, - surname_only = FALSE, - surname_year = 2010, - use_age = FALSE, - use_sex = FALSE, - normalize = TRUE, - verbose = FALSE -) -} -\arguments{ -\item{voter_file}{A dataframe containing the voter file.} - -\item{district_shape}{The shapefiles for the new districts or precincts to -consider.} - -\item{census_shape}{The shapefiles for the Census blocks or tracts for which -the voter file will be geocoded against.} - -\item{census_data}{A dataframe containing the Census tracts or blocks in the -region for the voter file.} - -\item{join_census_shape}{A logical denoting whether the voter file already -has the Census block, tract, and county information. If TRUE, then column -names for these items must be provided. If FALSE, then a Census shape must -be provided in order to perform a spatial join.} - -\item{join_district_shape}{A logical denoting whether the voter file already -has the district identity per voter. If TRUE, then a column names for the -district must be provided. If FALSE, then a distrct shape must be provided -in order to perform a spatial join.} - -\item{state}{The state in which the functionality analysis is performed, as -a two character string.} - -\item{voter_id}{A string denoting the column name for the voter ID.} - -\item{surname}{A string denoting the column name for the surname.} - -\item{district}{A string denoting the column name for the district.} - -\item{census_state_col}{The column in the Census data that indicates state. -If the voter file already has Census information, this should denote the -column in the voter file containing the state FIPS code.} - -\item{census_county_col}{The column in the Census data that indicates county. -If the voter file already has Census information, this should denote the -column in the voter file containing the county FIPS code.} - -\item{census_tract_col}{The column in the Census data that indicates tract. -If the voter file already has Census information, this should denote the -column in the voter file containing the tract FIPS code.} - -\item{census_block_col}{The column in the Census data that indicates block. -If the voter file already has Census information, this should denote the -column in the voter file containing the block FIPS code.} - -\item{crs}{A string denoting the PROJ4 string for projecting maps.} - -\item{coords}{The columns for the coordinates.} - -\item{census_geo}{The geographic level at which to perform BISG.} - -\item{use_surname}{Whether to use the surname in calculating race -probabilities. Passed to WRU.} - -\item{surname_only}{Whether to only use the surname in calculating race -probabilities. Passed to WRU.} - -\item{surname_year}{Which Census year to use for surname matching. Passed to -WRU.} - -\item{use_age}{Whether to use the age in the BISG calculation. Passed to WRU.} - -\item{use_sex}{Whether to use the sex in the BISG calculation. Passed to WRU.} - -\item{normalize}{If TRUE, normalizes the district percentages.} - -\item{verbose}{If TRUE, will output diagnostic strings.} -} -\value{ -The processed voter file and a summary of district turnout across - racial groups. -} -\description{ -Performs a performance analysis using a voter file, census shape, and -district shape. -} diff --git a/man/predict_race_multi_barreled.Rd b/man/predict_race_multi_barreled.Rd deleted file mode 100644 index 8443f5d..0000000 --- a/man/predict_race_multi_barreled.Rd +++ /dev/null @@ -1,64 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/surname_utils.R -\name{predict_race_multi_barreled} -\alias{predict_race_multi_barreled} -\title{Predicts, for one row in a voter file, the probability of a voter having a -certain race by averaging over each "barrel" of the surname.} -\usage{ -predict_race_multi_barreled( - voter_file, - surname_col = "last_name", - surname_only = TRUE, - census_data = NULL, - census_geo = "block", - surname_year = 2010, - use_age = FALSE, - use_sex = FALSE, - state = NULL, - county = NULL, - tract = NULL, - block = NULL, - pattern = "[ -]+", - remove_patterns = NULL -) -} -\arguments{ -\item{voter_file}{The voter file, with each row consisting of a voter.} - -\item{surname_col}{A string denoting the surname column.} - -\item{surname_only}{Whether to obtain probabilities for surnames only.} - -\item{census_data}{A data frame containing Census data corresponding to the -geographic information for units in the voter file.} - -\item{census_geo}{The census level at which to apply BISG. Passed to WRU.} - -\item{surname_year}{Which Census year to use for surname matching. Passed to -WRU.} - -\item{use_age}{Whether to use the age in the BISG calculation. Passed to WRU.} - -\item{use_sex}{Whether to use the sex in the BISG calculation. Passed to WRU.} - -\item{state}{A string denoting the state for which the data is queried.} - -\item{county}{A string denoting the column containing the county FIPS code.} - -\item{tract}{A string denoting the column containing the tract FIPS code.} - -\item{block}{A string denoting the column containing the block FIPS code.} - -\item{pattern}{What pattern to split surnames on. By default, surnames are -split on a space(s), which assumes hyphens have already been removed.} - -\item{remove_patterns}{A list of strings which will be removed from the -list of barrels.} -} -\value{ -A vector of probabilities for each surname. -} -\description{ -Predicts, for one row in a voter file, the probability of a voter having a -certain race by averaging over each "barrel" of the surname. -} diff --git a/man/surname_match.Rd b/man/surname_match.Rd deleted file mode 100644 index 28713ee..0000000 --- a/man/surname_match.Rd +++ /dev/null @@ -1,22 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/surname_utils.R -\name{surname_match} -\alias{surname_match} -\title{Determines which surnames match to the Census list.} -\usage{ -surname_match(voter_file, surname_col = "last_name", strip_special = FALSE) -} -\arguments{ -\item{voter_file}{The voter file, with each row consisting of a voter.} - -\item{surname_col}{A string denoting the surname column.} - -\item{strip_special}{Whether to strip special characters before matching in -the surname database.} -} -\value{ -A vector of logicals denoting a match or not. -} -\description{ -Determines which surnames match to the Census list. -} diff --git a/man/wru_predict_race_wrapper.Rd b/man/wru_predict_race_wrapper.Rd deleted file mode 100644 index b4ad293..0000000 --- a/man/wru_predict_race_wrapper.Rd +++ /dev/null @@ -1,84 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/wru_predict_race_wrapper.R -\name{wru_predict_race_wrapper} -\alias{wru_predict_race_wrapper} -\title{Prepares a voter file for the WRU predict_race function, and then predicts -race.} -\usage{ -wru_predict_race_wrapper( - voter_file, - census_data, - voter_id = NULL, - surname = "last_name", - state = NULL, - county = NULL, - tract = NULL, - block = NULL, - census_geo = NULL, - use_surname = TRUE, - surname_only = FALSE, - surname_year = 2010, - use_age = FALSE, - use_sex = FALSE, - return_surname_flag = FALSE, - return_geocode_flag = FALSE, - verbose = FALSE -) -} -\arguments{ -\item{voter_file}{The voter file, containing columns with a surname and -potentially geographic information.} - -\item{census_data}{A data frame containing Census data corresponding to the -geographic information for units in the voter file.} - -\item{voter_id}{A string denoting the column containing voter ID. Default is -NULL, if there is no voter ID in the file. In this case, a voter ID will be -assigned.} - -\item{surname}{A string denoting the column containing the surname.} - -\item{state}{A string denoting the column containing the state FIPS code.} - -\item{county}{A string denoting the column containing the county FIPS code.} - -\item{tract}{A string denoting the column containing the tract FIPS code.} - -\item{block}{A string denoting the column containing the block FIPS code.} - -\item{census_geo}{The census level at which to apply BISG. Passed to WRU.} - -\item{use_surname}{Whether to use the surname in calculating race -probabilities. Passed to WRU.} - -\item{surname_only}{Whether to only use the surname in calculating race -probabilities. Passed to WRU.} - -\item{surname_year}{Which Census year to use for surname matching. Passed to -WRU.} - -\item{use_age}{Whether to use the age in the BISG calculation. Passed to WRU.} - -\item{use_sex}{Whether to use the sex in the BISG calculation. Passed to WRU.} - -\item{return_surname_flag}{If TRUE, returns a flag indicating whether the -surnames matched.} - -\item{return_geocode_flag}{If TRUE, returns a flag indicating whether the -first level of geocode matched.} - -\item{verbose}{A flag indicating whether to print out status messages.} -} -\value{ -The voter file component extracted from the provided data frame, with -additional surname/geocode flags, as well as a data frame race prediction. -} -\description{ -This function assumes that the Census data is provided to the function. It -does not provide the capability of downloading the Census data, since this -is a time intensive process. -} -\references{ -Imai and Khanna (2016) "Improving Ecological Inference by -Predicting Individual Ethnicity from Voter Registration Records" -} diff --git a/tests/testthat/test_surname_utils.R b/tests/testthat/test_surname_utils.R index b312beb..f8a067a 100644 --- a/tests/testthat/test_surname_utils.R +++ b/tests/testthat/test_surname_utils.R @@ -74,22 +74,6 @@ test_that("Surname utility functions work correctly.", { ) testthat::expect_equal(multi_barreled_surnames, expected_multi_surnames) - # Test surname match function - matches_no_special <- surname_match( - voter_file = voter_file, - surname_col = "surname", - strip_special = TRUE - ) - matches_w_special <- surname_match( - voter_file = voter_file, - surname_col = "surname", - strip_special = FALSE - ) - expected_no_special <- c(TRUE, TRUE, FALSE, FALSE, FALSE, FALSE, FALSE) - expected_w_special <- c(TRUE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE) - testthat::expect_equal(matches_no_special, expected_no_special) - testthat::expect_equal(matches_w_special, expected_w_special) - # Test surname summary function expect_error( surname_summary(voter_file = voter_file, surname_col = "surname"), diff --git a/tests/testthat/test_wru_predict_race_wrapper.R b/tests/testthat/test_wru_predict_race_wrapper.R deleted file mode 100644 index 98137b6..0000000 --- a/tests/testthat/test_wru_predict_race_wrapper.R +++ /dev/null @@ -1,66 +0,0 @@ -context("Testing WRU predict race wrapper function.") - -test_that("WRU wrapper correctly calculates probabilities.", { - # Create voter file - voter_file <- data.frame( - voter_id = c("1", "2"), - surname = c("JOHNSON", "HERNANDEZ"), - precinct = c("23", "34"), - state = "NY", - county = c("087", "087"), - tract = c("010101", "010101"), - block = c("1001", "1016") - ) - - # Load Rockland county Census information - data(rockland_census) - rockland_census$NY$year <- 2010 - # Run predict race wrapper function - bisg <- wru_predict_race_wrapper( - voter_file = voter_file, - census_data = rockland_census, - voter_id = "voter_id", - surname = "surname", - state = "NY", - county = "county", - tract = "tract", - block = "block", - census_geo = "block", - use_surname = TRUE, - surname_only = FALSE, - surname_year = 2010, - use_age = FALSE, - use_sex = FALSE, - return_surname_flag = TRUE, - return_geocode_flag = TRUE, - verbose = FALSE - ) - expect_true(all(!is.na(bisg))) - expect_true(all(bisg$merged_surname)) - expect_true(all(bisg$merged_geocode)) - expect_true("precinct" %in% names(bisg)) - - - # Run predict race wrapper function - bisg <- wru_predict_race_wrapper( - voter_file = voter_file, - census_data = rockland_census, - voter_id = "voter_id", - surname = "surname", - state = "NY", - county = "county", - tract = "tract", - block = "block", - census_geo = "block", - use_surname = TRUE, - surname_only = TRUE, - surname_year = 2010, - use_age = FALSE, - use_sex = FALSE, - return_surname_flag = TRUE, - return_geocode_flag = TRUE, - verbose = FALSE - ) - expect_true(all(!is.na(bisg))) - testthat::expect_false(any(names(bisg) == "merged_geocode")) -})