From e3c098e8257b78c7151a1001dfe0bfdadbc6c209 Mon Sep 17 00:00:00 2001 From: Pratik Sachdeva Date: Fri, 30 Oct 2020 21:31:20 -0700 Subject: [PATCH 1/5] initial commit of utility functions for performing BISG within eiCompare --- R/bisg_utils.R | 316 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 316 insertions(+) create mode 100644 R/bisg_utils.R diff --git a/R/bisg_utils.R b/R/bisg_utils.R new file mode 100644 index 00000000..9c0c21bd --- /dev/null +++ b/R/bisg_utils.R @@ -0,0 +1,316 @@ +#' Gets Census race counts necessary for BISG. +#' +#' This function gets the columns necessary to run BISG in compliance with +#' the Census' Bureau's surname lists. It can extract columns for either the +#' decennial Census or the ACS. +#' +#' @param geography The geographic level at which to obtain Census data. If +#' obtaining data from the decennial Census, can be up to "block". If +#' obtaining data from the ACS, can only be up to "block group". +#' @param state The state from which to obtain Census data, as a string. +#' @param county The county(ies) from which to obtain Census data. If NULL, +#' data is obtained from all counties in the state. +#' @param year The year to obtain Census data from. If 2010, uses decennial +#' data. Otherwise, uses the 5-year ACS summary data. +#' @return A tibble containing the race counts per geography unit for white, +#' black, Hispanic/Latino, Asian, and other voters. +#' +#' @import dplyr +#' @importFrom tidycensus get_decennial get_acs +#' @importFrom tigris list_counties +#' @export get_census_race_counts +get_census_race_counts <- function( + geography, state, county = NULL, year = 2010 +) { + # Get FIPS county codes if they're not provided + if (is.null(county)) { + # Filter county codes from tidycensus by state + county <- tigris::list_counties(state)[["county_code"]] + } + # Decennial data + if (year == 2010) { + # Variable names for SF1 dataset + variables <- c("P005003", + "P005004", + "P005005", + "P005006", + "P005007", + "P005008", + "P005009", + "P005010") + # Use tidycensus to extract Census counts + counts <- suppressMessages(tidycensus::get_decennial( + geography = geography, + variables = variables, + year = year, + state = state, + county = county, + output = "wide")) %>% + dplyr::mutate( + fips = GEOID, + whi = P005003, + bla = P005004, + his = P005010, + asi = P005006 + P005007, + oth = P005005 + P005008 + P005009) %>% + dplyr::select(fips, whi, bla, his, asi, oth) + } else { + # Variable names for ACS + variables <- c("B03002_003", + "B03002_004", + "B03002_005", + "B03002_006", + "B03002_007", + "B03002_008", + "B03002_009", + "B03002_010", + "B03002_011", + "B03002_012") + # Use tidycensus to extract counts for the ACS + counts <- suppressMessages(tidycensus::get_acs( + geography = geography, + variables = variables, + year = year, + output = "wide", + state = state, + county = county, + survey = "acs5")) %>% + dplyr::mutate( + fips = GEOID, + whi = B03002_003E, + bla = B03002_004E, + his = B03002_012E, + asi = B03002_006E + B03002_007E, + oth = B03002_008E + B03002_009E + B03002_010E + B03002_011E + ) %>% + dplyr::select( + fips, whi, bla, his, asi, oth + ) + } + return(counts) +} + +#' Computes the probability a person is located in a specific geographic unit. +#' +#' This is a utility function for performing BISG. It operates on a dataframe +#' obtained from the Census Bureau via the provided eiCompare helper function. +#' +#' @param counts A tibble containing counts (divided amongst constituent groups) +#' per geographic units (rows). +#' @param cols The columns denoting the constituent groups within each +#' geographic units. +#' @return A vector containing the probability of selecting a person from each +#' geographic unit. +#' +#' @export compute_p_g +compute_p_g <- function(counts, cols = c("whi", "bla", "his", "asi", "oth")) { + total_by_geo <- rowSums(counts[, cols]) + p_g <- total_by_geo / sum(total_by_geo) + return(p_g) +} + + +#' Computes the probability a person is of a specific racial group, conditioned +#' on geographic unit. +#' +#' This is a utility function for performing BISG. It operates on a dataframe +#' obtained from the Census Bureau via the provided eiCompare helper function. +#' +#' @param counts A tibble containing counts (divided amongst constituent groups) +#' per geographic units (rows). +#' @param cols The columns denoting the constituent groups within each +#' geographic units. +#' @return A tibble with dimensions equal to counts, with each entry denoting +#' the probability of selecting a person per racial group (columns) conditioned +#' on racial group (rows). +#' +#' @export compute_p_r_cond_g +compute_p_r_cond_g <- function(counts, + cols = c("whi", "bla", "his", "asi", "oth")) { + p_r_g <- counts %>% + dplyr::mutate( + dplyr::across(.cols = cols, .fns = ~ . / rowSums(counts[, cols])) + ) %>% + dplyr::mutate(dplyr::across(.cols = cols, .fns = ~ ifelse(is.na(.), 0, .))) + return(p_r_g) +} + + +#' Computes the probability a person is in a specific geographic unit, +#' conditioned on a racial group. +#' +#' This is a utility function for performing BISG. It operates on a dataframe +#' obtained from the Census Bureau via the provided eiCompare helper function. +#' +#' @param counts A tibble containing counts (divided amongst constituent groups) +#' per geographic units (rows). +#' @param cols The columns denoting the constituent groups within each +#' geographic units. +#' @return A tibble with dimensions equal to counts, with each entry denoting +#' the probability of selecting a person per geographic unit (rows) +#' conditioned on racial group (columns). +#' +#' @export compute_p_r_cond_g +compute_p_g_cond_r <- function(counts, + cols = c("whi", "bla", "his", "asi", "oth")) { + # Need P(G) and P(R|G) to invert for P(G|R) according to Bayes' Theorem + p_g <- compute_p_g(counts, cols) + p_r_g <- compute_p_r_cond_g(counts, cols) + # Apply Bayes' Theorem + p_g_r <- p_r_g %>% + dplyr::mutate( + dplyr::across(.cols = cols, .fns = ~ . * dplyr::all_of(p_g)) + ) %>% + dplyr::mutate(dplyr::across(.cols = cols, .fns = ~ . / sum(.))) + return(p_g_r) +} + + +#' Computes the probability a person is of a specific racial group, conditioned +#' on surname. +#' +#' This is a utility function for performing BISG. It operates on a voter file, +#' and utilizes WRU's function for calculating probability of race by surname. +#' +#' @param voter_file A tibble containing a list of voters (by row), and a +#' column that denotes their surname. +#' @param surname_col A string denoting which column contains the voter surname. +#' +#' @return A tibble with rows denoting voters and columns denoting the +#' probability that each voter is of a particular racial group. +#' +#' @export compute_p_r_cond_s +compute_p_r_cond_s <- function(voter_file, surname_col) { + p_r_s <- wru::merge_surnames( + voter.file = dplyr::rename(voter_file, + surname = dplyr::all_of(surname_col)), + surname.year = 2010, + clean.surname = TRUE, + impute.missing = TRUE + ) %>% + dplyr::rename( + !!surname_col := surname, + whi = p_whi, + bla = p_bla, + his = p_his, + asi = p_asi, + oth = p_oth + ) %>% + dplyr::select( + all_of(surname_col), whi, bla, his, asi, oth + ) + return(p_r_s) +} + + +#' Computes the probability a person is of a specific racial group, conditioned +#' on surname and geolocation. +#' +#' This is a utility function for performing BISG. It operates on a voter file, +#' and counts obtained from the Census Bureau via eiCompare's helper function. +#' +#' @param voter_file A tibble containing a list of voters (by row), and a +#' column that denotes their surname. +#' @param counts A tibble containing counts (divided amongst constituent groups) +#' per geographic units (rows). +#' @param surname_col A string denoting which column contains the voter surname. +#' @param geo_col A string denoting which column contains the geographic unit +#' ID. +#' @param race_ols A list of strings denoting the columns containing racial +#' groups. +#' @param geo_col_counts A string denoting the column in the counts tibble that +#' refers to the geographic unit. +#' +#' @return A tibble with rows denoting voters and columns denoting the +#' probability that each voter is of a particular racial group. +#' +#' @export compute_p_r_cond_s_g +compute_p_r_cond_s_g <- function( + voter_file, counts, surname_col, geo_col, race_cols, geo_col_counts = "fips" +) { + # Compute probability of race conditioned on surname + p_r_s <- compute_p_r_cond_s(voter_file = voter_file, + surname_col = surname_col) + # Compute probability of geography conditioned on race, for all geographies + p_g_r_all <- compute_p_g_cond_r(counts = counts, cols = race_cols) + # Merge p(G|R) into the voter file by FIPS code + p_g_r <- dplyr::left_join( + x = voter_file, + y = p_g_r_all, + by = stats::setNames(geo_col_counts, geo_col) + ) + # Compute the final probability of race + p_r_s_g <- (p_r_s[, race_cols] * p_g_r[, race_cols]) %>% + dplyr::rowwise() %>% + dplyr::mutate(total = sum(dplyr::across(.cols = race_cols))) %>% + dplyr::ungroup() %>% + dplyr::mutate(dplyr::across(.cols = race_cols, .fns = ~ . / total)) %>% + dplyr::select(dplyr::all_of(race_cols)) +} + + +#' Performs Bayesian Improved Surname Geocoding: Computes the probability a +#' person is of a specific racial group, conditioned on surname and geolocation. +#' +#' @param voter_file A tibble containing a list of voters (by row), and a +#' column that denotes their surname. +#' @param surname_col A string denoting which column contains the voter surname. +#' @param geo_col A string denoting which column contains the geographic unit +#' ID. +#' @param census_counts A tibble containing counts (divided amongst constituent +#' groups) per geographic units (rows). If NULL, these counts will be obtained +#' using the eiCompare helper function and the other parameters. +#' @param geography The geographic level at which to obtain Census data. If +#' obtaining data from the decennial Census, can be up to "block". If +#' obtaining data from the ACS, can only be up to "block group". +#' @param state The state from which to obtain Census data, as a string. +#' @param county The county(ies) from which to obtain Census data. If NULL, +#' data is obtained from all counties in the state. +#' @param year The year to obtain Census data from. If 2010, uses decennial +#' data. Otherwise, uses the 5-year ACS summary data. +#' @param cols A list of strings denoting the columns containing racial groups. +#' @param geo_col_counts A string denoting the column in the counts tibble that +#' refers to the geographic unit. +#' @param race_ols A list of strings denoting the columns containing racial +#' groups. +#' @param verbose A boolean denoting the verbosity. +#' @return A tibble with rows denoting voters and columns denoting the +#' probability that each voter is of a particular racial group. +#' +#' @export bisg +bisg <- function( + voter_file, surname_col, geo_col, census_counts = NULL, geography = NULL, + state = NULL, county = NULL, year = NULL, geo_col_counts = "fips", + race_cols = c("whi", "bla", "his", "asi", "oth"), verbose = FALSE +) { + # If counts aren't provided, obtain then via tidycensus + if (is.null(census_counts)) { + if (verbose) { + message("Obtaining counts from Census Bureau using tidycensus.") + } + census_counts <- get_census_race_counts( + geography = geography, + state = state, + county = county, + year = year + ) + } + if (verbose) { + message("Calculating BISG probabilities.") + } + # Compute the probability of race conditioned on geography and surname + p_r_s_g <- compute_p_r_cond_s_g( + voter_file = voter_file, + counts = census_counts, + surname_col = surname_col, + geo_col = geo_col, + race_cols = race_cols, + geo_col_counts = geo_col_counts + ) + # Combine BISG probabilities back into voter file + voter_file_w_bisg <- dplyr::bind_cols(voter_file, p_r_s_g) + if (verbose) { + message("BISG complete.") + } + return(voter_file_w_bisg) +} \ No newline at end of file From ae89c2debcaeba8a560ca427d2d4bef2c517ca2a Mon Sep 17 00:00:00 2001 From: Pratik Sachdeva Date: Sat, 31 Oct 2020 21:53:07 -0700 Subject: [PATCH 2/5] added functionality to look at bigger geographic region when no matches happen - need to deal with fips codes --- R/bisg_utils.R | 95 ++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 84 insertions(+), 11 deletions(-) diff --git a/R/bisg_utils.R b/R/bisg_utils.R index 9c0c21bd..22a2237c 100644 --- a/R/bisg_utils.R +++ b/R/bisg_utils.R @@ -1,3 +1,17 @@ +swap_census_geography <- function(census_geo) { + if (census_geo == "block") { + new_geo <- "block group" + } else if (census_geo == "block group") { + new_geo <- "tract" + } else if (census_geo == "tract") { + new_geo <- "county" + } else if (census_geo == "county") { + new_geo <- "state" + } + return(new_geo) +} + + #' Gets Census race counts necessary for BISG. #' #' This function gets the columns necessary to run BISG in compliance with @@ -181,13 +195,13 @@ compute_p_g_cond_r <- function(counts, #' #' @export compute_p_r_cond_s compute_p_r_cond_s <- function(voter_file, surname_col) { - p_r_s <- wru::merge_surnames( + p_r_s <- suppressWarnings(suppressMessages(wru::merge_surnames( voter.file = dplyr::rename(voter_file, surname = dplyr::all_of(surname_col)), surname.year = 2010, clean.surname = TRUE, impute.missing = TRUE - ) %>% + ))) %>% dplyr::rename( !!surname_col := surname, whi = p_whi, @@ -226,11 +240,15 @@ compute_p_r_cond_s <- function(voter_file, surname_col) { #' #' @export compute_p_r_cond_s_g compute_p_r_cond_s_g <- function( - voter_file, counts, surname_col, geo_col, race_cols, geo_col_counts = "fips" + voter_file, counts, surname_col, geo_col, + race_cols = c("whi", "bla", "his", "asi", "oth"), geo_col_counts = "fips", + p_r_s = NULL ) { - # Compute probability of race conditioned on surname - p_r_s <- compute_p_r_cond_s(voter_file = voter_file, - surname_col = surname_col) + if (is.null(p_r_s)) { + # Compute probability of race conditioned on surname + p_r_s <- compute_p_r_cond_s(voter_file = voter_file, + surname_col = surname_col) + } # Compute probability of geography conditioned on race, for all geographies p_g_r_all <- compute_p_g_cond_r(counts = counts, cols = race_cols) # Merge p(G|R) into the voter file by FIPS code @@ -246,6 +264,7 @@ compute_p_r_cond_s_g <- function( dplyr::ungroup() %>% dplyr::mutate(dplyr::across(.cols = race_cols, .fns = ~ . / total)) %>% dplyr::select(dplyr::all_of(race_cols)) + return(p_r_s_g) } @@ -281,7 +300,8 @@ compute_p_r_cond_s_g <- function( bisg <- function( voter_file, surname_col, geo_col, census_counts = NULL, geography = NULL, state = NULL, county = NULL, year = NULL, geo_col_counts = "fips", - race_cols = c("whi", "bla", "his", "asi", "oth"), verbose = FALSE + race_cols = c("whi", "bla", "his", "asi", "oth"), impute_missing = TRUE, + verbose = FALSE ) { # If counts aren't provided, obtain then via tidycensus if (is.null(census_counts)) { @@ -295,22 +315,75 @@ bisg <- function( year = year ) } + # Merge voter file with WRU surname database if (verbose) { - message("Calculating BISG probabilities.") + message("Merging surnames.") } + p_r_s <- compute_p_r_cond_s( + voter_file = voter_file, + surname_col = surname_col + ) # Compute the probability of race conditioned on geography and surname + if (verbose) { + message("Calculating BISG probabilities.") + } p_r_s_g <- compute_p_r_cond_s_g( voter_file = voter_file, counts = census_counts, surname_col = surname_col, geo_col = geo_col, race_cols = race_cols, - geo_col_counts = geo_col_counts + geo_col_counts = geo_col_counts, + p_r_s = p_r_s ) + # If necessary, impute the records located in blocks recorded as having no + # population + if (impute_missing) { + no_geocode_match <- is.na(p_r_s_g$whi) + # Imputing is only necessary if an entry didn't match + while (any(no_geocode_match)) { + if (verbose) { + message("Some voters didn't match to a geography.") + } + # Case 1: user did not provide a geography + if (is.null(geography)) { + if (verbose) { + message(paste("Geography not provided.", + "Imputing with surname probabilities.")) + } + p_r_s_g[no_geocode_match, race_cols] <- p_r_s[no_geocode_match, + race_cols] + } else { + # Case 2: user provides geography; extract new counts + geography <- swap_census_geography(geography) + if (verbose) { + message(paste("Re-performing BISG at the", geography, "level.")) + } + census_counts <- get_census_race_counts( + geography = geography, + state = state, + county = county, + year = year + ) + # Re-perform BISG + new_p_r_s_g <- compute_p_r_cond_s_g( + voter_file = voter_file[no_geocode_match, ], + counts = census_counts, + surname_col = surname_col, + geo_col = geo_col, + race_cols = race_cols, + geo_col_counts = geo_col_counts, + p_r_s = p_r_s[no_geocode_match, ] + ) + p_r_s_g[no_geocode_match, ] <- new_p_r_s_g + } + no_geocode_match <- is.na(p_r_s_g$whi) + } + } # Combine BISG probabilities back into voter file - voter_file_w_bisg <- dplyr::bind_cols(voter_file, p_r_s_g) + voter_file <- dplyr::bind_cols(voter_file, p_r_s_g) if (verbose) { message("BISG complete.") } - return(voter_file_w_bisg) + return(voter_file) } \ No newline at end of file From 883e8c427e1be5fd103c3424a4e088d67de1c863 Mon Sep 17 00:00:00 2001 From: Pratik Sachdeva Date: Sun, 1 Nov 2020 18:25:17 -0800 Subject: [PATCH 3/5] Fixed the merging with different levels of fips codes in compute_p_r_cond_s_g --- R/bisg_utils.R | 68 +++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 54 insertions(+), 14 deletions(-) diff --git a/R/bisg_utils.R b/R/bisg_utils.R index 22a2237c..06ca8b4a 100644 --- a/R/bisg_utils.R +++ b/R/bisg_utils.R @@ -1,3 +1,13 @@ +#' Gets identifier for geographic unit at the level above the provided unit. +#' +#' This function accept a Census geographic identifier and returns the +#' identifier one level above the provided one, in terms of spatial coverage. +#' +#' @param census_geo A string indicating the geographic level. +#' @return A string indicating the geographic unit one level above the provided +#' unit. +#' +#' @export swap_census_geography swap_census_geography <- function(census_geo) { if (census_geo == "block") { new_geo <- "block group" @@ -7,6 +17,8 @@ swap_census_geography <- function(census_geo) { new_geo <- "county" } else if (census_geo == "county") { new_geo <- "state" + } else { + stop("Incorrect Census geographic identifier.") } return(new_geo) } @@ -26,6 +38,7 @@ swap_census_geography <- function(census_geo) { #' data is obtained from all counties in the state. #' @param year The year to obtain Census data from. If 2010, uses decennial #' data. Otherwise, uses the 5-year ACS summary data. +#' @param cache A bool denoting whether the Census data should be cached. #' @return A tibble containing the race counts per geography unit for white, #' black, Hispanic/Latino, Asian, and other voters. #' @@ -34,7 +47,7 @@ swap_census_geography <- function(census_geo) { #' @importFrom tigris list_counties #' @export get_census_race_counts get_census_race_counts <- function( - geography, state, county = NULL, year = 2010 + geography, state, county = NULL, year = 2010, cache = FALSE ) { # Get FIPS county codes if they're not provided if (is.null(county)) { @@ -59,7 +72,8 @@ get_census_race_counts <- function( year = year, state = state, county = county, - output = "wide")) %>% + output = "wide", + cache_table = cache)) %>% dplyr::mutate( fips = GEOID, whi = P005003, @@ -88,7 +102,8 @@ get_census_race_counts <- function( output = "wide", state = state, county = county, - survey = "acs5")) %>% + survey = "acs5", + cache_table = cache)) %>% dplyr::mutate( fips = GEOID, whi = B03002_003E, @@ -104,6 +119,7 @@ get_census_race_counts <- function( return(counts) } + #' Computes the probability a person is located in a specific geographic unit. #' #' This is a utility function for performing BISG. It operates on a dataframe @@ -138,9 +154,11 @@ compute_p_g <- function(counts, cols = c("whi", "bla", "his", "asi", "oth")) { #' the probability of selecting a person per racial group (columns) conditioned #' on racial group (rows). #' +#' @importFrom dplyr across mutate #' @export compute_p_r_cond_g -compute_p_r_cond_g <- function(counts, - cols = c("whi", "bla", "his", "asi", "oth")) { +compute_p_r_cond_g <- function( + counts, cols = c("whi", "bla", "his", "asi", "oth") +) { p_r_g <- counts %>% dplyr::mutate( dplyr::across(.cols = cols, .fns = ~ . / rowSums(counts[, cols])) @@ -164,9 +182,11 @@ compute_p_r_cond_g <- function(counts, #' the probability of selecting a person per geographic unit (rows) #' conditioned on racial group (columns). #' +#' @importFrom dplyr across all_of mutate #' @export compute_p_r_cond_g -compute_p_g_cond_r <- function(counts, - cols = c("whi", "bla", "his", "asi", "oth")) { +compute_p_g_cond_r <- function( + counts, cols = c("whi", "bla", "his", "asi", "oth") +) { # Need P(G) and P(R|G) to invert for P(G|R) according to Bayes' Theorem p_g <- compute_p_g(counts, cols) p_r_g <- compute_p_r_cond_g(counts, cols) @@ -189,10 +209,11 @@ compute_p_g_cond_r <- function(counts, #' @param voter_file A tibble containing a list of voters (by row), and a #' column that denotes their surname. #' @param surname_col A string denoting which column contains the voter surname. -#' #' @return A tibble with rows denoting voters and columns denoting the #' probability that each voter is of a particular racial group. #' +#' @import dplyr +#' @importFrom wru merge_surnames #' @export compute_p_r_cond_s compute_p_r_cond_s <- function(voter_file, surname_col) { p_r_s <- suppressWarnings(suppressMessages(wru::merge_surnames( @@ -230,14 +251,20 @@ compute_p_r_cond_s <- function(voter_file, surname_col) { #' @param surname_col A string denoting which column contains the voter surname. #' @param geo_col A string denoting which column contains the geographic unit #' ID. -#' @param race_ols A list of strings denoting the columns containing racial +#' @param race_cols A list of strings denoting the columns containing racial #' groups. #' @param geo_col_counts A string denoting the column in the counts tibble that #' refers to the geographic unit. +#' @param p_r_s A dataframe denoting the probability of race conditioned on +#' surname that matches with the provided voter file. Can accelerate +#' computation if provided up front. #' #' @return A tibble with rows denoting voters and columns denoting the #' probability that each voter is of a particular racial group. #' +#' @import dplyr +#' @importFrom stats setNames +#' @importFrom stringr str_length str_sub #' @export compute_p_r_cond_s_g compute_p_r_cond_s_g <- function( voter_file, counts, surname_col, geo_col, @@ -251,6 +278,11 @@ compute_p_r_cond_s_g <- function( } # Compute probability of geography conditioned on race, for all geographies p_g_r_all <- compute_p_g_cond_r(counts = counts, cols = race_cols) + # Check if the IDs for voter file and the counts have the same length + fips_length <- unique(stringr::str_length(counts[[geo_col_counts]])) + voter_file[[geo_col]] <- stringr::str_sub( + voter_file[[geo_col]], + end = fips_length) # Merge p(G|R) into the voter file by FIPS code p_g_r <- dplyr::left_join( x = voter_file, @@ -287,21 +319,26 @@ compute_p_r_cond_s_g <- function( #' data is obtained from all counties in the state. #' @param year The year to obtain Census data from. If 2010, uses decennial #' data. Otherwise, uses the 5-year ACS summary data. -#' @param cols A list of strings denoting the columns containing racial groups. #' @param geo_col_counts A string denoting the column in the counts tibble that #' refers to the geographic unit. -#' @param race_ols A list of strings denoting the columns containing racial +#' @param race_cols A list of strings denoting the columns containing racial #' groups. +#' @param impute_missing A bool denoting whether voter file entries that do not +#' match at the geographic level should be imputed with either the surname +#' probabilities, or should be imputed with probabilities calculated at a +#' broader geographic unit. #' @param verbose A boolean denoting the verbosity. +#' @param cache A boolean denoting whether Census data should be cached. #' @return A tibble with rows denoting voters and columns denoting the #' probability that each voter is of a particular racial group. #' +#' @importFrom dplyr bind_cols #' @export bisg bisg <- function( voter_file, surname_col, geo_col, census_counts = NULL, geography = NULL, state = NULL, county = NULL, year = NULL, geo_col_counts = "fips", race_cols = c("whi", "bla", "his", "asi", "oth"), impute_missing = TRUE, - verbose = FALSE + verbose = FALSE, cache = FALSE ) { # If counts aren't provided, obtain then via tidycensus if (is.null(census_counts)) { @@ -312,7 +349,8 @@ bisg <- function( geography = geography, state = state, county = county, - year = year + year = year, + cache = cache ) } # Merge voter file with WRU surname database @@ -359,11 +397,13 @@ bisg <- function( if (verbose) { message(paste("Re-performing BISG at the", geography, "level.")) } + # Get Census counts at a broader spatial extent census_counts <- get_census_race_counts( geography = geography, state = state, county = county, - year = year + year = year, + cache = cache ) # Re-perform BISG new_p_r_s_g <- compute_p_r_cond_s_g( From 9a12e93a8cd47a29fb74b00865c2a9a24df5c057 Mon Sep 17 00:00:00 2001 From: Pratik Sachdeva Date: Fri, 20 Nov 2020 17:25:41 -0800 Subject: [PATCH 4/5] initial passes at some functions for errors --- R/error_prop_utils.R | 38 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) create mode 100644 R/error_prop_utils.R diff --git a/R/error_prop_utils.R b/R/error_prop_utils.R new file mode 100644 index 00000000..50221e38 --- /dev/null +++ b/R/error_prop_utils.R @@ -0,0 +1,38 @@ +#' Calculate error on the product of the terms in the numerator of the BISG +#' estimate. +#' +#' This serves as a helper function for computing the BISG error estimate. +#' +#' @param p_g_r +#' @param p_r_s +#' @param p_g_r_err +#' @param p_r_s_err +compute_product_err <- function( + p_g_r, p_r_s, p_g_r_err, p_r_s_err +) { + error <- p_r_s^2 * p_g_r_err + p_g_r^2 * p_r_s_err + return(error) +} + + +#' Calculate error on the product of the terms in the numerator of the BISG +#' estimate. +#' +#' This serves as a helper function for computing the BISG error estimate. +#' +#' @param p_g_r +#' @param p_r_s +#' @param p_g_r_err +#' @param p_r_s_err +compute_bisg_err <- function( + p_g_r, p_r_s, p_g_r_err, p_r_s_err +) { + product <- p_g_r * p_r_s + # Calculate error on product term + product_err <- compute_product_err(p_g_r, p_r_s, p_g_r_err, p_r_s_err) + # Calculate sum of product term (normalization constant) + norm <- rowSums(product) + norm_err <- rowSums(product_err) + final_err <- product_err / norm^2 + product^2 * (norm_err / norm^4) + return(final_err) +} \ No newline at end of file From 8ef42cc81efee752bdf60d443f7bd69c2af79062 Mon Sep 17 00:00:00 2001 From: Pratik Sachdeva Date: Sat, 12 Dec 2020 16:32:20 -0800 Subject: [PATCH 5/5] adding some in progress error prop util tests, fixing description for warnings --- DESCRIPTION | 1 + R/bisg_utils.R | 3 +- R/error_prop_utils.R | 47 +++++++++++++++----------- tests/testthat/test_bisg_utils.R | 10 ++++++ tests/testthat/test_error_prop_utils.R | 34 +++++++++++++++++++ 5 files changed, 75 insertions(+), 20 deletions(-) create mode 100644 tests/testthat/test_bisg_utils.R create mode 100644 tests/testthat/test_error_prop_utils.R diff --git a/DESCRIPTION b/DESCRIPTION index f48fa3a3..8a811d32 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -65,6 +65,7 @@ Imports: stringr, tidyr, leaflet, + tidycensus, tidyselect NeedsCompilation: no Suggests: diff --git a/R/bisg_utils.R b/R/bisg_utils.R index 06ca8b4a..787dcbaa 100644 --- a/R/bisg_utils.R +++ b/R/bisg_utils.R @@ -43,6 +43,7 @@ swap_census_geography <- function(census_geo) { #' black, Hispanic/Latino, Asian, and other voters. #' #' @import dplyr +#' @importFrom rlang := #' @importFrom tidycensus get_decennial get_acs #' @importFrom tigris list_counties #' @export get_census_race_counts @@ -81,7 +82,7 @@ get_census_race_counts <- function( his = P005010, asi = P005006 + P005007, oth = P005005 + P005008 + P005009) %>% - dplyr::select(fips, whi, bla, his, asi, oth) + dplyr::select(dplyr::all_of(c("fips", "whi", "bla", "his", "asi", "oth"))) } else { # Variable names for ACS variables <- c("B03002_003", diff --git a/R/error_prop_utils.R b/R/error_prop_utils.R index 50221e38..de3527ca 100644 --- a/R/error_prop_utils.R +++ b/R/error_prop_utils.R @@ -3,30 +3,39 @@ #' #' This serves as a helper function for computing the BISG error estimate. #' -#' @param p_g_r -#' @param p_r_s -#' @param p_g_r_err -#' @param p_r_s_err -compute_product_err <- function( - p_g_r, p_r_s, p_g_r_err, p_r_s_err -) { - error <- p_r_s^2 * p_g_r_err + p_g_r^2 * p_r_s_err - return(error) +#' @param p_g_r A tibble denoting the probability of a geography, given race. +#' Each row in the tibble denotes a voter. +#' @param p_r_s A tibble denoting the probability of a racial group, given a +#' surname. Each row in the tibble denotes a voter. +#' @param p_g_r_err A tibble denoting the error on the probability of a +#' geography, conditioned on racial group. Same size as p_g_r. +#' @param p_r_s_err A tibble denoting the error on the probability of racial +#' group, conditioned on race. Same size as p_r_s. +#' +#' @return The error on p_g_r times p_r_s. +compute_product_err <- function(p_g_r, p_r_s, p_g_r_err, p_r_s_err) { + err <- p_r_s^2 * p_g_r_err + p_g_r^2 * p_r_s_err + return(err) } -#' Calculate error on the product of the terms in the numerator of the BISG -#' estimate. +#' Calculate error on the BISG output, given input probability distributions. #' -#' This serves as a helper function for computing the BISG error estimate. +#' This function calculates the error on the posterior probability given the +#' two distributions - probability of geography conditioned on race, and +#' probability of race conditioned on surname - that are used to perform BISG. +#' +#' @param p_g_r A tibble denoting the probability of a geography, given race. +#' Each row in the tibble denotes a voter. +#' @param p_r_s A tibble denoting the probability of a racial group, given a +#' surname. Each row in the tibble denotes a voter. +#' @param p_g_r_err A tibble denoting the error on the probability of a +#' geography, conditioned on racial group. Same size as p_g_r. +#' @param p_r_s_err A tibble denoting the error on the probability of racial +#' group, conditioned on race. Same size as p_r_s. #' -#' @param p_g_r -#' @param p_r_s -#' @param p_g_r_err -#' @param p_r_s_err -compute_bisg_err <- function( - p_g_r, p_r_s, p_g_r_err, p_r_s_err -) { +#' @return The error on the posterior probability. +compute_bisg_err <- function(p_g_r, p_r_s, p_g_r_err, p_r_s_err) { product <- p_g_r * p_r_s # Calculate error on product term product_err <- compute_product_err(p_g_r, p_r_s, p_g_r_err, p_r_s_err) diff --git a/tests/testthat/test_bisg_utils.R b/tests/testthat/test_bisg_utils.R new file mode 100644 index 00000000..1b4e5693 --- /dev/null +++ b/tests/testthat/test_bisg_utils.R @@ -0,0 +1,10 @@ +context("Test BISG utility functions.") + +test_that("swap_census_geography operates correctly", { + expect_equal("block group", eiCompare::swap_census_geography("block")) + expect_equal("tract", eiCompare::swap_census_geography("block group")) + expect_equal("county", eiCompare::swap_census_geography("tract")) + expect_equal("state", eiCompare::swap_census_geography("county")) + expect_equal("block group", eiCompare::swap_census_geography("block")) + expect_error(eiCompare::swap_census_geography("test")) +}) \ No newline at end of file diff --git a/tests/testthat/test_error_prop_utils.R b/tests/testthat/test_error_prop_utils.R new file mode 100644 index 00000000..d0b63507 --- /dev/null +++ b/tests/testthat/test_error_prop_utils.R @@ -0,0 +1,34 @@ +context("Test error propagation utility functions.") + +test_that("compute_product_err calculates error correctly", { + p_g_r <- data.frame(x = c(0.5), y = c(0.2), z = c(0.1)) + p_g_r_err <- data.frame(x = c(0.1), y = c(0.05), z = c(0.02)) + p_r_s <- data.frame(x = c(0.2), y = c(0.1), z = c(0.7)) + p_r_s_err <- data.frame(x = c(0.04), y = c(0.01), z = c(0.2)) + expected_err <- eiCompare::compute_product_err( + p_g_r = p_g_r, + p_r_s = p_r_s, + p_g_r_err = p_g_r_err, + p_r_s_err = p_r_s_err + ) + p_g_r_samples <- matrix( + rnorm( + n = 300000, + mean = as.numeric(p_g_r), + sd = sqrt(as.numeric(p_g_r_err)) + ), + byrow = TRUE, + ncol = 3 + ) + p_r_s_samples <- matrix( + rnorm( + n = 300000, + mean = as.numeric(p_r_s), + sd = sqrt(as.numeric(p_r_s_err)) + ), + byrow = TRUE, + ncol = 3 + ) + observed_err <- matrixStats::colVars(p_g_r_samples * p_r_s_samples) + testthat::expect_equal(observed_err, expected_err, tolerance = 0.1) +}) \ No newline at end of file