diff --git a/DESCRIPTION b/DESCRIPTION index 51c56b5..4d40343 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -19,6 +19,7 @@ Imports: janitor, purrr, readr, + rlang, rvest, stringr, tibble, diff --git a/NAMESPACE b/NAMESPACE index 5172ca0..e964796 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -2,5 +2,4 @@ export(crosswalk_data) export(get_crosswalk) -export(get_crosswalk_chain) export(list_nhgis_crosswalks) diff --git a/R/crosswalk_data.R b/R/crosswalk_data.R index 917e5f5..f909391 100644 --- a/R/crosswalk_data.R +++ b/R/crosswalk_data.R @@ -1,7 +1,7 @@ ## explicitly enable/acknowledge data.table (used by tidytable) .datatable.aware = TRUE -#' Apply a Crosswalk to Transform Data +#' Interpolate data using a crosswalk(s) #' #' Applies geographic crosswalk weights to transform data from a source geography #' to a target geography. Accepts the output from `get_crosswalk()` and automatically @@ -28,6 +28,11 @@ #' @param return_intermediate Logical. If TRUE and crosswalk has multiple steps, #' returns a list containing both the final result and intermediate results #' from each step. Default is FALSE, which returns only the final result. +#' @param show_join_quality Logical. If TRUE (default), prints diagnostic messages +#' about join quality, including the number of data rows not matching the crosswalk +#' and vice versa. For state-nested geographies (tract, county, block group, etc.), +#' also reports state-level concentration of unmatched rows. Set to FALSE to +#' suppress these messages. #' #' @return If `return_intermediate = FALSE` (default), a tibble with data summarized #' to the final target geography. @@ -55,6 +60,11 @@ #' - Columns starting with "mean_", "median_", "percent_", or "ratio_" are treated #' as non-count variables #' +#' **Other columns**: Columns that are not the geoid column, count columns, or +#' non-count columns (e.g., metadata like `data_year`) are preserved by taking +#' the first non-missing value within each target geography group. If all values +#' are missing, NA is returned. +#' #' **Multi-step crosswalks**: When `get_crosswalk()` returns multiple crosswalks #' (for transformations that change both geography and year), this function #' automatically applies them in sequence. @@ -108,7 +118,8 @@ crosswalk_data <- function( geoid_column = "geoid", count_columns = NULL, non_count_columns = NULL, - return_intermediate = FALSE) { + return_intermediate = FALSE, + show_join_quality = TRUE) { # Determine if crosswalk is a list (from get_crosswalk) or a single tibble crosswalk_list <- extract_crosswalk_list(crosswalk) @@ -166,7 +177,10 @@ crosswalk_data <- function( crosswalk = step_crosswalk, geoid_column = current_geoid_column, count_columns = count_columns, - non_count_columns = non_count_columns) + non_count_columns = non_count_columns, + step_number = i, + total_steps = n_steps, + show_join_quality = show_join_quality) # Store intermediate result if requested if (return_intermediate) { @@ -246,6 +260,295 @@ validate_crosswalk_tibble <- function(crosswalk, name) { } +#' Check if Geography is Nested Within States +#' +#' Internal function that determines whether a geography type has GEOIDs +#' where the first two characters represent the state FIPS code. +#' +#' Geographies nested within states (state FIPS derivable from GEOID): +#' - block, block_group, tract, county, place, puma, congressional districts +#' +#' Geographies NOT nested within states (cross state boundaries): +#' - zcta, cbsa/core_based_statistical_area, urban_area +#' +#' @param geography Character. The geography type to check. +#' @return Logical. TRUE if geography is nested within states. +#' @keywords internal +#' @noRd +is_state_nested_geography <- function(geography) { + if (is.null(geography) || is.na(geography)) { + return(FALSE) + } + + geography_lower <- geography |> + stringr::str_to_lower() |> + stringr::str_squish() |> + stringr::str_replace_all("_", " ") + + # Geographies where GEOID starts with state FIPS + + state_nested <- c( + "block", "blocks", "blk", + "block group", "blockgroup", "bg", + "tract", "tracts", "tr", + "county", "counties", "co", + "place", "places", "pl", + "puma", "pumas", "puma22", + "cd118", "cd119", "congressional district" + ) + + geography_lower %in% state_nested +} + + +#' State FIPS to Abbreviation Lookup +#' +#' Internal lookup table for converting state FIPS codes to abbreviations. +#' @keywords internal +#' @noRd +state_fips_to_abbr <- tibble::tibble( + state_fips = c( + "01", "02", "04", "05", "06", "08", "09", "10", "11", "12", + "13", "15", "16", "17", "18", "19", "20", "21", "22", "23", + "24", "25", "26", "27", "28", "29", "30", "31", "32", "33", + "34", "35", "36", "37", "38", "39", "40", "41", "42", "44", + "45", "46", "47", "48", "49", "50", "51", "53", "54", "55", + "56", "72", "78", "66", "60", "69" + ), + state_abbr = c( + "AL", "AK", "AZ", "AR", "CA", "CO", "CT", "DE", "DC", "FL", + "GA", "HI", "ID", "IL", "IN", "IA", "KS", "KY", "LA", "ME", + "MD", "MA", "MI", "MN", "MS", "MO", "MT", "NE", "NV", "NH", + "NJ", "NM", "NY", "NC", "ND", "OH", "OK", "OR", "PA", "RI", + "SC", "SD", "TN", "TX", "UT", "VT", "VA", "WA", "WV", "WI", + "WY", "PR", "VI", "GU", "AS", "MP" + ) +) + + +#' Analyze State Concentration of Unmatched GEOIDs +#' +#' Internal function that analyzes the state-level distribution of unmatched GEOIDs. +#' +#' @param unmatched_geoids Character vector of GEOIDs that did not match. +#' @return A list with state_counts tibble, top_states tibble, and is_concentrated flag. +#' @keywords internal +#' @noRd +analyze_state_concentration <- function(unmatched_geoids) { + if (length(unmatched_geoids) == 0) { + return(NULL) + } + + state_counts <- tibble::tibble( + geoid = unmatched_geoids, + state_fips = stringr::str_sub(geoid, 1, 2) + ) |> + dplyr::count(state_fips, name = "n_unmatched") |> + dplyr::mutate( + pct_of_unmatched = n_unmatched / sum(n_unmatched) * 100 + ) |> + dplyr::arrange(dplyr::desc(n_unmatched)) + + top_states <- state_counts |> + dplyr::slice_head(n = 3) + + is_concentrated <- any(state_counts$pct_of_unmatched > 15) + + list( + state_counts = state_counts, + top_states = top_states, + is_concentrated = is_concentrated + ) +} + + +#' Format Join Quality Message +#' +#' Internal function that formats join quality diagnostics as messages. +#' +#' @param join_quality List of join quality statistics. +#' @param step_number Integer. Current step number. +#' @param total_steps Integer. Total number of steps. +#' @return Character vector of messages to print. +#' @keywords internal +#' @noRd +format_join_quality_message <- function(join_quality, step_number, total_steps) { + messages <- character(0) + + step_prefix <- if (total_steps > 1) { + stringr::str_c("Step ", step_number, " join quality: ") + } else { + "Join quality: " + } + + # Report unmatched data rows + if (join_quality$n_data_unmatched > 0) { + msg1 <- stringr::str_c( + step_prefix, + format(join_quality$n_data_unmatched, big.mark = ","), + " of ", + format(join_quality$n_data_total, big.mark = ","), + " data rows (", + sprintf("%.1f%%", join_quality$pct_data_unmatched), + ") did not match the crosswalk." + ) + messages <- c(messages, msg1) + + # Add state concentration info if available + if (!is.null(join_quality$state_analysis_data) && + nrow(join_quality$state_analysis_data$top_states) > 0) { + + top_states_str <- join_quality$state_analysis_data$top_states |> + dplyr::left_join(state_fips_to_abbr, by = "state_fips") |> + dplyr::mutate( + state_abbr = dplyr::if_else(is.na(state_abbr), state_fips, state_abbr), + label = stringr::str_c( + state_abbr, " (", + sprintf("%.0f%%", pct_of_unmatched), ", ", + format(n_unmatched, big.mark = ","), " rows)" + ) + ) |> + dplyr::pull(label) |> + stringr::str_c(collapse = ", ") + + msg2 <- stringr::str_c(" Top states with unmatched data rows: ", top_states_str) + messages <- c(messages, msg2) + } + } + + # Report crosswalk source GEOIDs not in data + if (join_quality$n_crosswalk_unmatched > 0) { + msg3 <- stringr::str_c( + step_prefix, + format(join_quality$n_crosswalk_unmatched, big.mark = ","), + " of ", + format(join_quality$n_crosswalk_total, big.mark = ","), + " crosswalk source GEOIDs (", + sprintf("%.1f%%", join_quality$pct_crosswalk_unmatched), + ") were not in input data." + ) + messages <- c(messages, msg3) + + # Add state concentration info if available + if (!is.null(join_quality$state_analysis_crosswalk) && + nrow(join_quality$state_analysis_crosswalk$top_states) > 0) { + + top_states_str <- join_quality$state_analysis_crosswalk$top_states |> + dplyr::left_join(state_fips_to_abbr, by = "state_fips") |> + dplyr::mutate( + state_abbr = dplyr::if_else(is.na(state_abbr), state_fips, state_abbr), + label = stringr::str_c( + state_abbr, " (", + sprintf("%.0f%%", pct_of_unmatched), ", ", + format(n_unmatched, big.mark = ","), " rows)" + ) + ) |> + dplyr::pull(label) |> + stringr::str_c(collapse = ", ") + + msg4 <- stringr::str_c(" Top states not in data: ", top_states_str) + messages <- c(messages, msg4) + } + + msg5 <- " (This is expected if your data covers a geographic subset.)" + messages <- c(messages, msg5) + } + + return(messages) +} + + +#' Report Join Quality Between Data and Crosswalk +#' +#' Internal function that computes and reports join quality diagnostics. +#' +#' @param data The input data tibble. +#' @param crosswalk The crosswalk tibble. +#' @param geoid_column Column name for source geoid in data. +#' @param step_number Integer. Which step this is (for multi-step reporting). +#' @param total_steps Integer. Total number of steps. +#' @param source_geography Character or NULL. The source geography type, used to +#' determine if state-level analysis is applicable. +#' @return A list with join quality statistics (also prints messages if issues found). +#' @keywords internal +#' @noRd +report_join_quality <- function(data, crosswalk, geoid_column, step_number = 1, + total_steps = 1, source_geography = NULL) { + + # Ensure geoid columns are character for consistent comparison + data_geoids <- data |> + dplyr::mutate(dplyr::across(dplyr::all_of(geoid_column), as.character)) |> + dplyr::pull(!!rlang::sym(geoid_column)) |> + unique() + + crosswalk_source_geoids <- crosswalk |> + dplyr::pull(source_geoid) |> + unique() + + # Data rows not in crosswalk + data_not_in_crosswalk <- setdiff(data_geoids, crosswalk_source_geoids) + n_data_unmatched <- length(data_not_in_crosswalk) + n_data_total <- length(data_geoids) + pct_data_unmatched <- if (n_data_total > 0) { + n_data_unmatched / n_data_total * 100 + } else { + 0 + } + + # Crosswalk source GEOIDs not in data + crosswalk_not_in_data <- setdiff(crosswalk_source_geoids, data_geoids) + n_crosswalk_unmatched <- length(crosswalk_not_in_data) + n_crosswalk_total <- length(crosswalk_source_geoids) + pct_crosswalk_unmatched <- if (n_crosswalk_total > 0) { + n_crosswalk_unmatched / n_crosswalk_total * 100 + } else { + 0 + } + + # State concentration analysis only for state-nested geographies + # (ZCTAs, CBSAs, urban areas cross state boundaries so state analysis not meaningful) + do_state_analysis <- is_state_nested_geography(source_geography) + + # State concentration analysis for unmatched data rows + state_analysis_data <- if (n_data_unmatched > 0 && do_state_analysis) { + analyze_state_concentration(data_not_in_crosswalk) + } else { + NULL + } + + # State concentration analysis for crosswalk rows not in data + state_analysis_crosswalk <- if (n_crosswalk_unmatched > 0 && do_state_analysis) { + analyze_state_concentration(crosswalk_not_in_data) + } else { + NULL + } + + # Build quality statistics + join_quality <- list( + n_data_total = n_data_total, + n_data_unmatched = n_data_unmatched, + pct_data_unmatched = pct_data_unmatched, + data_geoids_unmatched = data_not_in_crosswalk, + state_analysis_data = state_analysis_data, + n_crosswalk_total = n_crosswalk_total, + n_crosswalk_unmatched = n_crosswalk_unmatched, + pct_crosswalk_unmatched = pct_crosswalk_unmatched, + crosswalk_geoids_unmatched = crosswalk_not_in_data, + state_analysis_crosswalk = state_analysis_crosswalk, + source_geography = source_geography, + state_analysis_applicable = do_state_analysis + ) + + # Print messages if there are issues + messages <- format_join_quality_message(join_quality, step_number, total_steps) + if (length(messages) > 0) { + purrr::walk(messages, message) + } + + return(join_quality) +} + + #' Apply a Single Crosswalk Step #' #' Internal function that applies one crosswalk tibble to data. @@ -255,6 +558,9 @@ validate_crosswalk_tibble <- function(crosswalk, name) { #' @param geoid_column Column name for source geoid #' @param count_columns Count variable columns #' @param non_count_columns Non-count variable columns +#' @param step_number Integer. Current step number for multi-step reporting. +#' @param total_steps Integer. Total number of steps for multi-step reporting. +#' @param show_join_quality Logical. Whether to report join quality diagnostics. #' @return Crosswalked data #' @keywords internal #' @noRd @@ -263,7 +569,10 @@ apply_single_crosswalk <- function( crosswalk, geoid_column, count_columns, - non_count_columns) { + non_count_columns, + step_number = 1, + total_steps = 1, + show_join_quality = TRUE) { # Check if crosswalk is empty if (nrow(crosswalk) == 0) { @@ -276,6 +585,26 @@ apply_single_crosswalk <- function( # Store metadata for later attachment crosswalk_metadata <- attr(crosswalk, "crosswalk_metadata") + # Extract source geography from metadata for state analysis determination + source_geography <- if (!is.null(crosswalk_metadata)) { + crosswalk_metadata$source_geography + } else { + NULL + } + + # Report join quality (if enabled) + join_quality <- if (show_join_quality) { + report_join_quality( + data = data, + crosswalk = crosswalk, + geoid_column = geoid_column, + step_number = step_number, + total_steps = total_steps, + source_geography = source_geography) + } else { + NULL + } + # Determine grouping columns (target_geography_name may not always be present) group_cols <- "target_geoid" if ("target_geography_name" %in% names(crosswalk)) { @@ -286,6 +615,17 @@ apply_single_crosswalk <- function( current_count_cols <- intersect(count_columns, names(data)) current_non_count_cols <- intersect(non_count_columns, names(data)) + # Identify "other" columns (not geoid, count, or non-count columns) + # These will be aggregated by taking the first non-missing value + crosswalk_cols <- c("source_geoid", "target_geoid", "allocation_factor_source_to_target", + "target_geography_name", "weighting_factor", "source_year", "target_year", + "population_2020", "housing_2020", "land_area_sqmi") + other_cols <- setdiff( + names(data), + c(geoid_column, current_count_cols, current_non_count_cols, crosswalk_cols) + ) + + # Join crosswalk to data result <- data |> dplyr::mutate( @@ -305,6 +645,10 @@ apply_single_crosswalk <- function( tidytable::across( .cols = tidytable::all_of(current_non_count_cols), .fns = ~ stats::weighted.mean(.x, allocation_factor_source_to_target, na.rm = TRUE)), + ## other columns: take first non-missing value (or NA if all missing) + tidytable::across( + .cols = tidytable::all_of(other_cols), + .fns = ~ dplyr::first(.x, na_rm = TRUE)), tidytable::across( .cols = tidytable::all_of(c(current_count_cols, current_non_count_cols)), .fns = ~ sum(!is.na(.x)), @@ -312,7 +656,7 @@ apply_single_crosswalk <- function( tidytable::mutate( tidytable::across( .cols = tidytable::all_of(c(current_count_cols, current_non_count_cols)), - .fns = ~ tidytable::if_else(get(stringr::str_c(tidytable::cur_column(), "_validx$")) > 0, .x, NA))) |> + .fns = ~ tidytable::if_else(get(stringr::str_c(tidytable::cur_column(), "_validx")) > 0, .x, NA))) |> dplyr::select(-dplyr::matches("_validx$")) |> dplyr::rename_with( .cols = dplyr::everything(), @@ -322,5 +666,8 @@ apply_single_crosswalk <- function( # Attach metadata attr(result, "crosswalk_metadata") <- crosswalk_metadata + # Attach join quality statistics + attr(result, "join_quality") <- join_quality + return(result) } diff --git a/R/get_crosswalk.R b/R/get_crosswalk.R index add135b..5f8b0f3 100644 --- a/R/get_crosswalk.R +++ b/R/get_crosswalk.R @@ -1,4 +1,4 @@ -#' Get a Geographic Crosswalk +#' Get a crosswalk(s) to translate data across time and geographies #' #' Retrieves a crosswalk with interpolation values from a source geography to a target #' geography, optionally across different years. Always returns a list with a consistent @@ -359,4 +359,6 @@ and counties. The provided geography '", geography, "' is not supported.")} return(result) } -utils::globalVariables(c("allocation_factor_source_to_target")) \ No newline at end of file +utils::globalVariables(c( + "allocation_factor_source_to_target", "geoid", "label", + "n_unmatched", "pct_of_unmatched", "state_abbr")) \ No newline at end of file diff --git a/R/get_crosswalk_chain.R b/R/get_crosswalk_chain.R index 1f630be..44bcdc4 100644 --- a/R/get_crosswalk_chain.R +++ b/R/get_crosswalk_chain.R @@ -18,28 +18,8 @@ #' \item{message}{A formatted message describing the crosswalk chain} #' } #' -#' @export -#' @examples -#' \dontrun{ -#' # Get crosswalks for 2010 tracts to 2020 ZCTAs (requires two steps) -#' chain <- get_crosswalk_chain( -#' source_geography = "tract", -#' target_geography = "zcta", -#' source_year = 2010, -#' target_year = 2020, -#' weight = "population") -#' -#' # Apply crosswalks sequentially -#' data_step1 <- crosswalk_data( -#' data = my_data, -#' crosswalk = chain$crosswalks$step_1, -#' count_columns = "count_population") -#' -#' data_final <- crosswalk_data( -#' data = data_step1, -#' crosswalk = chain$crosswalks$step_2, -#' count_columns = "count_population") -#' } +#' @keywords internal +#' @noRd get_crosswalk_chain <- function( source_geography, target_geography, diff --git a/R/get_geocorr_crosswalk.R b/R/get_geocorr_crosswalk.R index e9c9ba0..0dbcf20 100644 --- a/R/get_geocorr_crosswalk.R +++ b/R/get_geocorr_crosswalk.R @@ -44,7 +44,7 @@ get_geocorr_crosswalk <- function( target_geography, weight = "population", cache = NULL) { - + outpath = "no file exists here" ## identify the relevant file paths for potentially-cached crosswalks if (!is.null(cache)) { @@ -239,7 +239,7 @@ get_geocorr_crosswalk <- function( df1 = readr::read_csv(file.path("https://mcdc.missouri.edu", "temp", csv_path), show_col_types = FALSE) |> janitor::clean_names() } - + df2 = df1 |> dplyr::slice(2:nrow(df1)) |> ## naming conventions for some geographies are inconsistent; we standardize @@ -291,7 +291,10 @@ get_geocorr_crosswalk <- function( dplyr::across( .cols = dplyr::matches("^cd11"), .fns = ~ stringr::str_c(stab, "-", .x), - .names = "{.col}_name")) |> + .names = "{.col}_name"), + dplyr::across( + .cols = dplyr::matches("puma22"), + .fns = ~ stringr::str_c(state, .x) )) |> dplyr::rename_with( .cols = dplyr::matches("state|stab"), .fn = ~ stringr::str_replace_all(.x, c("state" = "state_fips", "stab" = "state_abbreviation"))) |> diff --git a/R/get_nhgis_crosswalk.R b/R/get_nhgis_crosswalk.R index c36f0df..8f0d6e9 100644 --- a/R/get_nhgis_crosswalk.R +++ b/R/get_nhgis_crosswalk.R @@ -109,7 +109,7 @@ standardize_geography <- function(geography, context = "source") { available crosswalks.") } -#' List Available NHGIS Crosswalks +#' List supported NHGIS crosswalks #' #' Returns a tibble of all available NHGIS geographic crosswalks with their #' corresponding parameters that can be used with get_nhgis_crosswalk(). @@ -408,6 +408,80 @@ list_nhgis_crosswalks <- function() { return(nhgis_crosswalks) } + +#' Check if Direct NHGIS Crosswalk is Available +#' +#' Internal helper function that checks if NHGIS provides a direct crosswalk +#' for the given source/target geography and year combination. This is used +#' to determine if a single-step NHGIS crosswalk can be used instead of +#' a multi-step approach. +#' +#' @param source_geography Character. Source geography name. +#' @param target_geography Character. Target geography name. +#' @param source_year Character or numeric. Source year. +#' @param target_year Character or numeric. Target year. +#' +#' @return Logical. TRUE if NHGIS has a direct crosswalk, FALSE otherwise. +#' @keywords internal +#' @noRd +is_nhgis_crosswalk_available <- function( + source_geography, + target_geography, + source_year, + target_year) { + + # Get the list of available crosswalks + available_crosswalks <- list_nhgis_crosswalks() + + # Standardize inputs for matching + source_year_chr <- as.character(source_year) + target_year_chr <- as.character(target_year) + + # Standardize geography names to match list_nhgis_crosswalks() output + source_geog_std <- standardize_geography_for_nhgis_check(source_geography) + target_geog_std <- standardize_geography_for_nhgis_check(target_geography) + + # Check if matching crosswalk exists + match_exists <- available_crosswalks |> + dplyr::filter( + source_geography == source_geog_std, + target_geography == target_geog_std, + source_year == source_year_chr, + target_year == target_year_chr) |> + nrow() > 0 + + return(match_exists) +} + + +#' Standardize Geography for NHGIS Availability Check +#' +#' Internal helper to standardize geography names to match list_nhgis_crosswalks() output. +#' +#' @param geography Character. Geography name in various formats. +#' @return Character. Standardized geography name matching list_nhgis_crosswalks() output. +#' @keywords internal +#' @noRd +standardize_geography_for_nhgis_check <- function(geography) { + geography <- geography |> + stringr::str_to_lower() |> + stringr::str_squish() |> + stringr::str_replace_all("_", " ") + + dplyr::case_when( + geography %in% c("blk", "block", "blocks", "census block") ~ "block", + geography %in% c("bg", "blockgroup", "block group", "census block group") ~ "block_group", + geography %in% c("tr", "tract", "tracts", "census tract") ~ "tract", + geography %in% c("co", "county", "counties", "cnty") ~ "county", + geography %in% c("pl", "place", "places") ~ "place", + geography %in% c("zcta", "zctas", "zip code", "zip code tabulation area") ~ "zcta", + geography %in% c("puma", "pumas", "puma22", "public use microdata area") ~ "puma", + geography %in% c("cbsa", "core based statistical area") ~ "core_based_statistical_area", + geography %in% c("ua", "urban area", "urban areas") ~ "urban_area", + TRUE ~ geography) +} + + #' Get NHGIS Geographic Crosswalk #' #' Retrieves a geographic crosswalk from the IPUMS NHGIS API based on user-specified diff --git a/R/plan_crosswalk_chain.R b/R/plan_crosswalk_chain.R index ee1a38b..20be49f 100644 --- a/R/plan_crosswalk_chain.R +++ b/R/plan_crosswalk_chain.R @@ -113,7 +113,32 @@ plan_crosswalk_chain <- function( return(result) } - # Case 4: Different geography AND different year - multi-step required + # Case 4: Different geography AND different year + # First, check if NHGIS has a direct crosswalk for this combination + # (e.g., block 2010 -> zcta 2020 is available directly from NHGIS) + nhgis_direct_available <- is_nhgis_crosswalk_available( + source_geography = source_geography, + target_geography = target_geography, + source_year = source_year, + target_year = target_year) + + if (nhgis_direct_available) { + # Single-step NHGIS crosswalk available + result$steps <- tibble::tibble( + step_number = 1L, + source_geography = source_geography, + source_year = source_year_chr, + target_geography = target_geography, + target_year = target_year_chr, + crosswalk_source = "nhgis", + description = stringr::str_c( + source_year_chr, " ", source_geog_std, " -> ", + target_year_chr, " ", target_geog_std, " (direct NHGIS crosswalk)")) + result$composition_note <- "Single crosswalk; use allocation_factor_source_to_target directly." + return(result) + } + + # No direct NHGIS crosswalk available - multi-step required result$is_multi_step <- TRUE # Check if NHGIS supports the source geography for inter-temporal crosswalk diff --git a/README.Rmd b/README.Rmd new file mode 100644 index 0000000..3be4410 --- /dev/null +++ b/README.Rmd @@ -0,0 +1,336 @@ +--- +output: github_document +--- + +```{r, include = FALSE} +knitr::opts_chunk$set( + collapse = TRUE, + comment = "#>", + fig.path = "man/figures/README-", + out.width = "100%", + eval = FALSE +) +``` + +# crosswalk + +An R interface to inter-geography and inter-temporal crosswalks. + +## Overview + +This package provides a consistent API and standardized versions of crosswalks to enable consistent approaches that work across different geography and year combinations. The package also facilitates +interpolation--that is, adjusting source geography/year values by their crosswalk weights and translating +these values to the desired target geography/year--including diagnostics of the joins between source data +and crosswalks. + +The package sources crosswalks from: + +- **Geocorr 2022** (Missouri Census Data Center) - for same-year crosswalks between geographies +- **IPUMS NHGIS** - for inter-temporal crosswalks (across different census years) +- **CT Data Collaborative** - for Connecticut 2020→2022 crosswalks (planning region changes) + +## Why Use `crosswalk`? + +- **Programmatic access**: No more manual downloads from web interfaces +- **Standardized output**: Consistent column names across all crosswalk sources +- **Metadata tracking**: Full provenance stored as attributes +- **Multi-step handling**: Automatic chaining when both geography and year change +- **Local caching**: Reproducible workflows with cached crosswalks + +## Installation + +```{r} +# Install from GitHub +renv::install("UI-Research/crosswalk") +``` + +## Overview + +First we obtain a crosswalk and apply it to our data: +```{r} +library(crosswalk) +library(dplyr) +library(stringr) +library(sf) + +source_data = tidycensus::get_acs( + year = 2023, + geography = "zcta", + output = "wide", + variables = c(below_poverty_level = "B17001_002")) %>% + dplyr::select( + source_geoid = GEOID, + count_below_poverty_level = below_poverty_levelE) + +get_crosswalk( + source_geography = "block", + target_geography = "puma", + source_year = 2010, + target_year = 2020, + weight = "population") + +# Get a crosswalk from ZCTAs to PUMAs (same year, uses Geocorr (2022)) +zcta_puma_crosswalk <- get_crosswalk( + source_geography = "zcta", + target_geography = "puma22", + weight = "population") + +# Apply the crosswalk to your data +crosswalked_data <- crosswalk_data( + geoid_column = "source_geoid", + data = source_data, + crosswalk = zcta_puma_crosswalk) +``` + +What does the crosswalk(s) reflect and how was it sourced? +```{r} +attr(crosswalked_data, "crosswalk_metadata") +``` + +How well did the crosswalk join to our source data? +```{r} +## look at all the characteristics of the join(s) between the source data +## and the crosswalks +join_quality = attr(crosswalked_data, "join_quality") + +## what share of records in the source data do not join to a crosswalk and +## thus are dropped during the crosswalking process? +join_quality$pct_data_unmatched + +## zctas aren't nested within states, otherwise join_quality$state_analysis_data +## would help us to ID whether non-joining source data were clustered within one +## or a few states. instead we can join to spatial data to diagnose further: +zctas_sf = tigris::zctas(year = 2023) +states_sf = tigris::states(year = 2023, cb = TRUE) + +## apart from DC, which has a disproportionate number of non-joining ZCTAs-- +## seemingly corresponding to federal areas and buildings--the distribution of +## non-joining ZCTAs appears proportionate to state-level populations and is +## distributed across many states: +zctas_sf %>% + dplyr::filter(GEOID20 %in% join_quality$data_geoids_unmatched) %>% + sf::st_intersection(states_sf %>% select(NAME)) %>% + sf::st_drop_geometry() %>% + dplyr::count(NAME, sort = TRUE) +``` + +And how accurate was the crosswalking process? +```{r} +comparison_data = tidycensus::get_acs( + year = 2023, + geography = "puma", + output = "wide", + variables = c( + below_poverty_level = "B17001_002")) %>% + dplyr::select( + source_geoid = GEOID, + count_below_poverty_level_acs = below_poverty_levelE) + +combined_data = dplyr::left_join( + comparison_data, + crosswalked_data, + by = c("source_geoid" = "geoid")) + +combined_data %>% + dplyr::select(source_geoid, dplyr::matches("count")) %>% + dplyr::mutate(difference_percent = (count_below_poverty_level_acs - count_below_poverty_level) / count_below_poverty_level_acs) %>% + ggplot2::ggplot() + + ggplot2::geom_histogram(ggplot2::aes(x = difference_percent)) + + ggplot2::theme_minimal() + + ggplot2::theme(panel.grid = ggplot2::element_blank()) + + ggplot2::scale_x_continuous(labels = scales::percent) + + ggplot2::labs( + title = "Crosswalked data approximates observed values", + subtitle = "Block group-level source data would produce more accurate crosswalked values", + y = "", + x = "Percent difference between observed and crosswalked values") +``` + +## Core Functions + +The package has two main functions: + +| Function | Purpose | +|--------------------------------------|----------------------------------| +| `get_crosswalk()` | Fetch crosswalk(s) | +| `crosswalk_data()` | Apply crosswalk(s) to interpolate data to the target geography-year | + +## Understanding `get_crosswalk()` Output + +`get_crosswalk()` **always returns a list** structured as follows: + +```{r} +result <- get_crosswalk( + source_geography = "tract", + target_geography = "zcta", + source_year = 2010, + target_year = 2020, + weight = "population" +) + +names(result) +#> [1] "crosswalks" "plan" "message" +``` + +The list contains three elements: + +| Element | Description | +|------------------------------|------------------------------------------| +| `crosswalks` | A named list of crosswalks (`step_1`, `step_2`, etc.) of length one or greater | +| `plan` | Details about what crosswalks are being fetched | +| `message` | A human-readable description of the crosswalk chain | + +### Single-Step vs. Multi-Step Crosswalks + +**Single-step crosswalks** (same year, different geography OR same geography, different year): + +```{r} +# Same year, different geography (Geocorr) +result <- get_crosswalk( + source_geography = "tract", + target_geography = "zcta", + weight = "population" +) +# result$crosswalks$step_1 contains one crosswalk + +# Same geography, different year (NHGIS) +result <- get_crosswalk( + source_geography = "tract", + target_geography = "tract", + source_year = 2010, + target_year = 2020 +) +# result$crosswalks$step_1 contains one crosswalk +``` + +**Multi-step crosswalks** (different geography AND different year): + +When both geography and year change, no single crosswalk source provides this directly. The package automatically plans and fetches a two-step chain: + +1. **Step 1 (NHGIS)**: Change year, keep geography constant +2. **Step 2 (Geocorr)**: Change geography at target year + +```{r} +result <- get_crosswalk( + source_geography = "tract", + target_geography = "zcta", + source_year = 2010, + target_year = 2020, + weight = "population" +) + +# Two crosswalks are returned +names(result$crosswalks) +#> [1] "step_1" "step_2" + +# Step 1: 2010 tracts -> 2020 tracts (NHGIS) +# Step 2: 2020 tracts -> 2020 ZCTAs (Geocorr) +``` + +### Crosswalk Structure + +Each crosswalk contains standardized columns: + +| Column | Description | +|----------------------------|--------------------------------------------| +| `source_geoid` | Identifier for source geography | +| `target_geoid` | Identifier for target geography | +| `allocation_factor_source_to_target` | Weight for interpolating values | +| `weighting_factor` | What attribute was used (population, housing, land) | + +Additional columns may include `source_year`, `target_year`, `population_2020`, `housing_2020`, and `land_area_sqmi` depending on the source. + +### Accessing Metadata + +Each crosswalk tibble has a `crosswalk_metadata` attribute that documents what the crosswalk +represents and how it was created: + +```{r} +metadata <- attr(result$crosswalks$step_1, "crosswalk_metadata") +names(metadata) +#> [1] "call_parameters" "data_source" "data_source_full_name" "download_url" ... +``` + +## Using `crosswalk_data()` to Interpolate Data + +`crosswalk_data()` applies crosswalk weights to transform your data. It automatically handles multi-step crosswalks. + +### Column Naming Convention + +The function auto-detects columns based on prefixes: + +| Prefix | Treatment | +|-------------------------------|-----------------------------------------| +| `count_` | Summed after weighting (for counts like population, housing units) | +| `mean_`, `median_`, `percent_`, `ratio_` | Weighted mean (for rates, percentages, averages) | + +You can also specify columns explicitly via `count_columns` and `non_count_columns`. +All non-count variables are interpolated using weighted means, weighting by the allocation factor from the crosswalk. + + +## Supported Geography and Year Combinations + +### Inter-Geography Crosswalks (Geocorr) + +2022-vintage crosswalks between any of these geographies: + +- block, block group, tract, county +- place, zcta, puma22 +- cd118, cd119, urban_area, core_based_statistical_area + +### Inter-Temporal Crosswalks (NHGIS) + +NHGIS provides cross-decade crosswalks with the following structure: + +**Source geographies:** block, block_group, tract + +**Target geographies:** +- From blocks (decennial years only): block, block_group, tract, county, place, zcta, puma, urban_area, cbsa +- From block_group or tract: block_group, tract, county + +| Source Years | Target Years | +|------------------------------|------------------------------| +| 1990, 2000 | 2010, 2014, 2015, 2020, 2022 | +| 2010, 2011, 2012, 2014, 2015 | 1990, 2000, 2020, 2022 | +| 2020, 2022 | 1990, 2000, 2010, 2014, 2015 | + +**Notes:** +- Within-decade crosswalks (e.g., 2010→2014) are not available from NHGIS +- Block→ZCTA, Block→PUMA, etc. are only available for decennial years (1990, 2000, 2010, 2020) +- The package automatically uses direct NHGIS crosswalks when available (e.g., `get_crosswalk(source_geography = "block", target_geography = "zcta", source_year = 2010, target_year = 2020)` returns a single-step NHGIS crosswalk) + +### 2020→2022 Crosswalks (CTData) + +For 2020 to 2022 transformations, the package uses CT Data Collaborative crosswalks for Connecticut (where planning regions replaced counties) and identity mappings for other states (where no changes occurred). + +## API Keys + +NHGIS crosswalks require an IPUMS API key. Get one at https://account.ipums.org/api_keys and add to your `.Renviron`: + +```{r} +usethis::edit_r_environ() +# Add: IPUMS_API_KEY=your_key_here +``` + +## Caching + +Use the `cache` parameter to save crosswalks locally for ease: + +```{r} +result <- get_crosswalk( + source_geography = "tract", + target_geography = "zcta", + weight = "population", + cache = here::here("crosswalks-cache")) +``` + + +## Citations + +The intellectual credit for the underlying crosswalks belongs to the original developers. + +**For NHGIS**, see citation requirements at: https://www.nhgis.org/citation-and-use-nhgis-data + +**For Geocorr**, a suggested citation: + +> Missouri Census Data Center, University of Missouri. (2022). Geocorr 2022: Geographic Correspondence Engine. Retrieved from: https://mcdc.missouri.edu/applications/geocorr2022.html \ No newline at end of file diff --git a/README.html b/README.html new file mode 100644 index 0000000..575b5f1 --- /dev/null +++ b/README.html @@ -0,0 +1,976 @@ + + + + +
+ + + + + + + + + + + + + + +An R interface to inter-geography and inter-temporal crosswalks.
+This package provides a consistent API and standardized versions of +crosswalks to enable consistent approaches that work across different +geography and year combinations. The package also facilitates +interpolation–that is, adjusting source geography/year values by their +crosswalk weights and translating these values to the desired target +geography/year–including diagnostics of the joins between source data +and crosswalks.
+The package sources crosswalks from:
+crosswalk?First we obtain a crosswalk and apply it to our data:
+library(crosswalk)
+library(dplyr)
+library(stringr)
+library(sf)
+
+source_data = tidycensus::get_acs(
+ year = 2023,
+ geography = "zcta",
+ output = "wide",
+ variables = c(below_poverty_level = "B17001_002")) %>%
+ dplyr::select(
+ source_geoid = GEOID,
+ count_below_poverty_level = below_poverty_levelE)
+
+get_crosswalk(
+ source_geography = "block",
+ target_geography = "puma",
+ source_year = 2010,
+ target_year = 2020,
+ weight = "population")
+
+# Get a crosswalk from ZCTAs to PUMAs (same year, uses Geocorr (2022))
+zcta_puma_crosswalk <- get_crosswalk(
+ source_geography = "zcta",
+ target_geography = "puma22",
+ weight = "population")
+
+# Apply the crosswalk to your data
+crosswalked_data <- crosswalk_data(
+ geoid_column = "source_geoid",
+ data = source_data,
+ crosswalk = zcta_puma_crosswalk)What does the crosswalk(s) reflect and how was it sourced?
+ +How well did the crosswalk join to our source data?
+## look at all the characteristics of the join(s) between the source data
+## and the crosswalks
+join_quality = attr(crosswalked_data, "join_quality")
+
+## what share of records in the source data do not join to a crosswalk and
+## thus are dropped during the crosswalking process?
+join_quality$pct_data_unmatched
+
+## zctas aren't nested within states, otherwise join_quality$state_analysis_data
+## would help us to ID whether non-joining source data were clustered within one
+## or a few states. instead we can join to spatial data to diagnose further:
+zctas_sf = tigris::zctas(year = 2023)
+states_sf = tigris::states(year = 2023, cb = TRUE)
+
+## apart from DC, which has a disproportionate number of non-joining ZCTAs--
+## seemingly corresponding to federal areas and buildings--the distribution of
+## non-joining ZCTAs appears proportionate to state-level populations and is
+## distributed across many states:
+zctas_sf %>%
+ dplyr::filter(GEOID20 %in% join_quality$data_geoids_unmatched) %>%
+ sf::st_intersection(states_sf %>% select(NAME)) %>%
+ sf::st_drop_geometry() %>%
+ dplyr::count(NAME, sort = TRUE)And how accurate was the crosswalking process?
+comparison_data = tidycensus::get_acs(
+ year = 2023,
+ geography = "puma",
+ output = "wide",
+ variables = c(
+ below_poverty_level = "B17001_002")) %>%
+ dplyr::select(
+ source_geoid = GEOID,
+ count_below_poverty_level_acs = below_poverty_levelE)
+
+combined_data = dplyr::left_join(
+ comparison_data,
+ crosswalked_data,
+ by = c("source_geoid" = "geoid"))
+
+combined_data %>%
+ dplyr::select(source_geoid, dplyr::matches("count")) %>%
+ dplyr::mutate(difference_percent = (count_below_poverty_level_acs - count_below_poverty_level) / count_below_poverty_level_acs) %>%
+ ggplot2::ggplot() +
+ ggplot2::geom_histogram(ggplot2::aes(x = difference_percent)) +
+ ggplot2::theme_minimal() +
+ ggplot2::theme(panel.grid = ggplot2::element_blank()) +
+ ggplot2::scale_x_continuous(labels = scales::percent) +
+ ggplot2::labs(
+ title = "Crosswalked data approximates observed values",
+ subtitle = "Block group-level source data would produce more accurate crosswalked values",
+ y = "",
+ x = "Percent difference between observed and crosswalked values")The package has two main functions:
+| Function | +Purpose | +
|---|---|
get_crosswalk() |
+Fetch crosswalk(s) | +
crosswalk_data() |
+Apply crosswalk(s) to interpolate data to the target +geography-year | +
get_crosswalk() Outputget_crosswalk() always returns a list
+structured as follows:
result <- get_crosswalk(
+ source_geography = "tract",
+ target_geography = "zcta",
+ source_year = 2010,
+ target_year = 2020,
+ weight = "population"
+)
+
+names(result)
+#> [1] "crosswalks" "plan" "message"The list contains three elements:
+| Element | +Description | +
|---|---|
crosswalks |
+A named list of crosswalks (step_1,
+step_2, etc.) of length one or greater |
+
plan |
+Details about what crosswalks are being fetched | +
message |
+A human-readable description of the crosswalk chain | +
Single-step crosswalks (same year, different +geography OR same geography, different year):
+# Same year, different geography (Geocorr)
+result <- get_crosswalk(
+ source_geography = "tract",
+ target_geography = "zcta",
+ weight = "population"
+)
+# result$crosswalks$step_1 contains one crosswalk
+
+# Same geography, different year (NHGIS)
+result <- get_crosswalk(
+ source_geography = "tract",
+ target_geography = "tract",
+ source_year = 2010,
+ target_year = 2020
+)
+# result$crosswalks$step_1 contains one crosswalkMulti-step crosswalks (different geography AND +different year):
+When both geography and year change, no single crosswalk source +provides this directly. The package automatically plans and fetches a +two-step chain:
+result <- get_crosswalk(
+ source_geography = "tract",
+ target_geography = "zcta",
+ source_year = 2010,
+ target_year = 2020,
+ weight = "population"
+)
+
+# Two crosswalks are returned
+names(result$crosswalks)
+#> [1] "step_1" "step_2"
+
+# Step 1: 2010 tracts -> 2020 tracts (NHGIS)
+# Step 2: 2020 tracts -> 2020 ZCTAs (Geocorr)Each crosswalk contains standardized columns:
+| Column | +Description | +
|---|---|
source_geoid |
+Identifier for source geography | +
target_geoid |
+Identifier for target geography | +
allocation_factor_source_to_target |
+Weight for interpolating values | +
weighting_factor |
+What attribute was used (population, housing, land) | +
Additional columns may include source_year,
+target_year, population_2020,
+housing_2020, and land_area_sqmi depending on
+the source.
Each crosswalk tibble has a crosswalk_metadata attribute
+that documents what the crosswalk represents and how it was created:
metadata <- attr(result$crosswalks$step_1, "crosswalk_metadata")
+names(metadata)
+#> [1] "call_parameters" "data_source" "data_source_full_name" "download_url" ...crosswalk_data() to Interpolate Datacrosswalk_data() applies crosswalk weights to transform
+your data. It automatically handles multi-step crosswalks.
The function auto-detects columns based on prefixes:
+| Prefix | +Treatment | +
|---|---|
count_ |
+Summed after weighting (for counts like population, housing +units) | +
mean_, median_, percent_,
+ratio_ |
+Weighted mean (for rates, percentages, averages) | +
You can also specify columns explicitly via
+count_columns and non_count_columns. All
+non-count variables are interpolated using weighted means, weighting by
+the allocation factor from the crosswalk.
2022-vintage crosswalks between any of these geographies:
+NHGIS provides cross-decade crosswalks with the following +structure:
+Source geographies: block, block_group, tract
+Target geographies: - From blocks (decennial years +only): block, block_group, tract, county, place, zcta, puma, urban_area, +cbsa - From block_group or tract: block_group, tract, county
+| Source Years | +Target Years | +
|---|---|
| 1990, 2000 | +2010, 2014, 2015, 2020, 2022 | +
| 2010, 2011, 2012, 2014, 2015 | +1990, 2000, 2020, 2022 | +
| 2020, 2022 | +1990, 2000, 2010, 2014, 2015 | +
Notes: - Within-decade crosswalks (e.g., 2010→2014)
+are not available from NHGIS - Block→ZCTA, Block→PUMA, etc. are only
+available for decennial years (1990, 2000, 2010, 2020) - The package
+automatically uses direct NHGIS crosswalks when available (e.g.,
+get_crosswalk(source_geography = "block", target_geography = "zcta", source_year = 2010, target_year = 2020)
+returns a single-step NHGIS crosswalk)
For 2020 to 2022 transformations, the package uses CT Data +Collaborative crosswalks for Connecticut (where planning regions +replaced counties) and identity mappings for other states (where no +changes occurred).
+NHGIS crosswalks require an IPUMS API key. Get one at https://account.ipums.org/api_keys
+and add to your .Renviron:
Use the cache parameter to save crosswalks locally for
+ease:
result <- get_crosswalk(
+ source_geography = "tract",
+ target_geography = "zcta",
+ weight = "population",
+ cache = here::here("crosswalks-cache"))The intellectual credit for the underlying crosswalks belongs to the +original developers.
+For NHGIS, see citation requirements at: https://www.nhgis.org/citation-and-use-nhgis-data
+For Geocorr, a suggested citation:
+++ + + diff --git a/README.md b/README.md index 91bb023..7d6b80c 100644 --- a/README.md +++ b/README.md @@ -1,99 +1,349 @@ -# crosswalk - -An R interface to inter-geography and inter-temporal crosswalks. - -## Overview - -This project provides a consistent API and standardized versions of -crosswalks to allow for programmatic interpolation over time and between -geographies. Say goodbye to manual crosswalk downloads and hello to -reproducible workflows. - -## Installation - -``` r -# Install dependencies -renv::install("UI-Research/crosswalks") -``` - -## Usage - -To get started with `library(crosswalk)`: - -``` r -# Load the package -library(crosswalk) - -## obtain a crosswalk to translate data in 2020-vintage place geographies -## to 2020-vintage county geographies, weighted by population -place_county_crosswalk = get_crosswalk( - source_geography = "place", - target_geography = "county", - weight = c("population"), - cache = here::here("data")) - -## obtain a crosswalk to translate data in 2000-vintage place geographies -## to 2010-vintage place geographies. all available weighting options are -## returned -get_crosswalk( - source_year = 2000, - target_year = 2010, - source_geography = "place", - target_geography = "place", - cache = here::here("data")) -``` - -## Why Use `library(crosswalk)`? - -Crosswalks are a critical component of conducting social sciences -research as they enable analysts to translate data from one geography -and/or temporal vintage to another. For example, if source data are only -available at the county level, a crosswalk can help to produce estimates -of those source data at the city level, enabling analysis at a geography -(the city) that may be either more relevant to target audience(s) and/or -may align with the geography of other data that form part of the -overarching analysis. - -There are excellent existing resources for crosswalks, including the -University of Missouri - Missouri Census Data Center's Geocorr 2022 -crosswalking application and the IPUMS National Historical Geographic -Information System (NHGIS). In fact, the crosswalks returned by using -`library(crosswalk)` are those from Geocorr and NHGIS. - -So why use this package at all? It provides: - -- A consistent, programmatic approach to acquire crosswalks, rather - than ad-hoc manual downloads; - -- Standardized and clear crosswalk variable names so that you can - easily work with multiple crosswalks using the same workflow; - -- Crosswalk metadata stored within the returned crosswalk–no more - commenting in your script with the 15 options you configured prior - to clicking "Download"; - -- The ability to easily "cache" crosswalks locally. - -In brief: this package facilitates a well documented and reproducible -analysis workflow, building on top of the robust underlying resources -already available for crosswalking. - -## Citations! - -The intellectual work and credit for the underlying crosswalks returned -by this package belongs to the original developers of those crosswalks. -You should (in the case of Geocorr crosswalks) and in some cases must -(in the case of NHGIS crosswalks) appropriately cite the developers when -you use these resources. - -**For NHGIS**, you should refer to the NHGIS website and terms of use, -including the recommended citations provided at: -Missouri Census Data Center, University of Missouri. (2022). Geocorr +2022: Geographic Correspondence Engine. Retrieved from: https://mcdc.missouri.edu/applications/geocorr2022.html
+