diff --git a/DESCRIPTION b/DESCRIPTION index 51c56b5..4d40343 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -19,6 +19,7 @@ Imports: janitor, purrr, readr, + rlang, rvest, stringr, tibble, diff --git a/NAMESPACE b/NAMESPACE index 5172ca0..e964796 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -2,5 +2,4 @@ export(crosswalk_data) export(get_crosswalk) -export(get_crosswalk_chain) export(list_nhgis_crosswalks) diff --git a/R/crosswalk_data.R b/R/crosswalk_data.R index 917e5f5..f909391 100644 --- a/R/crosswalk_data.R +++ b/R/crosswalk_data.R @@ -1,7 +1,7 @@ ## explicitly enable/acknowledge data.table (used by tidytable) .datatable.aware = TRUE -#' Apply a Crosswalk to Transform Data +#' Interpolate data using a crosswalk(s) #' #' Applies geographic crosswalk weights to transform data from a source geography #' to a target geography. Accepts the output from `get_crosswalk()` and automatically @@ -28,6 +28,11 @@ #' @param return_intermediate Logical. If TRUE and crosswalk has multiple steps, #' returns a list containing both the final result and intermediate results #' from each step. Default is FALSE, which returns only the final result. +#' @param show_join_quality Logical. If TRUE (default), prints diagnostic messages +#' about join quality, including the number of data rows not matching the crosswalk +#' and vice versa. For state-nested geographies (tract, county, block group, etc.), +#' also reports state-level concentration of unmatched rows. Set to FALSE to +#' suppress these messages. #' #' @return If `return_intermediate = FALSE` (default), a tibble with data summarized #' to the final target geography. @@ -55,6 +60,11 @@ #' - Columns starting with "mean_", "median_", "percent_", or "ratio_" are treated #' as non-count variables #' +#' **Other columns**: Columns that are not the geoid column, count columns, or +#' non-count columns (e.g., metadata like `data_year`) are preserved by taking +#' the first non-missing value within each target geography group. If all values +#' are missing, NA is returned. +#' #' **Multi-step crosswalks**: When `get_crosswalk()` returns multiple crosswalks #' (for transformations that change both geography and year), this function #' automatically applies them in sequence. @@ -108,7 +118,8 @@ crosswalk_data <- function( geoid_column = "geoid", count_columns = NULL, non_count_columns = NULL, - return_intermediate = FALSE) { + return_intermediate = FALSE, + show_join_quality = TRUE) { # Determine if crosswalk is a list (from get_crosswalk) or a single tibble crosswalk_list <- extract_crosswalk_list(crosswalk) @@ -166,7 +177,10 @@ crosswalk_data <- function( crosswalk = step_crosswalk, geoid_column = current_geoid_column, count_columns = count_columns, - non_count_columns = non_count_columns) + non_count_columns = non_count_columns, + step_number = i, + total_steps = n_steps, + show_join_quality = show_join_quality) # Store intermediate result if requested if (return_intermediate) { @@ -246,6 +260,295 @@ validate_crosswalk_tibble <- function(crosswalk, name) { } +#' Check if Geography is Nested Within States +#' +#' Internal function that determines whether a geography type has GEOIDs +#' where the first two characters represent the state FIPS code. +#' +#' Geographies nested within states (state FIPS derivable from GEOID): +#' - block, block_group, tract, county, place, puma, congressional districts +#' +#' Geographies NOT nested within states (cross state boundaries): +#' - zcta, cbsa/core_based_statistical_area, urban_area +#' +#' @param geography Character. The geography type to check. +#' @return Logical. TRUE if geography is nested within states. +#' @keywords internal +#' @noRd +is_state_nested_geography <- function(geography) { + if (is.null(geography) || is.na(geography)) { + return(FALSE) + } + + geography_lower <- geography |> + stringr::str_to_lower() |> + stringr::str_squish() |> + stringr::str_replace_all("_", " ") + + # Geographies where GEOID starts with state FIPS + + state_nested <- c( + "block", "blocks", "blk", + "block group", "blockgroup", "bg", + "tract", "tracts", "tr", + "county", "counties", "co", + "place", "places", "pl", + "puma", "pumas", "puma22", + "cd118", "cd119", "congressional district" + ) + + geography_lower %in% state_nested +} + + +#' State FIPS to Abbreviation Lookup +#' +#' Internal lookup table for converting state FIPS codes to abbreviations. +#' @keywords internal +#' @noRd +state_fips_to_abbr <- tibble::tibble( + state_fips = c( + "01", "02", "04", "05", "06", "08", "09", "10", "11", "12", + "13", "15", "16", "17", "18", "19", "20", "21", "22", "23", + "24", "25", "26", "27", "28", "29", "30", "31", "32", "33", + "34", "35", "36", "37", "38", "39", "40", "41", "42", "44", + "45", "46", "47", "48", "49", "50", "51", "53", "54", "55", + "56", "72", "78", "66", "60", "69" + ), + state_abbr = c( + "AL", "AK", "AZ", "AR", "CA", "CO", "CT", "DE", "DC", "FL", + "GA", "HI", "ID", "IL", "IN", "IA", "KS", "KY", "LA", "ME", + "MD", "MA", "MI", "MN", "MS", "MO", "MT", "NE", "NV", "NH", + "NJ", "NM", "NY", "NC", "ND", "OH", "OK", "OR", "PA", "RI", + "SC", "SD", "TN", "TX", "UT", "VT", "VA", "WA", "WV", "WI", + "WY", "PR", "VI", "GU", "AS", "MP" + ) +) + + +#' Analyze State Concentration of Unmatched GEOIDs +#' +#' Internal function that analyzes the state-level distribution of unmatched GEOIDs. +#' +#' @param unmatched_geoids Character vector of GEOIDs that did not match. +#' @return A list with state_counts tibble, top_states tibble, and is_concentrated flag. +#' @keywords internal +#' @noRd +analyze_state_concentration <- function(unmatched_geoids) { + if (length(unmatched_geoids) == 0) { + return(NULL) + } + + state_counts <- tibble::tibble( + geoid = unmatched_geoids, + state_fips = stringr::str_sub(geoid, 1, 2) + ) |> + dplyr::count(state_fips, name = "n_unmatched") |> + dplyr::mutate( + pct_of_unmatched = n_unmatched / sum(n_unmatched) * 100 + ) |> + dplyr::arrange(dplyr::desc(n_unmatched)) + + top_states <- state_counts |> + dplyr::slice_head(n = 3) + + is_concentrated <- any(state_counts$pct_of_unmatched > 15) + + list( + state_counts = state_counts, + top_states = top_states, + is_concentrated = is_concentrated + ) +} + + +#' Format Join Quality Message +#' +#' Internal function that formats join quality diagnostics as messages. +#' +#' @param join_quality List of join quality statistics. +#' @param step_number Integer. Current step number. +#' @param total_steps Integer. Total number of steps. +#' @return Character vector of messages to print. +#' @keywords internal +#' @noRd +format_join_quality_message <- function(join_quality, step_number, total_steps) { + messages <- character(0) + + step_prefix <- if (total_steps > 1) { + stringr::str_c("Step ", step_number, " join quality: ") + } else { + "Join quality: " + } + + # Report unmatched data rows + if (join_quality$n_data_unmatched > 0) { + msg1 <- stringr::str_c( + step_prefix, + format(join_quality$n_data_unmatched, big.mark = ","), + " of ", + format(join_quality$n_data_total, big.mark = ","), + " data rows (", + sprintf("%.1f%%", join_quality$pct_data_unmatched), + ") did not match the crosswalk." + ) + messages <- c(messages, msg1) + + # Add state concentration info if available + if (!is.null(join_quality$state_analysis_data) && + nrow(join_quality$state_analysis_data$top_states) > 0) { + + top_states_str <- join_quality$state_analysis_data$top_states |> + dplyr::left_join(state_fips_to_abbr, by = "state_fips") |> + dplyr::mutate( + state_abbr = dplyr::if_else(is.na(state_abbr), state_fips, state_abbr), + label = stringr::str_c( + state_abbr, " (", + sprintf("%.0f%%", pct_of_unmatched), ", ", + format(n_unmatched, big.mark = ","), " rows)" + ) + ) |> + dplyr::pull(label) |> + stringr::str_c(collapse = ", ") + + msg2 <- stringr::str_c(" Top states with unmatched data rows: ", top_states_str) + messages <- c(messages, msg2) + } + } + + # Report crosswalk source GEOIDs not in data + if (join_quality$n_crosswalk_unmatched > 0) { + msg3 <- stringr::str_c( + step_prefix, + format(join_quality$n_crosswalk_unmatched, big.mark = ","), + " of ", + format(join_quality$n_crosswalk_total, big.mark = ","), + " crosswalk source GEOIDs (", + sprintf("%.1f%%", join_quality$pct_crosswalk_unmatched), + ") were not in input data." + ) + messages <- c(messages, msg3) + + # Add state concentration info if available + if (!is.null(join_quality$state_analysis_crosswalk) && + nrow(join_quality$state_analysis_crosswalk$top_states) > 0) { + + top_states_str <- join_quality$state_analysis_crosswalk$top_states |> + dplyr::left_join(state_fips_to_abbr, by = "state_fips") |> + dplyr::mutate( + state_abbr = dplyr::if_else(is.na(state_abbr), state_fips, state_abbr), + label = stringr::str_c( + state_abbr, " (", + sprintf("%.0f%%", pct_of_unmatched), ", ", + format(n_unmatched, big.mark = ","), " rows)" + ) + ) |> + dplyr::pull(label) |> + stringr::str_c(collapse = ", ") + + msg4 <- stringr::str_c(" Top states not in data: ", top_states_str) + messages <- c(messages, msg4) + } + + msg5 <- " (This is expected if your data covers a geographic subset.)" + messages <- c(messages, msg5) + } + + return(messages) +} + + +#' Report Join Quality Between Data and Crosswalk +#' +#' Internal function that computes and reports join quality diagnostics. +#' +#' @param data The input data tibble. +#' @param crosswalk The crosswalk tibble. +#' @param geoid_column Column name for source geoid in data. +#' @param step_number Integer. Which step this is (for multi-step reporting). +#' @param total_steps Integer. Total number of steps. +#' @param source_geography Character or NULL. The source geography type, used to +#' determine if state-level analysis is applicable. +#' @return A list with join quality statistics (also prints messages if issues found). +#' @keywords internal +#' @noRd +report_join_quality <- function(data, crosswalk, geoid_column, step_number = 1, + total_steps = 1, source_geography = NULL) { + + # Ensure geoid columns are character for consistent comparison + data_geoids <- data |> + dplyr::mutate(dplyr::across(dplyr::all_of(geoid_column), as.character)) |> + dplyr::pull(!!rlang::sym(geoid_column)) |> + unique() + + crosswalk_source_geoids <- crosswalk |> + dplyr::pull(source_geoid) |> + unique() + + # Data rows not in crosswalk + data_not_in_crosswalk <- setdiff(data_geoids, crosswalk_source_geoids) + n_data_unmatched <- length(data_not_in_crosswalk) + n_data_total <- length(data_geoids) + pct_data_unmatched <- if (n_data_total > 0) { + n_data_unmatched / n_data_total * 100 + } else { + 0 + } + + # Crosswalk source GEOIDs not in data + crosswalk_not_in_data <- setdiff(crosswalk_source_geoids, data_geoids) + n_crosswalk_unmatched <- length(crosswalk_not_in_data) + n_crosswalk_total <- length(crosswalk_source_geoids) + pct_crosswalk_unmatched <- if (n_crosswalk_total > 0) { + n_crosswalk_unmatched / n_crosswalk_total * 100 + } else { + 0 + } + + # State concentration analysis only for state-nested geographies + # (ZCTAs, CBSAs, urban areas cross state boundaries so state analysis not meaningful) + do_state_analysis <- is_state_nested_geography(source_geography) + + # State concentration analysis for unmatched data rows + state_analysis_data <- if (n_data_unmatched > 0 && do_state_analysis) { + analyze_state_concentration(data_not_in_crosswalk) + } else { + NULL + } + + # State concentration analysis for crosswalk rows not in data + state_analysis_crosswalk <- if (n_crosswalk_unmatched > 0 && do_state_analysis) { + analyze_state_concentration(crosswalk_not_in_data) + } else { + NULL + } + + # Build quality statistics + join_quality <- list( + n_data_total = n_data_total, + n_data_unmatched = n_data_unmatched, + pct_data_unmatched = pct_data_unmatched, + data_geoids_unmatched = data_not_in_crosswalk, + state_analysis_data = state_analysis_data, + n_crosswalk_total = n_crosswalk_total, + n_crosswalk_unmatched = n_crosswalk_unmatched, + pct_crosswalk_unmatched = pct_crosswalk_unmatched, + crosswalk_geoids_unmatched = crosswalk_not_in_data, + state_analysis_crosswalk = state_analysis_crosswalk, + source_geography = source_geography, + state_analysis_applicable = do_state_analysis + ) + + # Print messages if there are issues + messages <- format_join_quality_message(join_quality, step_number, total_steps) + if (length(messages) > 0) { + purrr::walk(messages, message) + } + + return(join_quality) +} + + #' Apply a Single Crosswalk Step #' #' Internal function that applies one crosswalk tibble to data. @@ -255,6 +558,9 @@ validate_crosswalk_tibble <- function(crosswalk, name) { #' @param geoid_column Column name for source geoid #' @param count_columns Count variable columns #' @param non_count_columns Non-count variable columns +#' @param step_number Integer. Current step number for multi-step reporting. +#' @param total_steps Integer. Total number of steps for multi-step reporting. +#' @param show_join_quality Logical. Whether to report join quality diagnostics. #' @return Crosswalked data #' @keywords internal #' @noRd @@ -263,7 +569,10 @@ apply_single_crosswalk <- function( crosswalk, geoid_column, count_columns, - non_count_columns) { + non_count_columns, + step_number = 1, + total_steps = 1, + show_join_quality = TRUE) { # Check if crosswalk is empty if (nrow(crosswalk) == 0) { @@ -276,6 +585,26 @@ apply_single_crosswalk <- function( # Store metadata for later attachment crosswalk_metadata <- attr(crosswalk, "crosswalk_metadata") + # Extract source geography from metadata for state analysis determination + source_geography <- if (!is.null(crosswalk_metadata)) { + crosswalk_metadata$source_geography + } else { + NULL + } + + # Report join quality (if enabled) + join_quality <- if (show_join_quality) { + report_join_quality( + data = data, + crosswalk = crosswalk, + geoid_column = geoid_column, + step_number = step_number, + total_steps = total_steps, + source_geography = source_geography) + } else { + NULL + } + # Determine grouping columns (target_geography_name may not always be present) group_cols <- "target_geoid" if ("target_geography_name" %in% names(crosswalk)) { @@ -286,6 +615,17 @@ apply_single_crosswalk <- function( current_count_cols <- intersect(count_columns, names(data)) current_non_count_cols <- intersect(non_count_columns, names(data)) + # Identify "other" columns (not geoid, count, or non-count columns) + # These will be aggregated by taking the first non-missing value + crosswalk_cols <- c("source_geoid", "target_geoid", "allocation_factor_source_to_target", + "target_geography_name", "weighting_factor", "source_year", "target_year", + "population_2020", "housing_2020", "land_area_sqmi") + other_cols <- setdiff( + names(data), + c(geoid_column, current_count_cols, current_non_count_cols, crosswalk_cols) + ) + + # Join crosswalk to data result <- data |> dplyr::mutate( @@ -305,6 +645,10 @@ apply_single_crosswalk <- function( tidytable::across( .cols = tidytable::all_of(current_non_count_cols), .fns = ~ stats::weighted.mean(.x, allocation_factor_source_to_target, na.rm = TRUE)), + ## other columns: take first non-missing value (or NA if all missing) + tidytable::across( + .cols = tidytable::all_of(other_cols), + .fns = ~ dplyr::first(.x, na_rm = TRUE)), tidytable::across( .cols = tidytable::all_of(c(current_count_cols, current_non_count_cols)), .fns = ~ sum(!is.na(.x)), @@ -312,7 +656,7 @@ apply_single_crosswalk <- function( tidytable::mutate( tidytable::across( .cols = tidytable::all_of(c(current_count_cols, current_non_count_cols)), - .fns = ~ tidytable::if_else(get(stringr::str_c(tidytable::cur_column(), "_validx$")) > 0, .x, NA))) |> + .fns = ~ tidytable::if_else(get(stringr::str_c(tidytable::cur_column(), "_validx")) > 0, .x, NA))) |> dplyr::select(-dplyr::matches("_validx$")) |> dplyr::rename_with( .cols = dplyr::everything(), @@ -322,5 +666,8 @@ apply_single_crosswalk <- function( # Attach metadata attr(result, "crosswalk_metadata") <- crosswalk_metadata + # Attach join quality statistics + attr(result, "join_quality") <- join_quality + return(result) } diff --git a/R/get_crosswalk.R b/R/get_crosswalk.R index add135b..5f8b0f3 100644 --- a/R/get_crosswalk.R +++ b/R/get_crosswalk.R @@ -1,4 +1,4 @@ -#' Get a Geographic Crosswalk +#' Get a crosswalk(s) to translate data across time and geographies #' #' Retrieves a crosswalk with interpolation values from a source geography to a target #' geography, optionally across different years. Always returns a list with a consistent @@ -359,4 +359,6 @@ and counties. The provided geography '", geography, "' is not supported.")} return(result) } -utils::globalVariables(c("allocation_factor_source_to_target")) \ No newline at end of file +utils::globalVariables(c( + "allocation_factor_source_to_target", "geoid", "label", + "n_unmatched", "pct_of_unmatched", "state_abbr")) \ No newline at end of file diff --git a/R/get_crosswalk_chain.R b/R/get_crosswalk_chain.R index 1f630be..44bcdc4 100644 --- a/R/get_crosswalk_chain.R +++ b/R/get_crosswalk_chain.R @@ -18,28 +18,8 @@ #' \item{message}{A formatted message describing the crosswalk chain} #' } #' -#' @export -#' @examples -#' \dontrun{ -#' # Get crosswalks for 2010 tracts to 2020 ZCTAs (requires two steps) -#' chain <- get_crosswalk_chain( -#' source_geography = "tract", -#' target_geography = "zcta", -#' source_year = 2010, -#' target_year = 2020, -#' weight = "population") -#' -#' # Apply crosswalks sequentially -#' data_step1 <- crosswalk_data( -#' data = my_data, -#' crosswalk = chain$crosswalks$step_1, -#' count_columns = "count_population") -#' -#' data_final <- crosswalk_data( -#' data = data_step1, -#' crosswalk = chain$crosswalks$step_2, -#' count_columns = "count_population") -#' } +#' @keywords internal +#' @noRd get_crosswalk_chain <- function( source_geography, target_geography, diff --git a/R/get_geocorr_crosswalk.R b/R/get_geocorr_crosswalk.R index e9c9ba0..0dbcf20 100644 --- a/R/get_geocorr_crosswalk.R +++ b/R/get_geocorr_crosswalk.R @@ -44,7 +44,7 @@ get_geocorr_crosswalk <- function( target_geography, weight = "population", cache = NULL) { - + outpath = "no file exists here" ## identify the relevant file paths for potentially-cached crosswalks if (!is.null(cache)) { @@ -239,7 +239,7 @@ get_geocorr_crosswalk <- function( df1 = readr::read_csv(file.path("https://mcdc.missouri.edu", "temp", csv_path), show_col_types = FALSE) |> janitor::clean_names() } - + df2 = df1 |> dplyr::slice(2:nrow(df1)) |> ## naming conventions for some geographies are inconsistent; we standardize @@ -291,7 +291,10 @@ get_geocorr_crosswalk <- function( dplyr::across( .cols = dplyr::matches("^cd11"), .fns = ~ stringr::str_c(stab, "-", .x), - .names = "{.col}_name")) |> + .names = "{.col}_name"), + dplyr::across( + .cols = dplyr::matches("puma22"), + .fns = ~ stringr::str_c(state, .x) )) |> dplyr::rename_with( .cols = dplyr::matches("state|stab"), .fn = ~ stringr::str_replace_all(.x, c("state" = "state_fips", "stab" = "state_abbreviation"))) |> diff --git a/R/get_nhgis_crosswalk.R b/R/get_nhgis_crosswalk.R index c36f0df..8f0d6e9 100644 --- a/R/get_nhgis_crosswalk.R +++ b/R/get_nhgis_crosswalk.R @@ -109,7 +109,7 @@ standardize_geography <- function(geography, context = "source") { available crosswalks.") } -#' List Available NHGIS Crosswalks +#' List supported NHGIS crosswalks #' #' Returns a tibble of all available NHGIS geographic crosswalks with their #' corresponding parameters that can be used with get_nhgis_crosswalk(). @@ -408,6 +408,80 @@ list_nhgis_crosswalks <- function() { return(nhgis_crosswalks) } + +#' Check if Direct NHGIS Crosswalk is Available +#' +#' Internal helper function that checks if NHGIS provides a direct crosswalk +#' for the given source/target geography and year combination. This is used +#' to determine if a single-step NHGIS crosswalk can be used instead of +#' a multi-step approach. +#' +#' @param source_geography Character. Source geography name. +#' @param target_geography Character. Target geography name. +#' @param source_year Character or numeric. Source year. +#' @param target_year Character or numeric. Target year. +#' +#' @return Logical. TRUE if NHGIS has a direct crosswalk, FALSE otherwise. +#' @keywords internal +#' @noRd +is_nhgis_crosswalk_available <- function( + source_geography, + target_geography, + source_year, + target_year) { + + # Get the list of available crosswalks + available_crosswalks <- list_nhgis_crosswalks() + + # Standardize inputs for matching + source_year_chr <- as.character(source_year) + target_year_chr <- as.character(target_year) + + # Standardize geography names to match list_nhgis_crosswalks() output + source_geog_std <- standardize_geography_for_nhgis_check(source_geography) + target_geog_std <- standardize_geography_for_nhgis_check(target_geography) + + # Check if matching crosswalk exists + match_exists <- available_crosswalks |> + dplyr::filter( + source_geography == source_geog_std, + target_geography == target_geog_std, + source_year == source_year_chr, + target_year == target_year_chr) |> + nrow() > 0 + + return(match_exists) +} + + +#' Standardize Geography for NHGIS Availability Check +#' +#' Internal helper to standardize geography names to match list_nhgis_crosswalks() output. +#' +#' @param geography Character. Geography name in various formats. +#' @return Character. Standardized geography name matching list_nhgis_crosswalks() output. +#' @keywords internal +#' @noRd +standardize_geography_for_nhgis_check <- function(geography) { + geography <- geography |> + stringr::str_to_lower() |> + stringr::str_squish() |> + stringr::str_replace_all("_", " ") + + dplyr::case_when( + geography %in% c("blk", "block", "blocks", "census block") ~ "block", + geography %in% c("bg", "blockgroup", "block group", "census block group") ~ "block_group", + geography %in% c("tr", "tract", "tracts", "census tract") ~ "tract", + geography %in% c("co", "county", "counties", "cnty") ~ "county", + geography %in% c("pl", "place", "places") ~ "place", + geography %in% c("zcta", "zctas", "zip code", "zip code tabulation area") ~ "zcta", + geography %in% c("puma", "pumas", "puma22", "public use microdata area") ~ "puma", + geography %in% c("cbsa", "core based statistical area") ~ "core_based_statistical_area", + geography %in% c("ua", "urban area", "urban areas") ~ "urban_area", + TRUE ~ geography) +} + + #' Get NHGIS Geographic Crosswalk #' #' Retrieves a geographic crosswalk from the IPUMS NHGIS API based on user-specified diff --git a/R/plan_crosswalk_chain.R b/R/plan_crosswalk_chain.R index ee1a38b..20be49f 100644 --- a/R/plan_crosswalk_chain.R +++ b/R/plan_crosswalk_chain.R @@ -113,7 +113,32 @@ plan_crosswalk_chain <- function( return(result) } - # Case 4: Different geography AND different year - multi-step required + # Case 4: Different geography AND different year + # First, check if NHGIS has a direct crosswalk for this combination + # (e.g., block 2010 -> zcta 2020 is available directly from NHGIS) + nhgis_direct_available <- is_nhgis_crosswalk_available( + source_geography = source_geography, + target_geography = target_geography, + source_year = source_year, + target_year = target_year) + + if (nhgis_direct_available) { + # Single-step NHGIS crosswalk available + result$steps <- tibble::tibble( + step_number = 1L, + source_geography = source_geography, + source_year = source_year_chr, + target_geography = target_geography, + target_year = target_year_chr, + crosswalk_source = "nhgis", + description = stringr::str_c( + source_year_chr, " ", source_geog_std, " -> ", + target_year_chr, " ", target_geog_std, " (direct NHGIS crosswalk)")) + result$composition_note <- "Single crosswalk; use allocation_factor_source_to_target directly." + return(result) + } + + # No direct NHGIS crosswalk available - multi-step required result$is_multi_step <- TRUE # Check if NHGIS supports the source geography for inter-temporal crosswalk diff --git a/README.Rmd b/README.Rmd new file mode 100644 index 0000000..3be4410 --- /dev/null +++ b/README.Rmd @@ -0,0 +1,336 @@ +--- +output: github_document +--- + +```{r, include = FALSE} +knitr::opts_chunk$set( + collapse = TRUE, + comment = "#>", + fig.path = "man/figures/README-", + out.width = "100%", + eval = FALSE +) +``` + +# crosswalk + +An R interface to inter-geography and inter-temporal crosswalks. + +## Overview + +This package provides a consistent API and standardized versions of crosswalks to enable consistent approaches that work across different geography and year combinations. The package also facilitates +interpolation--that is, adjusting source geography/year values by their crosswalk weights and translating +these values to the desired target geography/year--including diagnostics of the joins between source data +and crosswalks. + +The package sources crosswalks from: + +- **Geocorr 2022** (Missouri Census Data Center) - for same-year crosswalks between geographies +- **IPUMS NHGIS** - for inter-temporal crosswalks (across different census years) +- **CT Data Collaborative** - for Connecticut 2020→2022 crosswalks (planning region changes) + +## Why Use `crosswalk`? + +- **Programmatic access**: No more manual downloads from web interfaces +- **Standardized output**: Consistent column names across all crosswalk sources +- **Metadata tracking**: Full provenance stored as attributes +- **Multi-step handling**: Automatic chaining when both geography and year change +- **Local caching**: Reproducible workflows with cached crosswalks + +## Installation + +```{r} +# Install from GitHub +renv::install("UI-Research/crosswalk") +``` + +## Overview + +First we obtain a crosswalk and apply it to our data: +```{r} +library(crosswalk) +library(dplyr) +library(stringr) +library(sf) + +source_data = tidycensus::get_acs( + year = 2023, + geography = "zcta", + output = "wide", + variables = c(below_poverty_level = "B17001_002")) %>% + dplyr::select( + source_geoid = GEOID, + count_below_poverty_level = below_poverty_levelE) + +get_crosswalk( + source_geography = "block", + target_geography = "puma", + source_year = 2010, + target_year = 2020, + weight = "population") + +# Get a crosswalk from ZCTAs to PUMAs (same year, uses Geocorr (2022)) +zcta_puma_crosswalk <- get_crosswalk( + source_geography = "zcta", + target_geography = "puma22", + weight = "population") + +# Apply the crosswalk to your data +crosswalked_data <- crosswalk_data( + geoid_column = "source_geoid", + data = source_data, + crosswalk = zcta_puma_crosswalk) +``` + +What does the crosswalk(s) reflect and how was it sourced? +```{r} +attr(crosswalked_data, "crosswalk_metadata") +``` + +How well did the crosswalk join to our source data? +```{r} +## look at all the characteristics of the join(s) between the source data +## and the crosswalks +join_quality = attr(crosswalked_data, "join_quality") + +## what share of records in the source data do not join to a crosswalk and +## thus are dropped during the crosswalking process? +join_quality$pct_data_unmatched + +## zctas aren't nested within states, otherwise join_quality$state_analysis_data +## would help us to ID whether non-joining source data were clustered within one +## or a few states. instead we can join to spatial data to diagnose further: +zctas_sf = tigris::zctas(year = 2023) +states_sf = tigris::states(year = 2023, cb = TRUE) + +## apart from DC, which has a disproportionate number of non-joining ZCTAs-- +## seemingly corresponding to federal areas and buildings--the distribution of +## non-joining ZCTAs appears proportionate to state-level populations and is +## distributed across many states: +zctas_sf %>% + dplyr::filter(GEOID20 %in% join_quality$data_geoids_unmatched) %>% + sf::st_intersection(states_sf %>% select(NAME)) %>% + sf::st_drop_geometry() %>% + dplyr::count(NAME, sort = TRUE) +``` + +And how accurate was the crosswalking process? +```{r} +comparison_data = tidycensus::get_acs( + year = 2023, + geography = "puma", + output = "wide", + variables = c( + below_poverty_level = "B17001_002")) %>% + dplyr::select( + source_geoid = GEOID, + count_below_poverty_level_acs = below_poverty_levelE) + +combined_data = dplyr::left_join( + comparison_data, + crosswalked_data, + by = c("source_geoid" = "geoid")) + +combined_data %>% + dplyr::select(source_geoid, dplyr::matches("count")) %>% + dplyr::mutate(difference_percent = (count_below_poverty_level_acs - count_below_poverty_level) / count_below_poverty_level_acs) %>% + ggplot2::ggplot() + + ggplot2::geom_histogram(ggplot2::aes(x = difference_percent)) + + ggplot2::theme_minimal() + + ggplot2::theme(panel.grid = ggplot2::element_blank()) + + ggplot2::scale_x_continuous(labels = scales::percent) + + ggplot2::labs( + title = "Crosswalked data approximates observed values", + subtitle = "Block group-level source data would produce more accurate crosswalked values", + y = "", + x = "Percent difference between observed and crosswalked values") +``` + +## Core Functions + +The package has two main functions: + +| Function | Purpose | +|--------------------------------------|----------------------------------| +| `get_crosswalk()` | Fetch crosswalk(s) | +| `crosswalk_data()` | Apply crosswalk(s) to interpolate data to the target geography-year | + +## Understanding `get_crosswalk()` Output + +`get_crosswalk()` **always returns a list** structured as follows: + +```{r} +result <- get_crosswalk( + source_geography = "tract", + target_geography = "zcta", + source_year = 2010, + target_year = 2020, + weight = "population" +) + +names(result) +#> [1] "crosswalks" "plan" "message" +``` + +The list contains three elements: + +| Element | Description | +|------------------------------|------------------------------------------| +| `crosswalks` | A named list of crosswalks (`step_1`, `step_2`, etc.) of length one or greater | +| `plan` | Details about what crosswalks are being fetched | +| `message` | A human-readable description of the crosswalk chain | + +### Single-Step vs. Multi-Step Crosswalks + +**Single-step crosswalks** (same year, different geography OR same geography, different year): + +```{r} +# Same year, different geography (Geocorr) +result <- get_crosswalk( + source_geography = "tract", + target_geography = "zcta", + weight = "population" +) +# result$crosswalks$step_1 contains one crosswalk + +# Same geography, different year (NHGIS) +result <- get_crosswalk( + source_geography = "tract", + target_geography = "tract", + source_year = 2010, + target_year = 2020 +) +# result$crosswalks$step_1 contains one crosswalk +``` + +**Multi-step crosswalks** (different geography AND different year): + +When both geography and year change, no single crosswalk source provides this directly. The package automatically plans and fetches a two-step chain: + +1. **Step 1 (NHGIS)**: Change year, keep geography constant +2. **Step 2 (Geocorr)**: Change geography at target year + +```{r} +result <- get_crosswalk( + source_geography = "tract", + target_geography = "zcta", + source_year = 2010, + target_year = 2020, + weight = "population" +) + +# Two crosswalks are returned +names(result$crosswalks) +#> [1] "step_1" "step_2" + +# Step 1: 2010 tracts -> 2020 tracts (NHGIS) +# Step 2: 2020 tracts -> 2020 ZCTAs (Geocorr) +``` + +### Crosswalk Structure + +Each crosswalk contains standardized columns: + +| Column | Description | +|----------------------------|--------------------------------------------| +| `source_geoid` | Identifier for source geography | +| `target_geoid` | Identifier for target geography | +| `allocation_factor_source_to_target` | Weight for interpolating values | +| `weighting_factor` | What attribute was used (population, housing, land) | + +Additional columns may include `source_year`, `target_year`, `population_2020`, `housing_2020`, and `land_area_sqmi` depending on the source. + +### Accessing Metadata + +Each crosswalk tibble has a `crosswalk_metadata` attribute that documents what the crosswalk +represents and how it was created: + +```{r} +metadata <- attr(result$crosswalks$step_1, "crosswalk_metadata") +names(metadata) +#> [1] "call_parameters" "data_source" "data_source_full_name" "download_url" ... +``` + +## Using `crosswalk_data()` to Interpolate Data + +`crosswalk_data()` applies crosswalk weights to transform your data. It automatically handles multi-step crosswalks. + +### Column Naming Convention + +The function auto-detects columns based on prefixes: + +| Prefix | Treatment | +|-------------------------------|-----------------------------------------| +| `count_` | Summed after weighting (for counts like population, housing units) | +| `mean_`, `median_`, `percent_`, `ratio_` | Weighted mean (for rates, percentages, averages) | + +You can also specify columns explicitly via `count_columns` and `non_count_columns`. +All non-count variables are interpolated using weighted means, weighting by the allocation factor from the crosswalk. + + +## Supported Geography and Year Combinations + +### Inter-Geography Crosswalks (Geocorr) + +2022-vintage crosswalks between any of these geographies: + +- block, block group, tract, county +- place, zcta, puma22 +- cd118, cd119, urban_area, core_based_statistical_area + +### Inter-Temporal Crosswalks (NHGIS) + +NHGIS provides cross-decade crosswalks with the following structure: + +**Source geographies:** block, block_group, tract + +**Target geographies:** +- From blocks (decennial years only): block, block_group, tract, county, place, zcta, puma, urban_area, cbsa +- From block_group or tract: block_group, tract, county + +| Source Years | Target Years | +|------------------------------|------------------------------| +| 1990, 2000 | 2010, 2014, 2015, 2020, 2022 | +| 2010, 2011, 2012, 2014, 2015 | 1990, 2000, 2020, 2022 | +| 2020, 2022 | 1990, 2000, 2010, 2014, 2015 | + +**Notes:** +- Within-decade crosswalks (e.g., 2010→2014) are not available from NHGIS +- Block→ZCTA, Block→PUMA, etc. are only available for decennial years (1990, 2000, 2010, 2020) +- The package automatically uses direct NHGIS crosswalks when available (e.g., `get_crosswalk(source_geography = "block", target_geography = "zcta", source_year = 2010, target_year = 2020)` returns a single-step NHGIS crosswalk) + +### 2020→2022 Crosswalks (CTData) + +For 2020 to 2022 transformations, the package uses CT Data Collaborative crosswalks for Connecticut (where planning regions replaced counties) and identity mappings for other states (where no changes occurred). + +## API Keys + +NHGIS crosswalks require an IPUMS API key. Get one at https://account.ipums.org/api_keys and add to your `.Renviron`: + +```{r} +usethis::edit_r_environ() +# Add: IPUMS_API_KEY=your_key_here +``` + +## Caching + +Use the `cache` parameter to save crosswalks locally for ease: + +```{r} +result <- get_crosswalk( + source_geography = "tract", + target_geography = "zcta", + weight = "population", + cache = here::here("crosswalks-cache")) +``` + + +## Citations + +The intellectual credit for the underlying crosswalks belongs to the original developers. + +**For NHGIS**, see citation requirements at: https://www.nhgis.org/citation-and-use-nhgis-data + +**For Geocorr**, a suggested citation: + +> Missouri Census Data Center, University of Missouri. (2022). Geocorr 2022: Geographic Correspondence Engine. Retrieved from: https://mcdc.missouri.edu/applications/geocorr2022.html \ No newline at end of file diff --git a/README.html b/README.html new file mode 100644 index 0000000..575b5f1 --- /dev/null +++ b/README.html @@ -0,0 +1,976 @@ + + + + + + + + + + + + + + + + + + + +

crosswalk

+

An R interface to inter-geography and inter-temporal crosswalks.

+

Overview

+

This package provides a consistent API and standardized versions of +crosswalks to enable consistent approaches that work across different +geography and year combinations. The package also facilitates +interpolation–that is, adjusting source geography/year values by their +crosswalk weights and translating these values to the desired target +geography/year–including diagnostics of the joins between source data +and crosswalks.

+

The package sources crosswalks from:

+ +

Why Use crosswalk?

+ +

Installation

+
# Install from GitHub
+renv::install("UI-Research/crosswalk")
+

Overview

+

First we obtain a crosswalk and apply it to our data:

+
library(crosswalk)
+library(dplyr)
+library(stringr)
+library(sf)
+
+source_data = tidycensus::get_acs(
+    year = 2023,
+    geography = "zcta",
+    output = "wide",
+    variables = c(below_poverty_level = "B17001_002")) %>%
+  dplyr::select(
+    source_geoid = GEOID,
+    count_below_poverty_level = below_poverty_levelE)
+
+get_crosswalk(
+  source_geography = "block",
+  target_geography = "puma",
+  source_year = 2010,
+  target_year = 2020,
+  weight = "population")
+
+# Get a crosswalk from ZCTAs to PUMAs (same year, uses Geocorr (2022))
+zcta_puma_crosswalk <- get_crosswalk(
+  source_geography = "zcta",
+  target_geography = "puma22",
+  weight = "population")
+
+# Apply the crosswalk to your data
+crosswalked_data <- crosswalk_data(
+  geoid_column = "source_geoid",
+  data = source_data,
+  crosswalk = zcta_puma_crosswalk)
+

What does the crosswalk(s) reflect and how was it sourced?

+
attr(crosswalked_data, "crosswalk_metadata")
+

How well did the crosswalk join to our source data?

+
## look at all the characteristics of the join(s) between the source data
+## and the crosswalks
+join_quality = attr(crosswalked_data, "join_quality")
+
+## what share of records in the source data do not join to a crosswalk and
+## thus are dropped during the crosswalking process?
+join_quality$pct_data_unmatched
+
+## zctas aren't nested within states, otherwise join_quality$state_analysis_data 
+## would help us to ID whether non-joining source data were clustered within one
+## or a few states. instead we can join to spatial data to diagnose further:
+zctas_sf = tigris::zctas(year = 2023)
+states_sf = tigris::states(year = 2023, cb = TRUE)
+
+## apart from DC, which has a disproportionate number of non-joining ZCTAs--
+## seemingly corresponding to federal areas and buildings--the distribution of
+## non-joining ZCTAs appears proportionate to state-level populations and is 
+## distributed across many states:
+zctas_sf %>% 
+  dplyr::filter(GEOID20 %in% join_quality$data_geoids_unmatched) %>%
+  sf::st_intersection(states_sf %>% select(NAME)) %>%
+  sf::st_drop_geometry() %>%
+  dplyr::count(NAME, sort = TRUE)
+

And how accurate was the crosswalking process?

+
comparison_data = tidycensus::get_acs(
+    year = 2023,
+    geography = "puma",
+    output = "wide",
+    variables = c(
+      below_poverty_level = "B17001_002")) %>%
+  dplyr::select(
+    source_geoid = GEOID,
+    count_below_poverty_level_acs = below_poverty_levelE)
+
+combined_data = dplyr::left_join(
+  comparison_data,
+  crosswalked_data,
+  by = c("source_geoid" = "geoid")) 
+  
+combined_data %>%
+  dplyr::select(source_geoid, dplyr::matches("count")) %>%
+  dplyr::mutate(difference_percent = (count_below_poverty_level_acs - count_below_poverty_level) / count_below_poverty_level_acs) %>%
+  ggplot2::ggplot() +
+    ggplot2::geom_histogram(ggplot2::aes(x = difference_percent)) +
+    ggplot2::theme_minimal() +
+    ggplot2::theme(panel.grid = ggplot2::element_blank()) +
+    ggplot2::scale_x_continuous(labels = scales::percent) +
+    ggplot2::labs(
+      title = "Crosswalked data approximates observed values",
+      subtitle = "Block group-level source data would produce more accurate crosswalked values",
+      y = "",
+      x = "Percent difference between observed and crosswalked values")
+

Core Functions

+

The package has two main functions:

+ + + + + + + + + + + + + + + + + +
FunctionPurpose
get_crosswalk()Fetch crosswalk(s)
crosswalk_data()Apply crosswalk(s) to interpolate data to the target +geography-year
+

Understanding +get_crosswalk() Output

+

get_crosswalk() always returns a list +structured as follows:

+
result <- get_crosswalk(
+  source_geography = "tract",
+  target_geography = "zcta",
+  source_year = 2010,
+  target_year = 2020,
+  weight = "population"
+)
+
+names(result)
+#> [1] "crosswalks" "plan" "message"
+

The list contains three elements:

+ + + + + + + + + + + + + + + + + + + + + +
ElementDescription
crosswalksA named list of crosswalks (step_1, +step_2, etc.) of length one or greater
planDetails about what crosswalks are being fetched
messageA human-readable description of the crosswalk chain
+

Single-Step vs. Multi-Step +Crosswalks

+

Single-step crosswalks (same year, different +geography OR same geography, different year):

+
# Same year, different geography (Geocorr)
+result <- get_crosswalk(
+  source_geography = "tract",
+  target_geography = "zcta",
+  weight = "population"
+)
+# result$crosswalks$step_1 contains one crosswalk
+
+# Same geography, different year (NHGIS)
+result <- get_crosswalk(
+  source_geography = "tract",
+  target_geography = "tract",
+  source_year = 2010,
+  target_year = 2020
+)
+# result$crosswalks$step_1 contains one crosswalk
+

Multi-step crosswalks (different geography AND +different year):

+

When both geography and year change, no single crosswalk source +provides this directly. The package automatically plans and fetches a +two-step chain:

+
    +
  1. Step 1 (NHGIS): Change year, keep geography +constant
  2. +
  3. Step 2 (Geocorr): Change geography at target +year
  4. +
+
result <- get_crosswalk(
+  source_geography = "tract",
+  target_geography = "zcta",
+  source_year = 2010,
+  target_year = 2020,
+  weight = "population"
+)
+
+# Two crosswalks are returned
+names(result$crosswalks)
+#> [1] "step_1" "step_2"
+
+# Step 1: 2010 tracts -> 2020 tracts (NHGIS)
+# Step 2: 2020 tracts -> 2020 ZCTAs (Geocorr)
+

Crosswalk Structure

+

Each crosswalk contains standardized columns:

+ + + + + + + + + + + + + + + + + + + + + + + + + +
ColumnDescription
source_geoidIdentifier for source geography
target_geoidIdentifier for target geography
allocation_factor_source_to_targetWeight for interpolating values
weighting_factorWhat attribute was used (population, housing, land)
+

Additional columns may include source_year, +target_year, population_2020, +housing_2020, and land_area_sqmi depending on +the source.

+

Accessing Metadata

+

Each crosswalk tibble has a crosswalk_metadata attribute +that documents what the crosswalk represents and how it was created:

+
metadata <- attr(result$crosswalks$step_1, "crosswalk_metadata")
+names(metadata)
+#> [1] "call_parameters" "data_source" "data_source_full_name" "download_url" ...
+

Using +crosswalk_data() to Interpolate Data

+

crosswalk_data() applies crosswalk weights to transform +your data. It automatically handles multi-step crosswalks.

+

Column Naming Convention

+

The function auto-detects columns based on prefixes:

+ + + + + + + + + + + + + + + + + +
PrefixTreatment
count_Summed after weighting (for counts like population, housing +units)
mean_, median_, percent_, +ratio_Weighted mean (for rates, percentages, averages)
+

You can also specify columns explicitly via +count_columns and non_count_columns. All +non-count variables are interpolated using weighted means, weighting by +the allocation factor from the crosswalk.

+

Supported Geography +and Year Combinations

+

Inter-Geography Crosswalks +(Geocorr)

+

2022-vintage crosswalks between any of these geographies:

+ +

Inter-Temporal Crosswalks +(NHGIS)

+

NHGIS provides cross-decade crosswalks with the following +structure:

+

Source geographies: block, block_group, tract

+

Target geographies: - From blocks (decennial years +only): block, block_group, tract, county, place, zcta, puma, urban_area, +cbsa - From block_group or tract: block_group, tract, county

+ + + + + + + + + + + + + + + + + + + + + +
Source YearsTarget Years
1990, 20002010, 2014, 2015, 2020, 2022
2010, 2011, 2012, 2014, 20151990, 2000, 2020, 2022
2020, 20221990, 2000, 2010, 2014, 2015
+

Notes: - Within-decade crosswalks (e.g., 2010→2014) +are not available from NHGIS - Block→ZCTA, Block→PUMA, etc. are only +available for decennial years (1990, 2000, 2010, 2020) - The package +automatically uses direct NHGIS crosswalks when available (e.g., +get_crosswalk(source_geography = "block", target_geography = "zcta", source_year = 2010, target_year = 2020) +returns a single-step NHGIS crosswalk)

+

2020→2022 Crosswalks +(CTData)

+

For 2020 to 2022 transformations, the package uses CT Data +Collaborative crosswalks for Connecticut (where planning regions +replaced counties) and identity mappings for other states (where no +changes occurred).

+

API Keys

+

NHGIS crosswalks require an IPUMS API key. Get one at https://account.ipums.org/api_keys +and add to your .Renviron:

+
usethis::edit_r_environ()
+# Add: IPUMS_API_KEY=your_key_here
+

Caching

+

Use the cache parameter to save crosswalks locally for +ease:

+
result <- get_crosswalk(
+  source_geography = "tract",
+  target_geography = "zcta",
+  weight = "population",
+  cache = here::here("crosswalks-cache"))
+

Citations

+

The intellectual credit for the underlying crosswalks belongs to the +original developers.

+

For NHGIS, see citation requirements at: https://www.nhgis.org/citation-and-use-nhgis-data

+

For Geocorr, a suggested citation:

+
+

Missouri Census Data Center, University of Missouri. (2022). Geocorr +2022: Geographic Correspondence Engine. Retrieved from: https://mcdc.missouri.edu/applications/geocorr2022.html

+
+ + + diff --git a/README.md b/README.md index 91bb023..7d6b80c 100644 --- a/README.md +++ b/README.md @@ -1,99 +1,349 @@ -# crosswalk - -An R interface to inter-geography and inter-temporal crosswalks. - -## Overview - -This project provides a consistent API and standardized versions of -crosswalks to allow for programmatic interpolation over time and between -geographies. Say goodbye to manual crosswalk downloads and hello to -reproducible workflows. - -## Installation - -``` r -# Install dependencies -renv::install("UI-Research/crosswalks") -``` - -## Usage - -To get started with `library(crosswalk)`: - -``` r -# Load the package -library(crosswalk) - -## obtain a crosswalk to translate data in 2020-vintage place geographies -## to 2020-vintage county geographies, weighted by population -place_county_crosswalk = get_crosswalk( - source_geography = "place", - target_geography = "county", - weight = c("population"), - cache = here::here("data")) - -## obtain a crosswalk to translate data in 2000-vintage place geographies -## to 2010-vintage place geographies. all available weighting options are -## returned -get_crosswalk( - source_year = 2000, - target_year = 2010, - source_geography = "place", - target_geography = "place", - cache = here::here("data")) -``` - -## Why Use `library(crosswalk)`? - -Crosswalks are a critical component of conducting social sciences -research as they enable analysts to translate data from one geography -and/or temporal vintage to another. For example, if source data are only -available at the county level, a crosswalk can help to produce estimates -of those source data at the city level, enabling analysis at a geography -(the city) that may be either more relevant to target audience(s) and/or -may align with the geography of other data that form part of the -overarching analysis. - -There are excellent existing resources for crosswalks, including the -University of Missouri - Missouri Census Data Center's Geocorr 2022 -crosswalking application and the IPUMS National Historical Geographic -Information System (NHGIS). In fact, the crosswalks returned by using -`library(crosswalk)` are those from Geocorr and NHGIS. - -So why use this package at all? It provides: - -- A consistent, programmatic approach to acquire crosswalks, rather - than ad-hoc manual downloads; - -- Standardized and clear crosswalk variable names so that you can - easily work with multiple crosswalks using the same workflow; - -- Crosswalk metadata stored within the returned crosswalk–no more - commenting in your script with the 15 options you configured prior - to clicking "Download"; - -- The ability to easily "cache" crosswalks locally. - -In brief: this package facilitates a well documented and reproducible -analysis workflow, building on top of the robust underlying resources -already available for crosswalking. - -## Citations! - -The intellectual work and credit for the underlying crosswalks returned -by this package belongs to the original developers of those crosswalks. -You should (in the case of Geocorr crosswalks) and in some cases must -(in the case of NHGIS crosswalks) appropriately cite the developers when -you use these resources. - -**For NHGIS**, you should refer to the NHGIS website and terms of use, -including the recommended citations provided at: -. - -**For Geocorr**, the author of `library(crosswalk)` is unaware of a -required or suggested citation format. An example citation might look -like: - -> Missouri Census Data Center, University of Missouri. (2022). Geocorr -> 2022: Geographic Correspondence Engine. Retrieved [202X-XX-XX] from: -> . + +# crosswalk + +An R interface to inter-geography and inter-temporal crosswalks. + +## Overview + +This package provides a consistent API and standardized versions of +crosswalks to enable consistent approaches that work across different +geography and year combinations. The package also facilitates +interpolation–that is, adjusting source geography/year values by their +crosswalk weights and translating these values to the desired target +geography/year–including diagnostics of the joins between source data +and crosswalks. + +The package sources crosswalks from: + +- **Geocorr 2022** (Missouri Census Data Center) - for same-year + crosswalks between geographies +- **IPUMS NHGIS** - for inter-temporal crosswalks (across different + census years) +- **CT Data Collaborative** - for Connecticut 2020→2022 crosswalks + (planning region changes) + +## Why Use `crosswalk`? + +- **Programmatic access**: No more manual downloads from web interfaces +- **Standardized output**: Consistent column names across all crosswalk + sources +- **Metadata tracking**: Full provenance stored as attributes +- **Multi-step handling**: Automatic chaining when both geography and + year change +- **Local caching**: Reproducible workflows with cached crosswalks + +## Installation + +``` r +# Install from GitHub +renv::install("UI-Research/crosswalk") +``` + +## Overview + +First we obtain a crosswalk and apply it to our data: + +``` r +library(crosswalk) +library(dplyr) +library(stringr) +library(sf) + +source_data = tidycensus::get_acs( + year = 2023, + geography = "zcta", + output = "wide", + variables = c(below_poverty_level = "B17001_002")) %>% + dplyr::select( + source_geoid = GEOID, + count_below_poverty_level = below_poverty_levelE) + +get_crosswalk( + source_geography = "block", + target_geography = "puma", + source_year = 2010, + target_year = 2020, + weight = "population") + +# Get a crosswalk from ZCTAs to PUMAs (same year, uses Geocorr (2022)) +zcta_puma_crosswalk <- get_crosswalk( + source_geography = "zcta", + target_geography = "puma22", + weight = "population") + +# Apply the crosswalk to your data +crosswalked_data <- crosswalk_data( + geoid_column = "source_geoid", + data = source_data, + crosswalk = zcta_puma_crosswalk) +``` + +What does the crosswalk(s) reflect and how was it sourced? + +``` r +attr(crosswalked_data, "crosswalk_metadata") +``` + +How well did the crosswalk join to our source data? + +``` r +## look at all the characteristics of the join(s) between the source data +## and the crosswalks +join_quality = attr(crosswalked_data, "join_quality") + +## what share of records in the source data do not join to a crosswalk and +## thus are dropped during the crosswalking process? +join_quality$pct_data_unmatched + +## zctas aren't nested within states, otherwise join_quality$state_analysis_data +## would help us to ID whether non-joining source data were clustered within one +## or a few states. instead we can join to spatial data to diagnose further: +zctas_sf = tigris::zctas(year = 2023) +states_sf = tigris::states(year = 2023, cb = TRUE) + +## apart from DC, which has a disproportionate number of non-joining ZCTAs-- +## seemingly corresponding to federal areas and buildings--the distribution of +## non-joining ZCTAs appears proportionate to state-level populations and is +## distributed across many states: +zctas_sf %>% + dplyr::filter(GEOID20 %in% join_quality$data_geoids_unmatched) %>% + sf::st_intersection(states_sf %>% select(NAME)) %>% + sf::st_drop_geometry() %>% + dplyr::count(NAME, sort = TRUE) +``` + +And how accurate was the crosswalking process? + +``` r +comparison_data = tidycensus::get_acs( + year = 2023, + geography = "puma", + output = "wide", + variables = c( + below_poverty_level = "B17001_002")) %>% + dplyr::select( + source_geoid = GEOID, + count_below_poverty_level_acs = below_poverty_levelE) + +combined_data = dplyr::left_join( + comparison_data, + crosswalked_data, + by = c("source_geoid" = "geoid")) + +combined_data %>% + dplyr::select(source_geoid, dplyr::matches("count")) %>% + dplyr::mutate(difference_percent = (count_below_poverty_level_acs - count_below_poverty_level) / count_below_poverty_level_acs) %>% + ggplot2::ggplot() + + ggplot2::geom_histogram(ggplot2::aes(x = difference_percent)) + + ggplot2::theme_minimal() + + ggplot2::theme(panel.grid = ggplot2::element_blank()) + + ggplot2::scale_x_continuous(labels = scales::percent) + + ggplot2::labs( + title = "Crosswalked data approximates observed values", + subtitle = "Block group-level source data would produce more accurate crosswalked values", + y = "", + x = "Percent difference between observed and crosswalked values") +``` + +## Core Functions + +The package has two main functions: + +| Function | Purpose | +|--------------------|---------------------------------------------------------------------| +| `get_crosswalk()` | Fetch crosswalk(s) | +| `crosswalk_data()` | Apply crosswalk(s) to interpolate data to the target geography-year | + +## Understanding `get_crosswalk()` Output + +`get_crosswalk()` **always returns a list** structured as follows: + +``` r +result <- get_crosswalk( + source_geography = "tract", + target_geography = "zcta", + source_year = 2010, + target_year = 2020, + weight = "population" +) + +names(result) +#> [1] "crosswalks" "plan" "message" +``` + +The list contains three elements: + +| Element | Description | +|--------------|--------------------------------------------------------------------------------| +| `crosswalks` | A named list of crosswalks (`step_1`, `step_2`, etc.) of length one or greater | +| `plan` | Details about what crosswalks are being fetched | +| `message` | A human-readable description of the crosswalk chain | + +### Single-Step vs. Multi-Step Crosswalks + +**Single-step crosswalks** (same year, different geography OR same +geography, different year): + +``` r +# Same year, different geography (Geocorr) +result <- get_crosswalk( + source_geography = "tract", + target_geography = "zcta", + weight = "population" +) +# result$crosswalks$step_1 contains one crosswalk + +# Same geography, different year (NHGIS) +result <- get_crosswalk( + source_geography = "tract", + target_geography = "tract", + source_year = 2010, + target_year = 2020 +) +# result$crosswalks$step_1 contains one crosswalk +``` + +**Multi-step crosswalks** (different geography AND different year): + +When both geography and year change, no single crosswalk source provides +this directly. The package automatically plans and fetches a two-step +chain: + +1. **Step 1 (NHGIS)**: Change year, keep geography constant +2. **Step 2 (Geocorr)**: Change geography at target year + +``` r +result <- get_crosswalk( + source_geography = "tract", + target_geography = "zcta", + source_year = 2010, + target_year = 2020, + weight = "population" +) + +# Two crosswalks are returned +names(result$crosswalks) +#> [1] "step_1" "step_2" + +# Step 1: 2010 tracts -> 2020 tracts (NHGIS) +# Step 2: 2020 tracts -> 2020 ZCTAs (Geocorr) +``` + +### Crosswalk Structure + +Each crosswalk contains standardized columns: + +| Column | Description | +|--------------------------------------|-----------------------------------------------------| +| `source_geoid` | Identifier for source geography | +| `target_geoid` | Identifier for target geography | +| `allocation_factor_source_to_target` | Weight for interpolating values | +| `weighting_factor` | What attribute was used (population, housing, land) | + +Additional columns may include `source_year`, `target_year`, +`population_2020`, `housing_2020`, and `land_area_sqmi` depending on the +source. + +### Accessing Metadata + +Each crosswalk tibble has a `crosswalk_metadata` attribute that +documents what the crosswalk represents and how it was created: + +``` r +metadata <- attr(result$crosswalks$step_1, "crosswalk_metadata") +names(metadata) +#> [1] "call_parameters" "data_source" "data_source_full_name" "download_url" ... +``` + +## Using `crosswalk_data()` to Interpolate Data + +`crosswalk_data()` applies crosswalk weights to transform your data. It +automatically handles multi-step crosswalks. + +### Column Naming Convention + +The function auto-detects columns based on prefixes: + +| Prefix | Treatment | +|------------------------------------------|--------------------------------------------------------------------| +| `count_` | Summed after weighting (for counts like population, housing units) | +| `mean_`, `median_`, `percent_`, `ratio_` | Weighted mean (for rates, percentages, averages) | + +You can also specify columns explicitly via `count_columns` and +`non_count_columns`. All non-count variables are interpolated using +weighted means, weighting by the allocation factor from the crosswalk. + +## Supported Geography and Year Combinations + +### Inter-Geography Crosswalks (Geocorr) + +2022-vintage crosswalks between any of these geographies: + +- block, block group, tract, county +- place, zcta, puma22 +- cd118, cd119, urban_area, core_based_statistical_area + +### Inter-Temporal Crosswalks (NHGIS) + +NHGIS provides cross-decade crosswalks with the following structure: + +**Source geographies:** block, block_group, tract + +**Target geographies:** - From blocks (decennial years only): block, +block_group, tract, county, place, zcta, puma, urban_area, cbsa - From +block_group or tract: block_group, tract, county + +| Source Years | Target Years | +|------------------------------|------------------------------| +| 1990, 2000 | 2010, 2014, 2015, 2020, 2022 | +| 2010, 2011, 2012, 2014, 2015 | 1990, 2000, 2020, 2022 | +| 2020, 2022 | 1990, 2000, 2010, 2014, 2015 | + +**Notes:** - Within-decade crosswalks (e.g., 2010→2014) are not +available from NHGIS - Block→ZCTA, Block→PUMA, etc. are only available +for decennial years (1990, 2000, 2010, 2020) - The package automatically +uses direct NHGIS crosswalks when available (e.g., +`get_crosswalk(source_geography = "block", target_geography = "zcta", source_year = 2010, target_year = 2020)` +returns a single-step NHGIS crosswalk) + +### 2020→2022 Crosswalks (CTData) + +For 2020 to 2022 transformations, the package uses CT Data Collaborative +crosswalks for Connecticut (where planning regions replaced counties) +and identity mappings for other states (where no changes occurred). + +## API Keys + +NHGIS crosswalks require an IPUMS API key. Get one at + and add to your `.Renviron`: + +``` r +usethis::edit_r_environ() +# Add: IPUMS_API_KEY=your_key_here +``` + +## Caching + +Use the `cache` parameter to save crosswalks locally for ease: + +``` r +result <- get_crosswalk( + source_geography = "tract", + target_geography = "zcta", + weight = "population", + cache = here::here("crosswalks-cache")) +``` + +## Citations + +The intellectual credit for the underlying crosswalks belongs to the +original developers. + +**For NHGIS**, see citation requirements at: + + +**For Geocorr**, a suggested citation: + +> Missouri Census Data Center, University of Missouri. (2022). Geocorr +> 2022: Geographic Correspondence Engine. Retrieved from: +> diff --git a/man/crosswalk_data.Rd b/man/crosswalk_data.Rd index 1a7646b..d6d0e62 100644 --- a/man/crosswalk_data.Rd +++ b/man/crosswalk_data.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/crosswalk_data.R \name{crosswalk_data} \alias{crosswalk_data} -\title{Apply a Crosswalk to Transform Data} +\title{Interpolate data using a crosswalk(s)} \usage{ crosswalk_data( data, @@ -10,7 +10,8 @@ crosswalk_data( geoid_column = "geoid", count_columns = NULL, non_count_columns = NULL, - return_intermediate = FALSE + return_intermediate = FALSE, + show_join_quality = TRUE ) } \arguments{ @@ -40,6 +41,12 @@ detects columns with prefixes "mean_", "median_", "percent_", or "ratio_".} \item{return_intermediate}{Logical. If TRUE and crosswalk has multiple steps, returns a list containing both the final result and intermediate results from each step. Default is FALSE, which returns only the final result.} + +\item{show_join_quality}{Logical. If TRUE (default), prints diagnostic messages +about join quality, including the number of data rows not matching the crosswalk +and vice versa. For state-nested geographies (tract, county, block group, etc.), +also reports state-level concentration of unmatched rows. Set to FALSE to +suppress these messages.} } \value{ If \code{return_intermediate = FALSE} (default), a tibble with data summarized @@ -75,6 +82,11 @@ both NULL, the function will automatically detect columns based on naming prefix as non-count variables } +\strong{Other columns}: Columns that are not the geoid column, count columns, or +non-count columns (e.g., metadata like \code{data_year}) are preserved by taking +the first non-missing value within each target geography group. If all values +are missing, NA is returned. + \strong{Multi-step crosswalks}: When \code{get_crosswalk()} returns multiple crosswalks (for transformations that change both geography and year), this function automatically applies them in sequence. diff --git a/man/get_crosswalk.Rd b/man/get_crosswalk.Rd index da12545..b524d39 100644 --- a/man/get_crosswalk.Rd +++ b/man/get_crosswalk.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/get_crosswalk.R \name{get_crosswalk} \alias{get_crosswalk} -\title{Get a Geographic Crosswalk} +\title{Get a crosswalk(s) to translate data across time and geographies} \usage{ get_crosswalk( source_geography, diff --git a/man/get_crosswalk_chain.Rd b/man/get_crosswalk_chain.Rd deleted file mode 100644 index d314cc4..0000000 --- a/man/get_crosswalk_chain.Rd +++ /dev/null @@ -1,63 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/get_crosswalk_chain.R -\name{get_crosswalk_chain} -\alias{get_crosswalk_chain} -\title{Get a Chain of Crosswalks for Multi-Step Transformations} -\usage{ -get_crosswalk_chain( - source_geography, - target_geography, - source_year = NULL, - target_year = NULL, - weight = "population", - cache = NULL -) -} -\arguments{ -\item{source_geography}{Character. Source geography name.} - -\item{target_geography}{Character. Target geography name.} - -\item{source_year}{Numeric or NULL. Year of the source geography.} - -\item{target_year}{Numeric or NULL. Year of the target geography.} - -\item{weight}{Character or NULL. Weighting variable for Geocorr crosswalks.} - -\item{cache}{Directory path or NULL. Where to cache crosswalks.} -} -\value{ -A list with: -\describe{ -\item{crosswalks}{A named list of crosswalk tibbles (step_1, step_2, etc.)} -\item{plan}{The crosswalk plan from plan_crosswalk_chain()} -\item{message}{A formatted message describing the crosswalk chain} -} -} -\description{ -Retrieves a list of crosswalks needed to transform data from a source -geography/year to a target geography/year. For multi-step transformations, -users should apply each crosswalk sequentially using \code{crosswalk_data()}. -} -\examples{ -\dontrun{ -# Get crosswalks for 2010 tracts to 2020 ZCTAs (requires two steps) -chain <- get_crosswalk_chain( - source_geography = "tract", - target_geography = "zcta", - source_year = 2010, - target_year = 2020, - weight = "population") - -# Apply crosswalks sequentially -data_step1 <- crosswalk_data( - data = my_data, - crosswalk = chain$crosswalks$step_1, - count_columns = "count_population") - -data_final <- crosswalk_data( - data = data_step1, - crosswalk = chain$crosswalks$step_2, - count_columns = "count_population") -} -} diff --git a/man/list_nhgis_crosswalks.Rd b/man/list_nhgis_crosswalks.Rd index f5b1238..fd0d758 100644 --- a/man/list_nhgis_crosswalks.Rd +++ b/man/list_nhgis_crosswalks.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/get_nhgis_crosswalk.R \name{list_nhgis_crosswalks} \alias{list_nhgis_crosswalks} -\title{List Available NHGIS Crosswalks} +\title{List supported NHGIS crosswalks} \usage{ list_nhgis_crosswalks() } diff --git a/tests/testthat/test-crosswalk_data.R b/tests/testthat/test-crosswalk_data.R index b6ea1fd..b8e692f 100644 --- a/tests/testthat/test-crosswalk_data.R +++ b/tests/testthat/test-crosswalk_data.R @@ -612,3 +612,357 @@ test_that("crosswalk_data handles numeric GEOIDs", { expect_s3_class(result, "tbl_df") }) + +# ============================================================================== +# Join quality reporting tests +# ============================================================================== + +test_that("crosswalk_data attaches join_quality attribute", { + mock_data <- tibble::tibble( + geoid = c("A", "B"), + count_population = c(1000, 2000)) + + mock_crosswalk <- tibble::tibble( + source_geoid = c("A", "B"), + target_geoid = c("X", "X"), + target_geography_name = c("test", "test"), + allocation_factor_source_to_target = c(0.5, 0.5)) + + result <- crosswalk_data( + data = mock_data, + crosswalk = mock_crosswalk, + geoid_column = "geoid", + count_columns = c("count_population")) + + join_quality <- attr(result, "join_quality") + expect_type(join_quality, "list") + expect_true("n_data_total" %in% names(join_quality)) + expect_true("n_data_unmatched" %in% names(join_quality)) + expect_true("n_crosswalk_total" %in% names(join_quality)) + expect_true("n_crosswalk_unmatched" %in% names(join_quality)) +}) + +test_that("crosswalk_data reports correct join quality for perfect match", { + mock_data <- tibble::tibble( + geoid = c("A", "B"), + count_population = c(1000, 2000)) + + mock_crosswalk <- tibble::tibble( + source_geoid = c("A", "B"), + target_geoid = c("X", "X"), + target_geography_name = c("test", "test"), + allocation_factor_source_to_target = c(0.5, 0.5)) + + result <- crosswalk_data( + data = mock_data, + crosswalk = mock_crosswalk, + geoid_column = "geoid", + count_columns = c("count_population")) + + join_quality <- attr(result, "join_quality") + expect_equal(join_quality$n_data_unmatched, 0) + expect_equal(join_quality$n_crosswalk_unmatched, 0) + expect_equal(join_quality$pct_data_unmatched, 0) +}) + +test_that("crosswalk_data reports unmatched data rows", { + mock_data <- tibble::tibble( + geoid = c("A", "B", "C"), # C not in crosswalk + count_population = c(1000, 2000, 3000)) + + mock_crosswalk <- tibble::tibble( + source_geoid = c("A", "B"), + target_geoid = c("X", "X"), + target_geography_name = c("test", "test"), + allocation_factor_source_to_target = c(0.5, 0.5)) + + expect_message( + result <- crosswalk_data( + data = mock_data, + crosswalk = mock_crosswalk, + geoid_column = "geoid", + count_columns = c("count_population")), + regexp = "did not match the crosswalk") + + join_quality <- attr(result, "join_quality") + expect_equal(join_quality$n_data_unmatched, 1) + expect_equal(join_quality$n_data_total, 3) + expect_true("C" %in% join_quality$data_geoids_unmatched) +}) + +test_that("crosswalk_data reports unmatched crosswalk rows", { + mock_data <- tibble::tibble( + geoid = c("A"), + count_population = c(1000)) + + mock_crosswalk <- tibble::tibble( + source_geoid = c("A", "B", "C"), # B and C not in data + target_geoid = c("X", "Y", "Z"), + target_geography_name = c("test", "test", "test"), + allocation_factor_source_to_target = c(1.0, 1.0, 1.0)) + + expect_message( + result <- crosswalk_data( + data = mock_data, + crosswalk = mock_crosswalk, + geoid_column = "geoid", + count_columns = c("count_population")), + regexp = "were not in input data") + + join_quality <- attr(result, "join_quality") + expect_equal(join_quality$n_crosswalk_unmatched, 2) + expect_equal(join_quality$n_crosswalk_total, 3) +}) + +test_that("crosswalk_data reports state concentration for unmatched data rows", { + # Create data with state-based GEOIDs where unmatched rows are concentrated in NY (36) + mock_data <- tibble::tibble( + geoid = c("01001", "01002", "36001", "36002", "36003", "36004", "36005"), + count_population = c(1000, 2000, 3000, 4000, 5000, 6000, 7000)) + + # Crosswalk only has AL (01) counties - include metadata for state analysis + mock_crosswalk <- tibble::tibble( + source_geoid = c("01001", "01002"), + target_geoid = c("X", "X"), + target_geography_name = c("test", "test"), + allocation_factor_source_to_target = c(0.5, 0.5)) + + # Add metadata indicating this is a county crosswalk (state-nested geography) + attr(mock_crosswalk, "crosswalk_metadata") <- list(source_geography = "county") + + expect_message( + result <- crosswalk_data( + data = mock_data, + crosswalk = mock_crosswalk, + geoid_column = "geoid", + count_columns = c("count_population")), + regexp = "Top states with unmatched data rows") + + join_quality <- attr(result, "join_quality") + expect_equal(join_quality$n_data_unmatched, 5) + + # Check state analysis + expect_false(is.null(join_quality$state_analysis_data)) + expect_true(join_quality$state_analysis_data$is_concentrated) + + # NY should be top state + top_state <- join_quality$state_analysis_data$top_states$state_fips[1] + expect_equal(top_state, "36") +}) + +test_that("crosswalk_data does NOT report state concentration for ZCTAs (non-state-nested)", { + # ZCTAs cross state boundaries, so state analysis should not be performed + mock_data <- tibble::tibble( + geoid = c("01001", "01002", "36001", "36002", "36003"), + count_population = c(1000, 2000, 3000, 4000, 5000)) + + mock_crosswalk <- tibble::tibble( + source_geoid = c("01001", "01002"), + target_geoid = c("X", "X"), + allocation_factor_source_to_target = c(0.5, 0.5)) + + # Add metadata indicating this is a ZCTA crosswalk (NOT state-nested) + attr(mock_crosswalk, "crosswalk_metadata") <- list(source_geography = "zcta") + + # Should report unmatched rows but NOT state concentration + expect_message( + result <- crosswalk_data( + data = mock_data, + crosswalk = mock_crosswalk, + geoid_column = "geoid", + count_columns = c("count_population")), + regexp = "did not match the crosswalk") + + join_quality <- attr(result, "join_quality") + expect_equal(join_quality$n_data_unmatched, 3) + + # State analysis should NOT be performed for ZCTAs + expect_true(is.null(join_quality$state_analysis_data)) + expect_false(join_quality$state_analysis_applicable) +}) + +test_that("crosswalk_data show_join_quality=FALSE suppresses messages", { + mock_data <- tibble::tibble( + geoid = c("A", "B", "C"), # C not in crosswalk + count_population = c(1000, 2000, 3000)) + + mock_crosswalk <- tibble::tibble( + source_geoid = c("A", "B"), + target_geoid = c("X", "X"), + allocation_factor_source_to_target = c(0.5, 0.5)) + + # With show_join_quality = FALSE, should NOT produce join quality messages + # (but will still produce "Applying crosswalk step..." message) + expect_no_message( + result <- crosswalk_data( + data = mock_data, + crosswalk = mock_crosswalk, + geoid_column = "geoid", + count_columns = c("count_population"), + show_join_quality = FALSE), + message = "did not match") + + # join_quality attribute should be NULL when disabled + join_quality <- attr(result, "join_quality") + expect_null(join_quality) +}) + +test_that("crosswalk_data silent when join is perfect", { + mock_data <- tibble::tibble( + geoid = c("A", "B"), + count_population = c(1000, 2000)) + + mock_crosswalk <- tibble::tibble( + source_geoid = c("A", "B"), + target_geoid = c("X", "X"), + target_geography_name = c("test", "test"), + allocation_factor_source_to_target = c(0.5, 0.5)) + + # Should NOT produce any join quality messages (only "Applying crosswalk step..." message) + expect_message( + result <- crosswalk_data( + data = mock_data, + crosswalk = mock_crosswalk, + geoid_column = "geoid", + count_columns = c("count_population")), + regexp = "Applying crosswalk step") + + # And should NOT have "did not match" message + expect_no_message( + result2 <- crosswalk_data( + data = mock_data, + crosswalk = mock_crosswalk, + geoid_column = "geoid", + count_columns = c("count_population")), + message = "did not match") +}) + +# ============================================================================== +# Other column handling tests +# ============================================================================== + +test_that("crosswalk_data preserves other columns using first non-missing value", { + mock_data <- tibble::tibble( + geoid = c("A", "B"), + count_population = c(1000, 2000), + data_year = c(2018, 2018), + vintage = c(2010, 2010)) + + mock_crosswalk <- tibble::tibble( + source_geoid = c("A", "B"), + target_geoid = c("X", "X"), + target_geography_name = c("test", "test"), + allocation_factor_source_to_target = c(0.5, 0.5)) + + result <- crosswalk_data( + data = mock_data, + crosswalk = mock_crosswalk, + geoid_column = "geoid", + count_columns = c("count_population")) + + # Other columns should be preserved + expect_true("data_year" %in% colnames(result)) + expect_true("vintage" %in% colnames(result)) + + # Should take first value (both are same in this case) + expect_equal(result$data_year[1], 2018) + expect_equal(result$vintage[1], 2010) +}) + +test_that("crosswalk_data takes first non-missing value for other columns", { + mock_data <- tibble::tibble( + geoid = c("A", "B", "C"), + count_population = c(1000, 2000, 3000), + data_year = c(NA, 2019, 2019)) + + mock_crosswalk <- tibble::tibble( + source_geoid = c("A", "B", "C"), + target_geoid = c("X", "X", "X"), + target_geography_name = c("test", "test", "test"), + allocation_factor_source_to_target = c(0.3, 0.3, 0.4)) + + result <- crosswalk_data( + data = mock_data, + crosswalk = mock_crosswalk, + geoid_column = "geoid", + count_columns = c("count_population")) + + # Should take first non-missing value (2019 from B) + expect_equal(result$data_year[1], 2019) +}) + +test_that("crosswalk_data returns NA for other columns when all values missing", { + mock_data <- tibble::tibble( + geoid = c("A", "B"), + count_population = c(1000, 2000), + data_year = c(NA_integer_, NA_integer_)) + + mock_crosswalk <- tibble::tibble( + source_geoid = c("A", "B"), + target_geoid = c("X", "X"), + target_geography_name = c("test", "test"), + allocation_factor_source_to_target = c(0.5, 0.5)) + + result <- crosswalk_data( + data = mock_data, + crosswalk = mock_crosswalk, + geoid_column = "geoid", + count_columns = c("count_population")) + + # Should return NA when all values are missing + expect_true(is.na(result$data_year[1])) +}) + +test_that("crosswalk_data handles multiple other columns with different types", { + mock_data <- tibble::tibble( + geoid = c("A", "B"), + count_population = c(1000, 2000), + data_year = c(2018L, 2018L), + source_name = c("Urban", "Urban"), + weight_factor = c(1.5, 1.5)) + + mock_crosswalk <- tibble::tibble( + source_geoid = c("A", "B"), + target_geoid = c("X", "X"), + target_geography_name = c("test", "test"), + allocation_factor_source_to_target = c(0.5, 0.5)) + + result <- crosswalk_data( + data = mock_data, + crosswalk = mock_crosswalk, + geoid_column = "geoid", + count_columns = c("count_population")) + + # All other columns should be preserved with correct types + + expect_true("data_year" %in% colnames(result)) + expect_true("source_name" %in% colnames(result)) + expect_true("weight_factor" %in% colnames(result)) + + expect_equal(result$data_year[1], 2018L) + expect_equal(result$source_name[1], "Urban") + expect_equal(result$weight_factor[1], 1.5) +}) + +test_that("crosswalk_data other columns work correctly with one-to-many mapping", { + # Source A maps to two targets (X and Y), B maps to X only + mock_data <- tibble::tibble( + geoid = c("A", "B"), + count_population = c(1000, 2000), + data_year = c(2020, 2020)) + + mock_crosswalk <- tibble::tibble( + source_geoid = c("A", "A", "B"), + target_geoid = c("X", "Y", "X"), + target_geography_name = c("test", "test", "test"), + allocation_factor_source_to_target = c(0.6, 0.4, 1.0)) + + result <- crosswalk_data( + data = mock_data, + crosswalk = mock_crosswalk, + geoid_column = "geoid", + count_columns = c("count_population")) + + # Both target geographies should have data_year preserved + expect_true(all(result$data_year == 2020)) +}) diff --git a/tests/testthat/test-plan_crosswalk_chain.R b/tests/testthat/test-plan_crosswalk_chain.R index bd37ae8..1f0bc8d 100644 --- a/tests/testthat/test-plan_crosswalk_chain.R +++ b/tests/testthat/test-plan_crosswalk_chain.R @@ -115,6 +115,98 @@ test_that("plan_crosswalk_chain multi-step works for block_group", { expect_equal(plan$intermediate_geography, "block_group") }) +# ============================================================================== +# Direct NHGIS crosswalk tests (geography + year change with direct NHGIS crosswalk) +# ============================================================================== + +test_that("plan_crosswalk_chain detects direct NHGIS crosswalk for block to zcta", { + # block 2010 -> zcta 2020 is available directly from NHGIS + plan <- plan_crosswalk_chain( + source_geography = "block", + target_geography = "zcta", + source_year = 2010, + target_year = 2020) + + expect_false(plan$is_multi_step) + expect_equal(nrow(plan$steps), 1) + expect_equal(plan$steps$crosswalk_source[1], "nhgis") + expect_true(stringr::str_detect(plan$steps$description[1], "direct NHGIS")) +}) + +test_that("plan_crosswalk_chain detects direct NHGIS crosswalk for block to puma", { + # block 2010 -> puma 2020 is available directly from NHGIS + plan <- plan_crosswalk_chain( + source_geography = "block", + target_geography = "puma", + source_year = 2010, + target_year = 2020) + + expect_false(plan$is_multi_step) + expect_equal(nrow(plan$steps), 1) + expect_equal(plan$steps$crosswalk_source[1], "nhgis") +}) + +test_that("plan_crosswalk_chain detects direct NHGIS crosswalk for block to place", { + # block 2010 -> place 2020 is available directly from NHGIS + plan <- plan_crosswalk_chain( + source_geography = "block", + target_geography = "place", + source_year = 2010, + target_year = 2020) + + expect_false(plan$is_multi_step) + expect_equal(plan$steps$crosswalk_source[1], "nhgis") +}) + +test_that("plan_crosswalk_chain detects direct NHGIS crosswalk for block to urban_area", { + # block 2010 -> urban_area 2020 is available directly from NHGIS + plan <- plan_crosswalk_chain( + source_geography = "block", + target_geography = "urban_area", + source_year = 2010, + target_year = 2020) + + expect_false(plan$is_multi_step) + expect_equal(plan$steps$crosswalk_source[1], "nhgis") +}) + +test_that("plan_crosswalk_chain detects direct NHGIS crosswalk for block to cbsa", { + # block 2010 -> cbsa 2020 is available directly from NHGIS + plan <- plan_crosswalk_chain( + source_geography = "block", + target_geography = "cbsa", + source_year = 2010, + target_year = 2020) + + expect_false(plan$is_multi_step) + expect_equal(plan$steps$crosswalk_source[1], "nhgis") +}) + +test_that("plan_crosswalk_chain still uses multi-step for tract to zcta (no direct NHGIS)", { + # tract 2010 -> zcta 2020 is NOT available directly from NHGIS + # So it should still require multi-step + plan <- plan_crosswalk_chain( + source_geography = "tract", + target_geography = "zcta", + source_year = 2010, + target_year = 2020) + + expect_true(plan$is_multi_step) + expect_equal(nrow(plan$steps), 2) +}) + +test_that("plan_crosswalk_chain detects direct NHGIS crosswalk for reverse direction", { + # block 2020 -> zcta 2010 is available directly from NHGIS + plan <- plan_crosswalk_chain( + source_geography = "block", + target_geography = "zcta", + source_year = 2020, + target_year = 2010) + + expect_false(plan$is_multi_step) + expect_equal(plan$steps$crosswalk_source[1], "nhgis") +}) + # ============================================================================== # Error handling tests # ============================================================================== diff --git a/vignettes/standardizing-longitudinal-data.Rmd b/vignettes/standardizing-longitudinal-data.Rmd new file mode 100644 index 0000000..af715eb --- /dev/null +++ b/vignettes/standardizing-longitudinal-data.Rmd @@ -0,0 +1,176 @@ +--- +title: "Standardizing Longitudinal Data" +output: rmarkdown::html_vignette +vignette: > + %\VignetteIndexEntry{Standardizing Longitudinal Data} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} +--- + +```{r, include = FALSE} +knitr::opts_chunk$set( + collapse = TRUE, + comment = "#>", + eval = FALSE +) +``` + +## Overview + +A common challenge with longitudinal tract-level data is that census tract boundaries +change between decennial censuses. Data from before 2020 typically uses 2010 tract +definitions, while more recent data uses 2020 tract definitions. To analyze trends +over time, you need to standardize all years to a consistent tract vintage. + +This vignette demonstrates how the `crosswalk` package efficiently handles this task +using the Urban Institute's [HMDA Neighborhood Summary Files](https://datacatalog.urban.org/dataset/home-mortgage-disclosure-act-neighborhood-summary-files-census-tract-level), +which provide tract-level mortgage lending data from 2018-2023. The 2018-2021 files +use 2010 tract definitions, while 2022-2023 use 2020 tract definitions. + +## Setup + +```{r setup, message=FALSE} +library(crosswalk) +library(dplyr) +library(purrr) +library(readr) +library(tibble) +library(stringr) +``` + +## Step 1: Download the Data + +The Urban Institute publishes annual HMDA tract-level summary files. Let's download +all six years (2018-2023): + +```{r download-data} +## metadata object describing data year/vintage/url +metadata = tibble::tribble( + ~ year, ~ vintage, ~ url, + 2018, 2010, "https://urban-data-catalog.s3.amazonaws.com/drupal-root-live/2023/12/20/hmda_tract_2018.csv", + 2019, 2010, "https://urban-data-catalog.s3.amazonaws.com/drupal-root-live/2023/12/20/hmda_tract_2019.csv", + 2020, 2010, "https://urban-data-catalog.s3.amazonaws.com/drupal-root-live/2023/12/20/hmda_tract_2020.csv", + 2021, 2010, "https://urban-data-catalog.s3.amazonaws.com/drupal-root-live/2023/12/20/hmda_tract_2021.csv", + 2022, 2020, "https://urban-data-catalog.s3.amazonaws.com/drupal-root-live/2023/12/20/hmda_tract_2022.csv", + 2023, 2020, "https://urban-data-catalog.s3.amazonaws.com/drupal-root-live/2024/12/17/hmda_tract_2023.csv") + +## iterate of the metadata object and read in data for each year +hmda_data <- pmap(metadata, function(url, year, vintage) { + read_csv(url, show_col_types = FALSE) |> + mutate( + vintage = vintage, + data_year = as.integer(year)) }) + +names(hmda_data) = metadata$year %>% as.character() +``` + +Let's inspect the structure of the data: + +```{r inspect-data} +# Check dimensions of each year +map(hmda_data, dim) + +# View the 2018 data (2010 tract vintage) +glimpse(hmda_data[["2018"]]) +``` + +## Step 2: Prepare Data for Crosswalking + +The HMDA data includes a `tractid` column that contains the 11-digit tract GEOID. +Let's prepare a subset of variables for crosswalking. We'll focus on count variables +(total applications by race/ethnicity) which require the `count_` prefix for automatic +detection by `crosswalk_data()`: + +```{r prepare-data} +prepare_hmda <- function(data) { + data |> + rename_with(.cols = matches("^geo20"), .fn = ~ "source_geoid") |> + select( + source_geoid, + vintage, + data_year, + # Count variables: rename with count_ prefix for automatic detection + count_race_white_purchase = race_white_purchase, + count_owner_purchase_originations = owner_purchase_originations, + median_owner_loan_amount = owner_loan_amount_median + ) |> + mutate(source_geoid = as.character(source_geoid)) +} + +hmda_prepared <- map(hmda_data, prepare_hmda) +``` + +## Step 3: Obtain the 2010→2020 Tract Crosswalk + +Here's where `crosswalk` shines. We need a single crosswalk to convert all 2010-vintage +tracts to 2020-vintage tracts. The package fetches this from IPUMS NHGIS: + +```{r get-crosswalk} +tract_crosswalk <- get_crosswalk( + source_geography = "tract", + target_geography = "tract", + source_year = 2010, + target_year = 2020, + weight = "population") + +# View the crosswalk plan +tract_crosswalk$message +``` + +The crosswalk contains allocation factors that specify how to distribute values from +2010 tracts to 2020 tracts. When tract boundaries changed, a single 2010 tract may +map to multiple 2020 tracts (or vice versa), with allocation factors summing to 1. + +## Step 4: Apply the Crosswalk to 2018-2021 Data + +Now we apply the crosswalk to the four years of data that use 2010 tract definitions: + +```{r apply-crosswalk} +# Years that need crosswalking (2010 vintage) +years_to_crosswalk <- c("2018", "2019", "2020", "2021") + +# Apply crosswalk to each year +hmda_crosswalked <- map_if( + .x = hmda_prepared, + .p = names(hmda_prepared) %in% years_to_crosswalk, + .f = ~ crosswalk_data( + data = .x, + crosswalk = tract_crosswalk, + geoid_column = "source_geoid", + show_join_quality = TRUE)) + +## how many source records are we unable to crosswalk each year? +## excluding those with an "X"--which is used to retain valid observations +## that, in the source data, do not have a valid tract identifier--very few: +hmda_crosswalked |> + map(~ + .x |> + attr("join_quality") |> + pluck("data_geoids_unmatched") %>% + .[!str_detect(., "X")] |> + length()) + +hmda_combined <- bind_rows(hmda_crosswalked) |> + ## data for years that are crosswalked have slightly different/additional columsn + mutate( + geoid = if_else(is.na(geoid), source_geoid, geoid)) |> + select(-c(geography_name, source_geoid, vintage)) |> + arrange(geoid, data_year) + +# View the result +glimpse(hmda_combined) +``` + + +## Result: A Panel Dataset in 2020 Tract Definitions +We now have a single dataframe with all six years of HMDA data standardized to 2020 +tract definitions: + +```{r final-summary} +# Unique tracts per year +hmda_combined |> + group_by(data_year) |> + summarize(n_tracts = n_distinct(geoid)) +``` + +