diff --git a/DESCRIPTION b/DESCRIPTION
index 51c56b5..4d40343 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -19,6 +19,7 @@ Imports:
     janitor,
     purrr,
     readr,
+    rlang,
     rvest,
     stringr,
     tibble,
diff --git a/NAMESPACE b/NAMESPACE
index 5172ca0..e964796 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -2,5 +2,4 @@
 
 export(crosswalk_data)
 export(get_crosswalk)
-export(get_crosswalk_chain)
 export(list_nhgis_crosswalks)
diff --git a/R/crosswalk_data.R b/R/crosswalk_data.R
index 917e5f5..f909391 100644
--- a/R/crosswalk_data.R
+++ b/R/crosswalk_data.R
@@ -1,7 +1,7 @@
 ## explicitly enable/acknowledge data.table (used by tidytable)
 .datatable.aware = TRUE
 
-#' Apply a Crosswalk to Transform Data
+#' Interpolate data using a crosswalk(s)
 #'
 #' Applies geographic crosswalk weights to transform data from a source geography
 #' to a target geography. Accepts the output from `get_crosswalk()` and automatically
@@ -28,6 +28,11 @@
 #' @param return_intermediate Logical. If TRUE and crosswalk has multiple steps,
 #'    returns a list containing both the final result and intermediate results
 #'    from each step. Default is FALSE, which returns only the final result.
+#' @param show_join_quality Logical. If TRUE (default), prints diagnostic messages
+#'    about join quality, including the number of data rows not matching the crosswalk
+#'    and vice versa. For state-nested geographies (tract, county, block group, etc.),
+#'    also reports state-level concentration of unmatched rows. Set to FALSE to
+#'    suppress these messages.
 #'
 #' @return If `return_intermediate = FALSE` (default), a tibble with data summarized
 #'    to the final target geography.
@@ -55,6 +60,11 @@
 #' - Columns starting with "mean_", "median_", "percent_", or "ratio_" are treated
 #'   as non-count variables
 #'
+#' **Other columns**: Columns that are not the geoid column, count columns, or
+#' non-count columns (e.g., metadata like `data_year`) are preserved by taking
+#' the first non-missing value within each target geography group. If all values
+#' are missing, NA is returned.
+#'
 #' **Multi-step crosswalks**: When `get_crosswalk()` returns multiple crosswalks
 #' (for transformations that change both geography and year), this function
 #' automatically applies them in sequence.
@@ -108,7 +118,8 @@ crosswalk_data <- function(
     geoid_column = "geoid",
     count_columns = NULL,
     non_count_columns = NULL,
-    return_intermediate = FALSE) {
+    return_intermediate = FALSE,
+    show_join_quality = TRUE) {
 
   # Determine if crosswalk is a list (from get_crosswalk) or a single tibble
   crosswalk_list <- extract_crosswalk_list(crosswalk)
@@ -166,7 +177,10 @@ crosswalk_data <- function(
       crosswalk = step_crosswalk,
       geoid_column = current_geoid_column,
       count_columns = count_columns,
-      non_count_columns = non_count_columns)
+      non_count_columns = non_count_columns,
+      step_number = i,
+      total_steps = n_steps,
+      show_join_quality = show_join_quality)
 
     # Store intermediate result if requested
     if (return_intermediate) {
@@ -246,6 +260,295 @@ validate_crosswalk_tibble <- function(crosswalk, name) {
 }
 
 
+#' Check if Geography is Nested Within States
+#'
+#' Internal function that determines whether a geography type has GEOIDs
+#' where the first two characters represent the state FIPS code.
+#'
+#' Geographies nested within states (state FIPS derivable from GEOID):
+#' - block, block_group, tract, county, place, puma, congressional districts
+#'
+#' Geographies NOT nested within states (cross state boundaries):
+#' - zcta, cbsa/core_based_statistical_area, urban_area
+#'
+#' @param geography Character. The geography type to check.
+#' @return Logical. TRUE if geography is nested within states.
+#' @keywords internal
+#' @noRd
+is_state_nested_geography <- function(geography) {
+  if (is.null(geography) || is.na(geography)) {
+    return(FALSE)
+  }
+
+  geography_lower <- geography |>
+    stringr::str_to_lower() |>
+    stringr::str_squish() |>
+    stringr::str_replace_all("_", " ")
+
+  # Geographies where GEOID starts with state FIPS
+
+  state_nested <- c(
+    "block", "blocks", "blk",
+    "block group", "blockgroup", "bg",
+    "tract", "tracts", "tr",
+    "county", "counties", "co",
+    "place", "places", "pl",
+    "puma", "pumas", "puma22",
+    "cd118", "cd119", "congressional district"
+  )
+
+  geography_lower %in% state_nested
+}
+
+
+#' State FIPS to Abbreviation Lookup
+#'
+#' Internal lookup table for converting state FIPS codes to abbreviations.
+#' @keywords internal
+#' @noRd
+state_fips_to_abbr <- tibble::tibble(
+  state_fips = c(
+    "01", "02", "04", "05", "06", "08", "09", "10", "11", "12",
+    "13", "15", "16", "17", "18", "19", "20", "21", "22", "23",
+    "24", "25", "26", "27", "28", "29", "30", "31", "32", "33",
+    "34", "35", "36", "37", "38", "39", "40", "41", "42", "44",
+    "45", "46", "47", "48", "49", "50", "51", "53", "54", "55",
+    "56", "72", "78", "66", "60", "69"
+  ),
+  state_abbr = c(
+    "AL", "AK", "AZ", "AR", "CA", "CO", "CT", "DE", "DC", "FL",
+    "GA", "HI", "ID", "IL", "IN", "IA", "KS", "KY", "LA", "ME",
+    "MD", "MA", "MI", "MN", "MS", "MO", "MT", "NE", "NV", "NH",
+    "NJ", "NM", "NY", "NC", "ND", "OH", "OK", "OR", "PA", "RI",
+    "SC", "SD", "TN", "TX", "UT", "VT", "VA", "WA", "WV", "WI",
+    "WY", "PR", "VI", "GU", "AS", "MP"
+  )
+)
+
+
+#' Analyze State Concentration of Unmatched GEOIDs
+#'
+#' Internal function that analyzes the state-level distribution of unmatched GEOIDs.
+#'
+#' @param unmatched_geoids Character vector of GEOIDs that did not match.
+#' @return A list with state_counts tibble, top_states tibble, and is_concentrated flag.
+#' @keywords internal
+#' @noRd
+analyze_state_concentration <- function(unmatched_geoids) {
+  if (length(unmatched_geoids) == 0) {
+    return(NULL)
+  }
+
+  state_counts <- tibble::tibble(
+    geoid = unmatched_geoids,
+    state_fips = stringr::str_sub(geoid, 1, 2)
+  ) |>
+    dplyr::count(state_fips, name = "n_unmatched") |>
+    dplyr::mutate(
+      pct_of_unmatched = n_unmatched / sum(n_unmatched) * 100
+    ) |>
+    dplyr::arrange(dplyr::desc(n_unmatched))
+
+  top_states <- state_counts |>
+    dplyr::slice_head(n = 3)
+
+  is_concentrated <- any(state_counts$pct_of_unmatched > 15)
+
+  list(
+    state_counts = state_counts,
+    top_states = top_states,
+    is_concentrated = is_concentrated
+  )
+}
+
+
+#' Format Join Quality Message
+#'
+#' Internal function that formats join quality diagnostics as messages.
+#'
+#' @param join_quality List of join quality statistics.
+#' @param step_number Integer. Current step number.
+#' @param total_steps Integer. Total number of steps.
+#' @return Character vector of messages to print.
+#' @keywords internal
+#' @noRd
+format_join_quality_message <- function(join_quality, step_number, total_steps) {
+  messages <- character(0)
+
+  step_prefix <- if (total_steps > 1) {
+    stringr::str_c("Step ", step_number, " join quality: ")
+  } else {
+    "Join quality: "
+  }
+
+  # Report unmatched data rows
+  if (join_quality$n_data_unmatched > 0) {
+    msg1 <- stringr::str_c(
+      step_prefix,
+      format(join_quality$n_data_unmatched, big.mark = ","),
+      " of ",
+      format(join_quality$n_data_total, big.mark = ","),
+      " data rows (",
+      sprintf("%.1f%%", join_quality$pct_data_unmatched),
+      ") did not match the crosswalk."
+    )
+    messages <- c(messages, msg1)
+
+    # Add state concentration info if available
+    if (!is.null(join_quality$state_analysis_data) &&
+        nrow(join_quality$state_analysis_data$top_states) > 0) {
+
+      top_states_str <- join_quality$state_analysis_data$top_states |>
+        dplyr::left_join(state_fips_to_abbr, by = "state_fips") |>
+        dplyr::mutate(
+          state_abbr = dplyr::if_else(is.na(state_abbr), state_fips, state_abbr),
+          label = stringr::str_c(
+            state_abbr, " (",
+            sprintf("%.0f%%", pct_of_unmatched), ", ",
+            format(n_unmatched, big.mark = ","), " rows)"
+          )
+        ) |>
+        dplyr::pull(label) |>
+        stringr::str_c(collapse = ", ")
+
+      msg2 <- stringr::str_c("  Top states with unmatched data rows: ", top_states_str)
+      messages <- c(messages, msg2)
+    }
+  }
+
+  # Report crosswalk source GEOIDs not in data
+  if (join_quality$n_crosswalk_unmatched > 0) {
+    msg3 <- stringr::str_c(
+      step_prefix,
+      format(join_quality$n_crosswalk_unmatched, big.mark = ","),
+      " of ",
+      format(join_quality$n_crosswalk_total, big.mark = ","),
+      " crosswalk source GEOIDs (",
+      sprintf("%.1f%%", join_quality$pct_crosswalk_unmatched),
+      ") were not in input data."
+    )
+    messages <- c(messages, msg3)
+
+    # Add state concentration info if available
+    if (!is.null(join_quality$state_analysis_crosswalk) &&
+        nrow(join_quality$state_analysis_crosswalk$top_states) > 0) {
+
+      top_states_str <- join_quality$state_analysis_crosswalk$top_states |>
+        dplyr::left_join(state_fips_to_abbr, by = "state_fips") |>
+        dplyr::mutate(
+          state_abbr = dplyr::if_else(is.na(state_abbr), state_fips, state_abbr),
+          label = stringr::str_c(
+            state_abbr, " (",
+            sprintf("%.0f%%", pct_of_unmatched), ", ",
+            format(n_unmatched, big.mark = ","), " rows)"
+          )
+        ) |>
+        dplyr::pull(label) |>
+        stringr::str_c(collapse = ", ")
+
+      msg4 <- stringr::str_c("  Top states not in data: ", top_states_str)
+      messages <- c(messages, msg4)
+    }
+
+    msg5 <- "  (This is expected if your data covers a geographic subset.)"
+    messages <- c(messages, msg5)
+  }
+
+  return(messages)
+}
+
+
+#' Report Join Quality Between Data and Crosswalk
+#'
+#' Internal function that computes and reports join quality diagnostics.
+#'
+#' @param data The input data tibble.
+#' @param crosswalk The crosswalk tibble.
+#' @param geoid_column Column name for source geoid in data.
+#' @param step_number Integer. Which step this is (for multi-step reporting).
+#' @param total_steps Integer. Total number of steps.
+#' @param source_geography Character or NULL. The source geography type, used to
+#'    determine if state-level analysis is applicable.
+#' @return A list with join quality statistics (also prints messages if issues found).
+#' @keywords internal
+#' @noRd
+report_join_quality <- function(data, crosswalk, geoid_column, step_number = 1,
+                                total_steps = 1, source_geography = NULL) {
+
+  # Ensure geoid columns are character for consistent comparison
+  data_geoids <- data |>
+    dplyr::mutate(dplyr::across(dplyr::all_of(geoid_column), as.character)) |>
+    dplyr::pull(!!rlang::sym(geoid_column)) |>
+    unique()
+
+  crosswalk_source_geoids <- crosswalk |>
+    dplyr::pull(source_geoid) |>
+    unique()
+
+  # Data rows not in crosswalk
+  data_not_in_crosswalk <- setdiff(data_geoids, crosswalk_source_geoids)
+  n_data_unmatched <- length(data_not_in_crosswalk)
+  n_data_total <- length(data_geoids)
+  pct_data_unmatched <- if (n_data_total > 0) {
+    n_data_unmatched / n_data_total * 100
+  } else {
+    0
+  }
+
+  # Crosswalk source GEOIDs not in data
+  crosswalk_not_in_data <- setdiff(crosswalk_source_geoids, data_geoids)
+  n_crosswalk_unmatched <- length(crosswalk_not_in_data)
+  n_crosswalk_total <- length(crosswalk_source_geoids)
+  pct_crosswalk_unmatched <- if (n_crosswalk_total > 0) {
+    n_crosswalk_unmatched / n_crosswalk_total * 100
+  } else {
+    0
+  }
+
+  # State concentration analysis only for state-nested geographies
+  # (ZCTAs, CBSAs, urban areas cross state boundaries so state analysis not meaningful)
+  do_state_analysis <- is_state_nested_geography(source_geography)
+
+  # State concentration analysis for unmatched data rows
+  state_analysis_data <- if (n_data_unmatched > 0 && do_state_analysis) {
+    analyze_state_concentration(data_not_in_crosswalk)
+  } else {
+    NULL
+  }
+
+  # State concentration analysis for crosswalk rows not in data
+  state_analysis_crosswalk <- if (n_crosswalk_unmatched > 0 && do_state_analysis) {
+    analyze_state_concentration(crosswalk_not_in_data)
+  } else {
+    NULL
+  }
+
+  # Build quality statistics
+  join_quality <- list(
+    n_data_total = n_data_total,
+    n_data_unmatched = n_data_unmatched,
+    pct_data_unmatched = pct_data_unmatched,
+    data_geoids_unmatched = data_not_in_crosswalk,
+    state_analysis_data = state_analysis_data,
+    n_crosswalk_total = n_crosswalk_total,
+    n_crosswalk_unmatched = n_crosswalk_unmatched,
+    pct_crosswalk_unmatched = pct_crosswalk_unmatched,
+    crosswalk_geoids_unmatched = crosswalk_not_in_data,
+    state_analysis_crosswalk = state_analysis_crosswalk,
+    source_geography = source_geography,
+    state_analysis_applicable = do_state_analysis
+  )
+
+  # Print messages if there are issues
+  messages <- format_join_quality_message(join_quality, step_number, total_steps)
+  if (length(messages) > 0) {
+    purrr::walk(messages, message)
+  }
+
+  return(join_quality)
+}
+
+
 #' Apply a Single Crosswalk Step
 #'
 #' Internal function that applies one crosswalk tibble to data.
@@ -255,6 +558,9 @@ validate_crosswalk_tibble <- function(crosswalk, name) {
 #' @param geoid_column Column name for source geoid
 #' @param count_columns Count variable columns
 #' @param non_count_columns Non-count variable columns
+#' @param step_number Integer. Current step number for multi-step reporting.
+#' @param total_steps Integer. Total number of steps for multi-step reporting.
+#' @param show_join_quality Logical. Whether to report join quality diagnostics.
 #' @return Crosswalked data
 #' @keywords internal
 #' @noRd
@@ -263,7 +569,10 @@ apply_single_crosswalk <- function(
     crosswalk,
     geoid_column,
     count_columns,
-    non_count_columns) {
+    non_count_columns,
+    step_number = 1,
+    total_steps = 1,
+    show_join_quality = TRUE) {
 
   # Check if crosswalk is empty
   if (nrow(crosswalk) == 0) {
@@ -276,6 +585,26 @@ apply_single_crosswalk <- function(
   # Store metadata for later attachment
   crosswalk_metadata <- attr(crosswalk, "crosswalk_metadata")
 
+  # Extract source geography from metadata for state analysis determination
+  source_geography <- if (!is.null(crosswalk_metadata)) {
+    crosswalk_metadata$source_geography
+  } else {
+    NULL
+  }
+
+  # Report join quality (if enabled)
+  join_quality <- if (show_join_quality) {
+    report_join_quality(
+      data = data,
+      crosswalk = crosswalk,
+      geoid_column = geoid_column,
+      step_number = step_number,
+      total_steps = total_steps,
+      source_geography = source_geography)
+  } else {
+    NULL
+  }
+
   # Determine grouping columns (target_geography_name may not always be present)
   group_cols <- "target_geoid"
   if ("target_geography_name" %in% names(crosswalk)) {
@@ -286,6 +615,17 @@ apply_single_crosswalk <- function(
   current_count_cols <- intersect(count_columns, names(data))
   current_non_count_cols <- intersect(non_count_columns, names(data))
 
+  # Identify "other" columns (not geoid, count, or non-count columns)
+  # These will be aggregated by taking the first non-missing value
+  crosswalk_cols <- c("source_geoid", "target_geoid", "allocation_factor_source_to_target",
+                      "target_geography_name", "weighting_factor", "source_year", "target_year",
+                      "population_2020", "housing_2020", "land_area_sqmi")
+  other_cols <- setdiff(
+    names(data),
+    c(geoid_column, current_count_cols, current_non_count_cols, crosswalk_cols)
+  )
+
+
   # Join crosswalk to data
   result <- data |>
     dplyr::mutate(
@@ -305,6 +645,10 @@ apply_single_crosswalk <- function(
       tidytable::across(
         .cols = tidytable::all_of(current_non_count_cols),
         .fns = ~ stats::weighted.mean(.x, allocation_factor_source_to_target, na.rm = TRUE)),
+      ## other columns: take first non-missing value (or NA if all missing)
+      tidytable::across(
+        .cols = tidytable::all_of(other_cols),
+        .fns = ~ dplyr::first(.x, na_rm = TRUE)),
       tidytable::across(
         .cols = tidytable::all_of(c(current_count_cols, current_non_count_cols)),
         .fns = ~ sum(!is.na(.x)),
@@ -312,7 +656,7 @@ apply_single_crosswalk <- function(
     tidytable::mutate(
       tidytable::across(
         .cols = tidytable::all_of(c(current_count_cols, current_non_count_cols)),
-        .fns = ~ tidytable::if_else(get(stringr::str_c(tidytable::cur_column(), "_validx$")) > 0, .x, NA))) |>
+        .fns = ~ tidytable::if_else(get(stringr::str_c(tidytable::cur_column(), "_validx")) > 0, .x, NA))) |>
     dplyr::select(-dplyr::matches("_validx$")) |>
     dplyr::rename_with(
       .cols = dplyr::everything(),
@@ -322,5 +666,8 @@ apply_single_crosswalk <- function(
   # Attach metadata
   attr(result, "crosswalk_metadata") <- crosswalk_metadata
 
+  # Attach join quality statistics
+  attr(result, "join_quality") <- join_quality
+
   return(result)
 }
diff --git a/R/get_crosswalk.R b/R/get_crosswalk.R
index add135b..5f8b0f3 100644
--- a/R/get_crosswalk.R
+++ b/R/get_crosswalk.R
@@ -1,4 +1,4 @@
-#' Get a Geographic Crosswalk
+#' Get a crosswalk(s) to translate data across time and geographies
 #'
 #' Retrieves a crosswalk with interpolation values from a source geography to a target
 #' geography, optionally across different years. Always returns a list with a consistent
@@ -359,4 +359,6 @@ and counties. The provided geography '", geography, "' is not supported.")}
   return(result)
 }
 
-utils::globalVariables(c("allocation_factor_source_to_target"))
\ No newline at end of file
+utils::globalVariables(c(
+  "allocation_factor_source_to_target", "geoid", "label", 
+  "n_unmatched", "pct_of_unmatched", "state_abbr"))
\ No newline at end of file
diff --git a/R/get_crosswalk_chain.R b/R/get_crosswalk_chain.R
index 1f630be..44bcdc4 100644
--- a/R/get_crosswalk_chain.R
+++ b/R/get_crosswalk_chain.R
@@ -18,28 +18,8 @@
 #'     \item{message}{A formatted message describing the crosswalk chain}
 #'   }
 #'
-#' @export
-#' @examples
-#' \dontrun{
-#' # Get crosswalks for 2010 tracts to 2020 ZCTAs (requires two steps)
-#' chain <- get_crosswalk_chain(
-#'   source_geography = "tract",
-#'   target_geography = "zcta",
-#'   source_year = 2010,
-#'   target_year = 2020,
-#'   weight = "population")
-#'
-#' # Apply crosswalks sequentially
-#' data_step1 <- crosswalk_data(
-#'   data = my_data,
-#'   crosswalk = chain$crosswalks$step_1,
-#'   count_columns = "count_population")
-#'
-#' data_final <- crosswalk_data(
-#'   data = data_step1,
-#'   crosswalk = chain$crosswalks$step_2,
-#'   count_columns = "count_population")
-#' }
+#' @keywords internal
+#' @noRd
 get_crosswalk_chain <- function(
     source_geography,
     target_geography,
diff --git a/R/get_geocorr_crosswalk.R b/R/get_geocorr_crosswalk.R
index e9c9ba0..0dbcf20 100644
--- a/R/get_geocorr_crosswalk.R
+++ b/R/get_geocorr_crosswalk.R
@@ -44,7 +44,7 @@ get_geocorr_crosswalk <- function(
     target_geography,
     weight = "population",
     cache = NULL) {
-
+  
   outpath = "no file exists here"
   ## identify the relevant file paths for potentially-cached crosswalks
   if (!is.null(cache)) {
@@ -239,7 +239,7 @@ get_geocorr_crosswalk <- function(
 
     df1 = readr::read_csv(file.path("https://mcdc.missouri.edu", "temp", csv_path), show_col_types = FALSE) |>
       janitor::clean_names() }
-
+  
   df2 = df1 |>
     dplyr::slice(2:nrow(df1)) |>
     ## naming conventions for some geographies are inconsistent; we standardize
@@ -291,7 +291,10 @@ get_geocorr_crosswalk <- function(
       dplyr::across(
         .cols = dplyr::matches("^cd11"),
         .fns = ~ stringr::str_c(stab, "-", .x),
-        .names = "{.col}_name")) |>
+        .names = "{.col}_name"),
+      dplyr::across(
+        .cols = dplyr::matches("puma22"),
+        .fns = ~ stringr::str_c(state, .x) )) |>
     dplyr::rename_with(
       .cols = dplyr::matches("state|stab"),
       .fn = ~ stringr::str_replace_all(.x, c("state" = "state_fips", "stab" = "state_abbreviation"))) |>
diff --git a/R/get_nhgis_crosswalk.R b/R/get_nhgis_crosswalk.R
index c36f0df..8f0d6e9 100644
--- a/R/get_nhgis_crosswalk.R
+++ b/R/get_nhgis_crosswalk.R
@@ -109,7 +109,7 @@ standardize_geography <- function(geography, context = "source") {
 available crosswalks.")
 }
 
-#' List Available NHGIS Crosswalks
+#' List supported NHGIS crosswalks
 #'
 #' Returns a tibble of all available NHGIS geographic crosswalks with their
 #' corresponding parameters that can be used with get_nhgis_crosswalk().
@@ -408,6 +408,80 @@ list_nhgis_crosswalks <- function() {
   return(nhgis_crosswalks)
 }
 
+
+#' Check if Direct NHGIS Crosswalk is Available
+#'
+#' Internal helper function that checks if NHGIS provides a direct crosswalk
+#' for the given source/target geography and year combination. This is used
+#' to determine if a single-step NHGIS crosswalk can be used instead of
+#' a multi-step approach.
+#'
+#' @param source_geography Character. Source geography name.
+#' @param target_geography Character. Target geography name.
+#' @param source_year Character or numeric. Source year.
+#' @param target_year Character or numeric. Target year.
+#'
+#' @return Logical. TRUE if NHGIS has a direct crosswalk, FALSE otherwise.
+#' @keywords internal
+#' @noRd
+is_nhgis_crosswalk_available <- function(
+    source_geography,
+    target_geography,
+    source_year,
+    target_year) {
+
+  # Get the list of available crosswalks
+  available_crosswalks <- list_nhgis_crosswalks()
+
+  # Standardize inputs for matching
+  source_year_chr <- as.character(source_year)
+  target_year_chr <- as.character(target_year)
+
+  # Standardize geography names to match list_nhgis_crosswalks() output
+  source_geog_std <- standardize_geography_for_nhgis_check(source_geography)
+  target_geog_std <- standardize_geography_for_nhgis_check(target_geography)
+
+  # Check if matching crosswalk exists
+  match_exists <- available_crosswalks |>
+    dplyr::filter(
+      source_geography == source_geog_std,
+      target_geography == target_geog_std,
+      source_year == source_year_chr,
+      target_year == target_year_chr) |>
+    nrow() > 0
+
+  return(match_exists)
+}
+
+
+#' Standardize Geography for NHGIS Availability Check
+#'
+#' Internal helper to standardize geography names to match list_nhgis_crosswalks() output.
+#'
+#' @param geography Character. Geography name in various formats.
+#' @return Character. Standardized geography name matching list_nhgis_crosswalks() output.
+#' @keywords internal
+#' @noRd
+standardize_geography_for_nhgis_check <- function(geography) {
+  geography <- geography |>
+    stringr::str_to_lower() |>
+    stringr::str_squish() |>
+    stringr::str_replace_all("_", " ")
+
+  dplyr::case_when(
+    geography %in% c("blk", "block", "blocks", "census block") ~ "block",
+    geography %in% c("bg", "blockgroup", "block group", "census block group") ~ "block_group",
+    geography %in% c("tr", "tract", "tracts", "census tract") ~ "tract",
+    geography %in% c("co", "county", "counties", "cnty") ~ "county",
+    geography %in% c("pl", "place", "places") ~ "place",
+    geography %in% c("zcta", "zctas", "zip code", "zip code tabulation area") ~ "zcta",
+    geography %in% c("puma", "pumas", "puma22", "public use microdata area") ~ "puma",
+    geography %in% c("cbsa", "core based statistical area") ~ "core_based_statistical_area",
+    geography %in% c("ua", "urban area", "urban areas") ~ "urban_area",
+    TRUE ~ geography)
+}
+
+
 #' Get NHGIS Geographic Crosswalk
 #'
 #' Retrieves a geographic crosswalk from the IPUMS NHGIS API based on user-specified
diff --git a/R/plan_crosswalk_chain.R b/R/plan_crosswalk_chain.R
index ee1a38b..20be49f 100644
--- a/R/plan_crosswalk_chain.R
+++ b/R/plan_crosswalk_chain.R
@@ -113,7 +113,32 @@ plan_crosswalk_chain <- function(
     return(result)
   }
 
-  # Case 4: Different geography AND different year - multi-step required
+  # Case 4: Different geography AND different year
+  # First, check if NHGIS has a direct crosswalk for this combination
+  # (e.g., block 2010 -> zcta 2020 is available directly from NHGIS)
+  nhgis_direct_available <- is_nhgis_crosswalk_available(
+    source_geography = source_geography,
+    target_geography = target_geography,
+    source_year = source_year,
+    target_year = target_year)
+
+  if (nhgis_direct_available) {
+    # Single-step NHGIS crosswalk available
+    result$steps <- tibble::tibble(
+      step_number = 1L,
+      source_geography = source_geography,
+      source_year = source_year_chr,
+      target_geography = target_geography,
+      target_year = target_year_chr,
+      crosswalk_source = "nhgis",
+      description = stringr::str_c(
+        source_year_chr, " ", source_geog_std, " -> ",
+        target_year_chr, " ", target_geog_std, " (direct NHGIS crosswalk)"))
+    result$composition_note <- "Single crosswalk; use allocation_factor_source_to_target directly."
+    return(result)
+  }
+
+  # No direct NHGIS crosswalk available - multi-step required
   result$is_multi_step <- TRUE
 
   # Check if NHGIS supports the source geography for inter-temporal crosswalk
diff --git a/README.Rmd b/README.Rmd
new file mode 100644
index 0000000..3be4410
--- /dev/null
+++ b/README.Rmd
@@ -0,0 +1,336 @@
+---
+output: github_document
+---
+
+```{r, include = FALSE}
+knitr::opts_chunk$set(
+  collapse = TRUE,
+  comment = "#>",
+  fig.path = "man/figures/README-",
+  out.width = "100%",
+  eval = FALSE
+)
+```
+
+# crosswalk
+
+An R interface to inter-geography and inter-temporal crosswalks.
+
+## Overview
+
+This package provides a consistent API and standardized versions of crosswalks to enable consistent approaches that work across different geography and year combinations. The package also facilitates
+interpolation--that is, adjusting source geography/year values by their crosswalk weights and translating
+these values to the desired target geography/year--including diagnostics of the joins between source data
+and crosswalks.
+
+The package sources crosswalks from:
+
+-   **Geocorr 2022** (Missouri Census Data Center) - for same-year crosswalks between geographies
+-   **IPUMS NHGIS** - for inter-temporal crosswalks (across different census years)
+-   **CT Data Collaborative** - for Connecticut 2020→2022 crosswalks (planning region changes)
+
+## Why Use `crosswalk`?
+
+-   **Programmatic access**: No more manual downloads from web interfaces
+-   **Standardized output**: Consistent column names across all crosswalk sources
+-   **Metadata tracking**: Full provenance stored as attributes
+-   **Multi-step handling**: Automatic chaining when both geography and year change
+-   **Local caching**: Reproducible workflows with cached crosswalks
+
+## Installation
+
+```{r}
+# Install from GitHub
+renv::install("UI-Research/crosswalk")
+```
+
+## Overview
+
+First we obtain a crosswalk and apply it to our data:
+```{r}
+library(crosswalk)
+library(dplyr)
+library(stringr)
+library(sf)
+
+source_data = tidycensus::get_acs(
+    year = 2023,
+    geography = "zcta",
+    output = "wide",
+    variables = c(below_poverty_level = "B17001_002")) %>%
+  dplyr::select(
+    source_geoid = GEOID,
+    count_below_poverty_level = below_poverty_levelE)
+
+get_crosswalk(
+  source_geography = "block",
+  target_geography = "puma",
+  source_year = 2010,
+  target_year = 2020,
+  weight = "population")
+
+# Get a crosswalk from ZCTAs to PUMAs (same year, uses Geocorr (2022))
+zcta_puma_crosswalk <- get_crosswalk(
+  source_geography = "zcta",
+  target_geography = "puma22",
+  weight = "population")
+
+# Apply the crosswalk to your data
+crosswalked_data <- crosswalk_data(
+  geoid_column = "source_geoid",
+  data = source_data,
+  crosswalk = zcta_puma_crosswalk)
+```
+
+What does the crosswalk(s) reflect and how was it sourced?
+```{r}
+attr(crosswalked_data, "crosswalk_metadata")
+```
+
+How well did the crosswalk join to our source data?
+```{r}
+## look at all the characteristics of the join(s) between the source data
+## and the crosswalks
+join_quality = attr(crosswalked_data, "join_quality")
+
+## what share of records in the source data do not join to a crosswalk and
+## thus are dropped during the crosswalking process?
+join_quality$pct_data_unmatched
+
+## zctas aren't nested within states, otherwise join_quality$state_analysis_data 
+## would help us to ID whether non-joining source data were clustered within one
+## or a few states. instead we can join to spatial data to diagnose further:
+zctas_sf = tigris::zctas(year = 2023)
+states_sf = tigris::states(year = 2023, cb = TRUE)
+
+## apart from DC, which has a disproportionate number of non-joining ZCTAs--
+## seemingly corresponding to federal areas and buildings--the distribution of
+## non-joining ZCTAs appears proportionate to state-level populations and is 
+## distributed across many states:
+zctas_sf %>% 
+  dplyr::filter(GEOID20 %in% join_quality$data_geoids_unmatched) %>%
+  sf::st_intersection(states_sf %>% select(NAME)) %>%
+  sf::st_drop_geometry() %>%
+  dplyr::count(NAME, sort = TRUE)
+```
+
+And how accurate was the crosswalking process?
+```{r}
+comparison_data = tidycensus::get_acs(
+    year = 2023,
+    geography = "puma",
+    output = "wide",
+    variables = c(
+      below_poverty_level = "B17001_002")) %>%
+  dplyr::select(
+    source_geoid = GEOID,
+    count_below_poverty_level_acs = below_poverty_levelE)
+
+combined_data = dplyr::left_join(
+  comparison_data,
+  crosswalked_data,
+  by = c("source_geoid" = "geoid")) 
+  
+combined_data %>%
+  dplyr::select(source_geoid, dplyr::matches("count")) %>%
+  dplyr::mutate(difference_percent = (count_below_poverty_level_acs - count_below_poverty_level) / count_below_poverty_level_acs) %>%
+  ggplot2::ggplot() +
+    ggplot2::geom_histogram(ggplot2::aes(x = difference_percent)) +
+    ggplot2::theme_minimal() +
+    ggplot2::theme(panel.grid = ggplot2::element_blank()) +
+    ggplot2::scale_x_continuous(labels = scales::percent) +
+    ggplot2::labs(
+      title = "Crosswalked data approximates observed values",
+      subtitle = "Block group-level source data would produce more accurate crosswalked values",
+      y = "",
+      x = "Percent difference between observed and crosswalked values")
+```
+
+## Core Functions
+
+The package has two main functions:
+
+| Function | Purpose |
+|--------------------------------------|----------------------------------|
+| `get_crosswalk()` | Fetch crosswalk(s) |
+| `crosswalk_data()` | Apply crosswalk(s) to interpolate data to the target geography-year |
+
+## Understanding `get_crosswalk()` Output
+
+`get_crosswalk()` **always returns a list** structured as follows:
+
+```{r}
+result <- get_crosswalk(
+  source_geography = "tract",
+  target_geography = "zcta",
+  source_year = 2010,
+  target_year = 2020,
+  weight = "population"
+)
+
+names(result)
+#> [1] "crosswalks" "plan" "message"
+```
+
+The list contains three elements:
+
+| Element      | Description                                                  |
+|------------------------------|------------------------------------------|
+| `crosswalks` | A named list of crosswalks (`step_1`, `step_2`, etc.) of length one or greater |
+| `plan`       | Details about what crosswalks are being fetched              |
+| `message`    | A human-readable description of the crosswalk chain          |
+
+### Single-Step vs. Multi-Step Crosswalks
+
+**Single-step crosswalks** (same year, different geography OR same geography, different year):
+
+```{r}
+# Same year, different geography (Geocorr)
+result <- get_crosswalk(
+  source_geography = "tract",
+  target_geography = "zcta",
+  weight = "population"
+)
+# result$crosswalks$step_1 contains one crosswalk
+
+# Same geography, different year (NHGIS)
+result <- get_crosswalk(
+  source_geography = "tract",
+  target_geography = "tract",
+  source_year = 2010,
+  target_year = 2020
+)
+# result$crosswalks$step_1 contains one crosswalk
+```
+
+**Multi-step crosswalks** (different geography AND different year):
+
+When both geography and year change, no single crosswalk source provides this directly. The package automatically plans and fetches a two-step chain:
+
+1.  **Step 1 (NHGIS)**: Change year, keep geography constant
+2.  **Step 2 (Geocorr)**: Change geography at target year
+
+```{r}
+result <- get_crosswalk(
+  source_geography = "tract",
+  target_geography = "zcta",
+  source_year = 2010,
+  target_year = 2020,
+  weight = "population"
+)
+
+# Two crosswalks are returned
+names(result$crosswalks)
+#> [1] "step_1" "step_2"
+
+# Step 1: 2010 tracts -> 2020 tracts (NHGIS)
+# Step 2: 2020 tracts -> 2020 ZCTAs (Geocorr)
+```
+
+### Crosswalk Structure
+
+Each crosswalk contains standardized columns:
+
+| Column | Description |
+|----------------------------|--------------------------------------------|
+| `source_geoid` | Identifier for source geography |
+| `target_geoid` | Identifier for target geography |
+| `allocation_factor_source_to_target` | Weight for interpolating values |
+| `weighting_factor` | What attribute was used (population, housing, land) |
+
+Additional columns may include `source_year`, `target_year`, `population_2020`, `housing_2020`, and `land_area_sqmi` depending on the source.
+
+### Accessing Metadata
+
+Each crosswalk tibble has a `crosswalk_metadata` attribute that documents what the crosswalk
+represents and how it was created:
+
+```{r}
+metadata <- attr(result$crosswalks$step_1, "crosswalk_metadata")
+names(metadata)
+#> [1] "call_parameters" "data_source" "data_source_full_name" "download_url" ...
+```
+
+## Using `crosswalk_data()` to Interpolate Data
+
+`crosswalk_data()` applies crosswalk weights to transform your data. It automatically handles multi-step crosswalks.
+
+### Column Naming Convention
+
+The function auto-detects columns based on prefixes:
+
+| Prefix | Treatment |
+|-------------------------------|-----------------------------------------|
+| `count_` | Summed after weighting (for counts like population, housing units) |
+| `mean_`, `median_`, `percent_`, `ratio_` | Weighted mean (for rates, percentages, averages) |
+
+You can also specify columns explicitly via `count_columns` and `non_count_columns`. 
+All non-count variables are interpolated using weighted means, weighting by the allocation factor from the crosswalk.
+
+
+## Supported Geography and Year Combinations
+
+### Inter-Geography Crosswalks (Geocorr)
+
+2022-vintage crosswalks between any of these geographies:
+
+-   block, block group, tract, county
+-   place, zcta, puma22
+-   cd118, cd119, urban_area, core_based_statistical_area
+
+### Inter-Temporal Crosswalks (NHGIS)
+
+NHGIS provides cross-decade crosswalks with the following structure:
+
+**Source geographies:** block, block_group, tract
+
+**Target geographies:**
+- From blocks (decennial years only): block, block_group, tract, county, place, zcta, puma, urban_area, cbsa
+- From block_group or tract: block_group, tract, county
+
+| Source Years                 | Target Years                 |
+|------------------------------|------------------------------|
+| 1990, 2000                   | 2010, 2014, 2015, 2020, 2022 |
+| 2010, 2011, 2012, 2014, 2015 | 1990, 2000, 2020, 2022       |
+| 2020, 2022                   | 1990, 2000, 2010, 2014, 2015 |
+
+**Notes:**
+- Within-decade crosswalks (e.g., 2010→2014) are not available from NHGIS
+- Block→ZCTA, Block→PUMA, etc. are only available for decennial years (1990, 2000, 2010, 2020)
+- The package automatically uses direct NHGIS crosswalks when available (e.g., `get_crosswalk(source_geography = "block", target_geography = "zcta", source_year = 2010, target_year = 2020)` returns a single-step NHGIS crosswalk)
+
+### 2020→2022 Crosswalks (CTData)
+
+For 2020 to 2022 transformations, the package uses CT Data Collaborative crosswalks for Connecticut (where planning regions replaced counties) and identity mappings for other states (where no changes occurred).
+
+## API Keys
+
+NHGIS crosswalks require an IPUMS API key. Get one at https://account.ipums.org/api_keys and add to your `.Renviron`:
+
+```{r}
+usethis::edit_r_environ()
+# Add: IPUMS_API_KEY=your_key_here
+```
+
+## Caching
+
+Use the `cache` parameter to save crosswalks locally for ease:
+
+```{r}
+result <- get_crosswalk(
+  source_geography = "tract",
+  target_geography = "zcta",
+  weight = "population",
+  cache = here::here("crosswalks-cache"))
+```
+
+
+## Citations
+
+The intellectual credit for the underlying crosswalks belongs to the original developers.
+
+**For NHGIS**, see citation requirements at: https://www.nhgis.org/citation-and-use-nhgis-data
+
+**For Geocorr**, a suggested citation:
+
+> Missouri Census Data Center, University of Missouri. (2022). Geocorr 2022: Geographic Correspondence Engine. Retrieved from: https://mcdc.missouri.edu/applications/geocorr2022.html
\ No newline at end of file
diff --git a/README.html b/README.html
new file mode 100644
index 0000000..575b5f1
--- /dev/null
+++ b/README.html
@@ -0,0 +1,976 @@
+<!DOCTYPE html>
+
+<html xmlns="http://www.w3.org/1999/xhtml">
+
+<head>
+
+<meta charset="utf-8">
+<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
+<meta name="generator" content="pandoc" />
+<meta name="viewport" content="width=device-width, initial-scale=1">
+
+<style type="text/css">
+@font-face {
+font-family: octicons-link;
+src: url(data:font/woff;charset=utf-8;base64,d09GRgABAAAAAAZwABAAAAAACFQAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAABEU0lHAAAGaAAAAAgAAAAIAAAAAUdTVUIAAAZcAAAACgAAAAoAAQAAT1MvMgAAAyQAAABJAAAAYFYEU3RjbWFwAAADcAAAAEUAAACAAJThvmN2dCAAAATkAAAABAAAAAQAAAAAZnBnbQAAA7gAAACyAAABCUM+8IhnYXNwAAAGTAAAABAAAAAQABoAI2dseWYAAAFsAAABPAAAAZwcEq9taGVhZAAAAsgAAAA0AAAANgh4a91oaGVhAAADCAAAABoAAAAkCA8DRGhtdHgAAAL8AAAADAAAAAwGAACfbG9jYQAAAsAAAAAIAAAACABiATBtYXhwAAACqAAAABgAAAAgAA8ASm5hbWUAAAToAAABQgAAAlXu73sOcG9zdAAABiwAAAAeAAAAME3QpOBwcmVwAAAEbAAAAHYAAAB/aFGpk3jaTY6xa8JAGMW/O62BDi0tJLYQincXEypYIiGJjSgHniQ6umTsUEyLm5BV6NDBP8Tpts6F0v+k/0an2i+itHDw3v2+9+DBKTzsJNnWJNTgHEy4BgG3EMI9DCEDOGEXzDADU5hBKMIgNPZqoD3SilVaXZCER3/I7AtxEJLtzzuZfI+VVkprxTlXShWKb3TBecG11rwoNlmmn1P2WYcJczl32etSpKnziC7lQyWe1smVPy/Lt7Kc+0vWY/gAgIIEqAN9we0pwKXreiMasxvabDQMM4riO+qxM2ogwDGOZTXxwxDiycQIcoYFBLj5K3EIaSctAq2kTYiw+ymhce7vwM9jSqO8JyVd5RH9gyTt2+J/yUmYlIR0s04n6+7Vm1ozezUeLEaUjhaDSuXHwVRgvLJn1tQ7xiuVv/ocTRF42mNgZGBgYGbwZOBiAAFGJBIMAAizAFoAAABiAGIAznjaY2BkYGAA4in8zwXi+W2+MjCzMIDApSwvXzC97Z4Ig8N/BxYGZgcgl52BCSQKAA3jCV8CAABfAAAAAAQAAEB42mNgZGBg4f3vACQZQABIMjKgAmYAKEgBXgAAeNpjYGY6wTiBgZWBg2kmUxoDA4MPhGZMYzBi1AHygVLYQUCaawqDA4PChxhmh/8ODDEsvAwHgMKMIDnGL0x7gJQCAwMAJd4MFwAAAHjaY2BgYGaA4DAGRgYQkAHyGMF8NgYrIM3JIAGVYYDT+AEjAwuDFpBmA9KMDEwMCh9i/v8H8sH0/4dQc1iAmAkALaUKLgAAAHjaTY9LDsIgEIbtgqHUPpDi3gPoBVyRTmTddOmqTXThEXqrob2gQ1FjwpDvfwCBdmdXC5AVKFu3e5MfNFJ29KTQT48Ob9/lqYwOGZxeUelN2U2R6+cArgtCJpauW7UQBqnFkUsjAY/kOU1cP+DAgvxwn1chZDwUbd6CFimGXwzwF6tPbFIcjEl+vvmM/byA48e6tWrKArm4ZJlCbdsrxksL1AwWn/yBSJKpYbq8AXaaTb8AAHja28jAwOC00ZrBeQNDQOWO//sdBBgYGRiYWYAEELEwMTE4uzo5Zzo5b2BxdnFOcALxNjA6b2ByTswC8jYwg0VlNuoCTWAMqNzMzsoK1rEhNqByEyerg5PMJlYuVueETKcd/89uBpnpvIEVomeHLoMsAAe1Id4AAAAAAAB42oWQT07CQBTGv0JBhagk7HQzKxca2sJCE1hDt4QF+9JOS0nbaaYDCQfwCJ7Au3AHj+LO13FMmm6cl7785vven0kBjHCBhfpYuNa5Ph1c0e2Xu3jEvWG7UdPDLZ4N92nOm+EBXuAbHmIMSRMs+4aUEd4Nd3CHD8NdvOLTsA2GL8M9PODbcL+hD7C1xoaHeLJSEao0FEW14ckxC+TU8TxvsY6X0eLPmRhry2WVioLpkrbp84LLQPGI7c6sOiUzpWIWS5GzlSgUzzLBSikOPFTOXqly7rqx0Z1Q5BAIoZBSFihQYQOOBEdkCOgXTOHA07HAGjGWiIjaPZNW13/+lm6S9FT7rLHFJ6fQbkATOG1j2OFMucKJJsxIVfQORl+9Jyda6Sl1dUYhSCm1dyClfoeDve4qMYdLEbfqHf3O/AdDumsjAAB42mNgYoAAZQYjBmyAGYQZmdhL8zLdDEydARfoAqIAAAABAAMABwAKABMAB///AA8AAQAAAAAAAAAAAAAAAAABAAAAAA==) format('woff');
+}
+body {
+-webkit-text-size-adjust: 100%;
+text-size-adjust: 100%;
+color: #333;
+font-family: "Helvetica Neue", Helvetica, "Segoe UI", Arial, freesans, sans-serif, "Apple Color Emoji", "Segoe UI Emoji", "Segoe UI Symbol";
+font-size: 16px;
+line-height: 1.6;
+word-wrap: break-word;
+}
+a {
+background-color: transparent;
+}
+a:active,
+a:hover {
+outline: 0;
+}
+strong {
+font-weight: bold;
+}
+h1 {
+font-size: 2em;
+margin: 0.67em 0;
+}
+img {
+border: 0;
+}
+hr {
+box-sizing: content-box;
+height: 0;
+}
+pre {
+overflow: auto;
+}
+code,
+kbd,
+pre {
+font-family: monospace, monospace;
+font-size: 1em;
+}
+input {
+color: inherit;
+font: inherit;
+margin: 0;
+}
+html input[disabled] {
+cursor: default;
+}
+input {
+line-height: normal;
+}
+input[type="checkbox"] {
+box-sizing: border-box;
+padding: 0;
+}
+table {
+border-collapse: collapse;
+border-spacing: 0;
+}
+td,
+th {
+padding: 0;
+}
+* {
+box-sizing: border-box;
+}
+input {
+font: 13px / 1.4 Helvetica, arial, nimbussansl, liberationsans, freesans, clean, sans-serif, "Apple Color Emoji", "Segoe UI Emoji", "Segoe UI Symbol";
+}
+a {
+color: #4078c0;
+text-decoration: none;
+}
+a:hover,
+a:active {
+text-decoration: underline;
+}
+hr {
+height: 0;
+margin: 15px 0;
+overflow: hidden;
+background: transparent;
+border: 0;
+border-bottom: 1px solid #ddd;
+}
+hr:before {
+display: table;
+content: "";
+}
+hr:after {
+display: table;
+clear: both;
+content: "";
+}
+h1,
+h2,
+h3,
+h4,
+h5,
+h6 {
+margin-top: 15px;
+margin-bottom: 15px;
+line-height: 1.1;
+}
+h1 {
+font-size: 30px;
+}
+h2 {
+font-size: 21px;
+}
+h3 {
+font-size: 16px;
+}
+h4 {
+font-size: 14px;
+}
+h5 {
+font-size: 12px;
+}
+h6 {
+font-size: 11px;
+}
+blockquote {
+margin: 0;
+}
+ul,
+ol {
+padding: 0;
+margin-top: 0;
+margin-bottom: 0;
+}
+ol ol,
+ul ol {
+list-style-type: lower-roman;
+}
+ul ul ol,
+ul ol ol,
+ol ul ol,
+ol ol ol {
+list-style-type: lower-alpha;
+}
+dd {
+margin-left: 0;
+}
+code {
+font-family: Consolas, "Liberation Mono", Menlo, Courier, monospace;
+font-size: 12px;
+}
+pre {
+margin-top: 0;
+margin-bottom: 0;
+font: 12px Consolas, "Liberation Mono", Menlo, Courier, monospace;
+}
+.select::-ms-expand {
+opacity: 0;
+}
+.octicon {
+font: normal normal normal 16px/1 octicons-link;
+display: inline-block;
+text-decoration: none;
+text-rendering: auto;
+-webkit-font-smoothing: antialiased;
+-moz-osx-font-smoothing: grayscale;
+-webkit-user-select: none;
+-moz-user-select: none;
+-ms-user-select: none;
+user-select: none;
+}
+.octicon-link:before {
+content: '\f05c';
+}
+.markdown-body:before {
+display: table;
+content: "";
+}
+.markdown-body:after {
+display: table;
+clear: both;
+content: "";
+}
+.markdown-body>*:first-child {
+margin-top: 0 !important;
+}
+.markdown-body>*:last-child {
+margin-bottom: 0 !important;
+}
+a:not([href]) {
+color: inherit;
+text-decoration: none;
+}
+.anchor {
+display: inline-block;
+padding-right: 2px;
+margin-left: -18px;
+}
+.anchor:focus {
+outline: none;
+}
+h1,
+h2,
+h3,
+h4,
+h5,
+h6 {
+margin-top: 1em;
+margin-bottom: 16px;
+font-weight: bold;
+line-height: 1.4;
+}
+h1 .octicon-link,
+h2 .octicon-link,
+h3 .octicon-link,
+h4 .octicon-link,
+h5 .octicon-link,
+h6 .octicon-link {
+color: #000;
+vertical-align: middle;
+visibility: hidden;
+}
+h1:hover .anchor,
+h2:hover .anchor,
+h3:hover .anchor,
+h4:hover .anchor,
+h5:hover .anchor,
+h6:hover .anchor {
+text-decoration: none;
+}
+h1:hover .anchor .octicon-link,
+h2:hover .anchor .octicon-link,
+h3:hover .anchor .octicon-link,
+h4:hover .anchor .octicon-link,
+h5:hover .anchor .octicon-link,
+h6:hover .anchor .octicon-link {
+visibility: visible;
+}
+h1 {
+padding-bottom: 0.3em;
+font-size: 2.25em;
+line-height: 1.2;
+border-bottom: 1px solid #eee;
+}
+h1 .anchor {
+line-height: 1;
+}
+h2 {
+padding-bottom: 0.3em;
+font-size: 1.75em;
+line-height: 1.225;
+border-bottom: 1px solid #eee;
+}
+h2 .anchor {
+line-height: 1;
+}
+h3 {
+font-size: 1.5em;
+line-height: 1.43;
+}
+h3 .anchor {
+line-height: 1.2;
+}
+h4 {
+font-size: 1.25em;
+}
+h4 .anchor {
+line-height: 1.2;
+}
+h5 {
+font-size: 1em;
+}
+h5 .anchor {
+line-height: 1.1;
+}
+h6 {
+font-size: 1em;
+color: #777;
+}
+h6 .anchor {
+line-height: 1.1;
+}
+p,
+blockquote,
+ul,
+ol,
+dl,
+table,
+pre {
+margin-top: 0;
+margin-bottom: 16px;
+}
+hr {
+height: 4px;
+padding: 0;
+margin: 16px 0;
+background-color: #e7e7e7;
+border: 0 none;
+}
+ul,
+ol {
+padding-left: 2em;
+}
+ul ul,
+ul ol,
+ol ol,
+ol ul {
+margin-top: 0;
+margin-bottom: 0;
+}
+li>p {
+margin-top: 16px;
+}
+dl {
+padding: 0;
+}
+dl dt {
+padding: 0;
+margin-top: 16px;
+font-size: 1em;
+font-style: italic;
+font-weight: bold;
+}
+dl dd {
+padding: 0 16px;
+margin-bottom: 16px;
+}
+blockquote {
+padding: 0 15px;
+color: #777;
+border-left: 4px solid #ddd;
+}
+blockquote>:first-child {
+margin-top: 0;
+}
+blockquote>:last-child {
+margin-bottom: 0;
+}
+table {
+display: block;
+width: 100%;
+overflow: auto;
+word-break: normal;
+word-break: keep-all;
+}
+table th {
+font-weight: bold;
+}
+table th,
+table td {
+padding: 6px 13px;
+border: 1px solid #ddd;
+}
+table tr {
+background-color: #fff;
+border-top: 1px solid #ccc;
+}
+table tr:nth-child(2n) {
+background-color: #f8f8f8;
+}
+img {
+max-width: 100%;
+box-sizing: content-box;
+background-color: #fff;
+}
+code {
+padding: 0;
+padding-top: 0.2em;
+padding-bottom: 0.2em;
+margin: 0;
+font-size: 85%;
+background-color: rgba(0,0,0,0.04);
+border-radius: 3px;
+}
+code:before,
+code:after {
+letter-spacing: -0.2em;
+content: "\00a0";
+}
+pre>code {
+padding: 0;
+margin: 0;
+font-size: 100%;
+word-break: normal;
+white-space: pre;
+background: transparent;
+border: 0;
+}
+.highlight {
+margin-bottom: 16px;
+}
+.highlight pre,
+pre {
+padding: 16px;
+overflow: auto;
+font-size: 85%;
+line-height: 1.45;
+background-color: #f7f7f7;
+border-radius: 3px;
+}
+.highlight pre {
+margin-bottom: 0;
+word-break: normal;
+}
+pre {
+word-wrap: normal;
+}
+pre code {
+display: inline;
+max-width: initial;
+padding: 0;
+margin: 0;
+overflow: initial;
+line-height: inherit;
+word-wrap: normal;
+background-color: transparent;
+border: 0;
+}
+pre code:before,
+pre code:after {
+content: normal;
+}
+kbd {
+display: inline-block;
+padding: 3px 5px;
+font-size: 11px;
+line-height: 10px;
+color: #555;
+vertical-align: middle;
+background-color: #fcfcfc;
+border: solid 1px #ccc;
+border-bottom-color: #bbb;
+border-radius: 3px;
+box-shadow: inset 0 -1px 0 #bbb;
+}
+.pl-c {
+color: #969896;
+}
+.pl-c1,
+.pl-s .pl-v {
+color: #0086b3;
+}
+.pl-e,
+.pl-en {
+color: #795da3;
+}
+.pl-s .pl-s1,
+.pl-smi {
+color: #333;
+}
+.pl-ent {
+color: #63a35c;
+}
+.pl-k {
+color: #a71d5d;
+}
+.pl-pds,
+.pl-s,
+.pl-s .pl-pse .pl-s1,
+.pl-sr,
+.pl-sr .pl-cce,
+.pl-sr .pl-sra,
+.pl-sr .pl-sre {
+color: #183691;
+}
+.pl-v {
+color: #ed6a43;
+}
+.pl-id {
+color: #b52a1d;
+}
+.pl-ii {
+background-color: #b52a1d;
+color: #f8f8f8;
+}
+.pl-sr .pl-cce {
+color: #63a35c;
+font-weight: bold;
+}
+.pl-ml {
+color: #693a17;
+}
+.pl-mh,
+.pl-mh .pl-en,
+.pl-ms {
+color: #1d3e81;
+font-weight: bold;
+}
+.pl-mq {
+color: #008080;
+}
+.pl-mi {
+color: #333;
+font-style: italic;
+}
+.pl-mb {
+color: #333;
+font-weight: bold;
+}
+.pl-md {
+background-color: #ffecec;
+color: #bd2c00;
+}
+.pl-mi1 {
+background-color: #eaffea;
+color: #55a532;
+}
+.pl-mdr {
+color: #795da3;
+font-weight: bold;
+}
+.pl-mo {
+color: #1d3e81;
+}
+kbd {
+display: inline-block;
+padding: 3px 5px;
+font: 11px Consolas, "Liberation Mono", Menlo, Courier, monospace;
+line-height: 10px;
+color: #555;
+vertical-align: middle;
+background-color: #fcfcfc;
+border: solid 1px #ccc;
+border-bottom-color: #bbb;
+border-radius: 3px;
+box-shadow: inset 0 -1px 0 #bbb;
+}
+.task-list-item {
+list-style-type: none;
+}
+.task-list-item+.task-list-item {
+margin-top: 3px;
+}
+.task-list-item input {
+margin: 0 0.35em 0.25em -1.6em;
+vertical-align: middle;
+}
+:checked+.radio-label {
+z-index: 1;
+position: relative;
+border-color: #4078c0;
+}
+.sourceLine {
+display: inline-block;
+}
+code .kw { color: #000000; }
+code .dt { color: #ed6a43; }
+code .dv { color: #009999; }
+code .bn { color: #009999; }
+code .fl { color: #009999; }
+code .ch { color: #009999; }
+code .st { color: #183691; }
+code .co { color: #969896; }
+code .ot { color: #0086b3; }
+code .al { color: #a61717; }
+code .fu { color: #63a35c; }
+code .er { color: #a61717; background-color: #e3d2d2; }
+code .wa { color: #000000; }
+code .cn { color: #008080; }
+code .sc { color: #008080; }
+code .vs { color: #183691; }
+code .ss { color: #183691; }
+code .im { color: #000000; }
+code .va {color: #008080; }
+code .cf { color: #000000; }
+code .op { color: #000000; }
+code .bu { color: #000000; }
+code .ex { color: #000000; }
+code .pp { color: #999999; }
+code .at { color: #008080; }
+code .do { color: #969896; }
+code .an { color: #008080; }
+code .cv { color: #008080; }
+code .in { color: #008080; }
+</style>
+<style>
+body {
+box-sizing: border-box;
+min-width: 200px;
+max-width: 980px;
+margin: 0 auto;
+padding: 45px;
+padding-top: 0px;
+}
+</style>
+
+
+</head>
+
+<body>
+
+<h1 id="crosswalk">crosswalk</h1>
+<p>An R interface to inter-geography and inter-temporal crosswalks.</p>
+<h2 id="overview">Overview</h2>
+<p>This package provides a consistent API and standardized versions of
+crosswalks to enable consistent approaches that work across different
+geography and year combinations. The package also facilitates
+interpolation–that is, adjusting source geography/year values by their
+crosswalk weights and translating these values to the desired target
+geography/year–including diagnostics of the joins between source data
+and crosswalks.</p>
+<p>The package sources crosswalks from:</p>
+<ul>
+<li><strong>Geocorr 2022</strong> (Missouri Census Data Center) - for
+same-year crosswalks between geographies</li>
+<li><strong>IPUMS NHGIS</strong> - for inter-temporal crosswalks (across
+different census years)</li>
+<li><strong>CT Data Collaborative</strong> - for Connecticut 2020→2022
+crosswalks (planning region changes)</li>
+</ul>
+<h2 id="why-use-crosswalk">Why Use <code>crosswalk</code>?</h2>
+<ul>
+<li><strong>Programmatic access</strong>: No more manual downloads from
+web interfaces</li>
+<li><strong>Standardized output</strong>: Consistent column names across
+all crosswalk sources</li>
+<li><strong>Metadata tracking</strong>: Full provenance stored as
+attributes</li>
+<li><strong>Multi-step handling</strong>: Automatic chaining when both
+geography and year change</li>
+<li><strong>Local caching</strong>: Reproducible workflows with cached
+crosswalks</li>
+</ul>
+<h2 id="installation">Installation</h2>
+<div class="sourceCode" id="cb1"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb1-1"><a href="#cb1-1" tabindex="-1"></a><span class="co"># Install from GitHub</span></span>
+<span id="cb1-2"><a href="#cb1-2" tabindex="-1"></a>renv<span class="sc">::</span><span class="fu">install</span>(<span class="st">&quot;UI-Research/crosswalk&quot;</span>)</span></code></pre></div>
+<h2 id="overview-1">Overview</h2>
+<p>First we obtain a crosswalk and apply it to our data:</p>
+<div class="sourceCode" id="cb2"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb2-1"><a href="#cb2-1" tabindex="-1"></a><span class="fu">library</span>(crosswalk)</span>
+<span id="cb2-2"><a href="#cb2-2" tabindex="-1"></a><span class="fu">library</span>(dplyr)</span>
+<span id="cb2-3"><a href="#cb2-3" tabindex="-1"></a><span class="fu">library</span>(stringr)</span>
+<span id="cb2-4"><a href="#cb2-4" tabindex="-1"></a><span class="fu">library</span>(sf)</span>
+<span id="cb2-5"><a href="#cb2-5" tabindex="-1"></a></span>
+<span id="cb2-6"><a href="#cb2-6" tabindex="-1"></a>source_data <span class="ot">=</span> tidycensus<span class="sc">::</span><span class="fu">get_acs</span>(</span>
+<span id="cb2-7"><a href="#cb2-7" tabindex="-1"></a>    <span class="at">year =</span> <span class="dv">2023</span>,</span>
+<span id="cb2-8"><a href="#cb2-8" tabindex="-1"></a>    <span class="at">geography =</span> <span class="st">&quot;zcta&quot;</span>,</span>
+<span id="cb2-9"><a href="#cb2-9" tabindex="-1"></a>    <span class="at">output =</span> <span class="st">&quot;wide&quot;</span>,</span>
+<span id="cb2-10"><a href="#cb2-10" tabindex="-1"></a>    <span class="at">variables =</span> <span class="fu">c</span>(<span class="at">below_poverty_level =</span> <span class="st">&quot;B17001_002&quot;</span>)) <span class="sc">%&gt;%</span></span>
+<span id="cb2-11"><a href="#cb2-11" tabindex="-1"></a>  dplyr<span class="sc">::</span><span class="fu">select</span>(</span>
+<span id="cb2-12"><a href="#cb2-12" tabindex="-1"></a>    <span class="at">source_geoid =</span> GEOID,</span>
+<span id="cb2-13"><a href="#cb2-13" tabindex="-1"></a>    <span class="at">count_below_poverty_level =</span> below_poverty_levelE)</span>
+<span id="cb2-14"><a href="#cb2-14" tabindex="-1"></a></span>
+<span id="cb2-15"><a href="#cb2-15" tabindex="-1"></a><span class="fu">get_crosswalk</span>(</span>
+<span id="cb2-16"><a href="#cb2-16" tabindex="-1"></a>  <span class="at">source_geography =</span> <span class="st">&quot;block&quot;</span>,</span>
+<span id="cb2-17"><a href="#cb2-17" tabindex="-1"></a>  <span class="at">target_geography =</span> <span class="st">&quot;puma&quot;</span>,</span>
+<span id="cb2-18"><a href="#cb2-18" tabindex="-1"></a>  <span class="at">source_year =</span> <span class="dv">2010</span>,</span>
+<span id="cb2-19"><a href="#cb2-19" tabindex="-1"></a>  <span class="at">target_year =</span> <span class="dv">2020</span>,</span>
+<span id="cb2-20"><a href="#cb2-20" tabindex="-1"></a>  <span class="at">weight =</span> <span class="st">&quot;population&quot;</span>)</span>
+<span id="cb2-21"><a href="#cb2-21" tabindex="-1"></a></span>
+<span id="cb2-22"><a href="#cb2-22" tabindex="-1"></a><span class="co"># Get a crosswalk from ZCTAs to PUMAs (same year, uses Geocorr (2022))</span></span>
+<span id="cb2-23"><a href="#cb2-23" tabindex="-1"></a>zcta_puma_crosswalk <span class="ot">&lt;-</span> <span class="fu">get_crosswalk</span>(</span>
+<span id="cb2-24"><a href="#cb2-24" tabindex="-1"></a>  <span class="at">source_geography =</span> <span class="st">&quot;zcta&quot;</span>,</span>
+<span id="cb2-25"><a href="#cb2-25" tabindex="-1"></a>  <span class="at">target_geography =</span> <span class="st">&quot;puma22&quot;</span>,</span>
+<span id="cb2-26"><a href="#cb2-26" tabindex="-1"></a>  <span class="at">weight =</span> <span class="st">&quot;population&quot;</span>)</span>
+<span id="cb2-27"><a href="#cb2-27" tabindex="-1"></a></span>
+<span id="cb2-28"><a href="#cb2-28" tabindex="-1"></a><span class="co"># Apply the crosswalk to your data</span></span>
+<span id="cb2-29"><a href="#cb2-29" tabindex="-1"></a>crosswalked_data <span class="ot">&lt;-</span> <span class="fu">crosswalk_data</span>(</span>
+<span id="cb2-30"><a href="#cb2-30" tabindex="-1"></a>  <span class="at">geoid_column =</span> <span class="st">&quot;source_geoid&quot;</span>,</span>
+<span id="cb2-31"><a href="#cb2-31" tabindex="-1"></a>  <span class="at">data =</span> source_data,</span>
+<span id="cb2-32"><a href="#cb2-32" tabindex="-1"></a>  <span class="at">crosswalk =</span> zcta_puma_crosswalk)</span></code></pre></div>
+<p>What does the crosswalk(s) reflect and how was it sourced?</p>
+<div class="sourceCode" id="cb3"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb3-1"><a href="#cb3-1" tabindex="-1"></a><span class="fu">attr</span>(crosswalked_data, <span class="st">&quot;crosswalk_metadata&quot;</span>)</span></code></pre></div>
+<p>How well did the crosswalk join to our source data?</p>
+<div class="sourceCode" id="cb4"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb4-1"><a href="#cb4-1" tabindex="-1"></a><span class="do">## look at all the characteristics of the join(s) between the source data</span></span>
+<span id="cb4-2"><a href="#cb4-2" tabindex="-1"></a><span class="do">## and the crosswalks</span></span>
+<span id="cb4-3"><a href="#cb4-3" tabindex="-1"></a>join_quality <span class="ot">=</span> <span class="fu">attr</span>(crosswalked_data, <span class="st">&quot;join_quality&quot;</span>)</span>
+<span id="cb4-4"><a href="#cb4-4" tabindex="-1"></a></span>
+<span id="cb4-5"><a href="#cb4-5" tabindex="-1"></a><span class="do">## what share of records in the source data do not join to a crosswalk and</span></span>
+<span id="cb4-6"><a href="#cb4-6" tabindex="-1"></a><span class="do">## thus are dropped during the crosswalking process?</span></span>
+<span id="cb4-7"><a href="#cb4-7" tabindex="-1"></a>join_quality<span class="sc">$</span>pct_data_unmatched</span>
+<span id="cb4-8"><a href="#cb4-8" tabindex="-1"></a></span>
+<span id="cb4-9"><a href="#cb4-9" tabindex="-1"></a><span class="do">## zctas aren&#39;t nested within states, otherwise join_quality$state_analysis_data </span></span>
+<span id="cb4-10"><a href="#cb4-10" tabindex="-1"></a><span class="do">## would help us to ID whether non-joining source data were clustered within one</span></span>
+<span id="cb4-11"><a href="#cb4-11" tabindex="-1"></a><span class="do">## or a few states. instead we can join to spatial data to diagnose further:</span></span>
+<span id="cb4-12"><a href="#cb4-12" tabindex="-1"></a>zctas_sf <span class="ot">=</span> tigris<span class="sc">::</span><span class="fu">zctas</span>(<span class="at">year =</span> <span class="dv">2023</span>)</span>
+<span id="cb4-13"><a href="#cb4-13" tabindex="-1"></a>states_sf <span class="ot">=</span> tigris<span class="sc">::</span><span class="fu">states</span>(<span class="at">year =</span> <span class="dv">2023</span>, <span class="at">cb =</span> <span class="cn">TRUE</span>)</span>
+<span id="cb4-14"><a href="#cb4-14" tabindex="-1"></a></span>
+<span id="cb4-15"><a href="#cb4-15" tabindex="-1"></a><span class="do">## apart from DC, which has a disproportionate number of non-joining ZCTAs--</span></span>
+<span id="cb4-16"><a href="#cb4-16" tabindex="-1"></a><span class="do">## seemingly corresponding to federal areas and buildings--the distribution of</span></span>
+<span id="cb4-17"><a href="#cb4-17" tabindex="-1"></a><span class="do">## non-joining ZCTAs appears proportionate to state-level populations and is </span></span>
+<span id="cb4-18"><a href="#cb4-18" tabindex="-1"></a><span class="do">## distributed across many states:</span></span>
+<span id="cb4-19"><a href="#cb4-19" tabindex="-1"></a>zctas_sf <span class="sc">%&gt;%</span> </span>
+<span id="cb4-20"><a href="#cb4-20" tabindex="-1"></a>  dplyr<span class="sc">::</span><span class="fu">filter</span>(GEOID20 <span class="sc">%in%</span> join_quality<span class="sc">$</span>data_geoids_unmatched) <span class="sc">%&gt;%</span></span>
+<span id="cb4-21"><a href="#cb4-21" tabindex="-1"></a>  sf<span class="sc">::</span><span class="fu">st_intersection</span>(states_sf <span class="sc">%&gt;%</span> <span class="fu">select</span>(NAME)) <span class="sc">%&gt;%</span></span>
+<span id="cb4-22"><a href="#cb4-22" tabindex="-1"></a>  sf<span class="sc">::</span><span class="fu">st_drop_geometry</span>() <span class="sc">%&gt;%</span></span>
+<span id="cb4-23"><a href="#cb4-23" tabindex="-1"></a>  dplyr<span class="sc">::</span><span class="fu">count</span>(NAME, <span class="at">sort =</span> <span class="cn">TRUE</span>)</span></code></pre></div>
+<p>And how accurate was the crosswalking process?</p>
+<div class="sourceCode" id="cb5"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb5-1"><a href="#cb5-1" tabindex="-1"></a>comparison_data <span class="ot">=</span> tidycensus<span class="sc">::</span><span class="fu">get_acs</span>(</span>
+<span id="cb5-2"><a href="#cb5-2" tabindex="-1"></a>    <span class="at">year =</span> <span class="dv">2023</span>,</span>
+<span id="cb5-3"><a href="#cb5-3" tabindex="-1"></a>    <span class="at">geography =</span> <span class="st">&quot;puma&quot;</span>,</span>
+<span id="cb5-4"><a href="#cb5-4" tabindex="-1"></a>    <span class="at">output =</span> <span class="st">&quot;wide&quot;</span>,</span>
+<span id="cb5-5"><a href="#cb5-5" tabindex="-1"></a>    <span class="at">variables =</span> <span class="fu">c</span>(</span>
+<span id="cb5-6"><a href="#cb5-6" tabindex="-1"></a>      <span class="at">below_poverty_level =</span> <span class="st">&quot;B17001_002&quot;</span>)) <span class="sc">%&gt;%</span></span>
+<span id="cb5-7"><a href="#cb5-7" tabindex="-1"></a>  dplyr<span class="sc">::</span><span class="fu">select</span>(</span>
+<span id="cb5-8"><a href="#cb5-8" tabindex="-1"></a>    <span class="at">source_geoid =</span> GEOID,</span>
+<span id="cb5-9"><a href="#cb5-9" tabindex="-1"></a>    <span class="at">count_below_poverty_level_acs =</span> below_poverty_levelE)</span>
+<span id="cb5-10"><a href="#cb5-10" tabindex="-1"></a></span>
+<span id="cb5-11"><a href="#cb5-11" tabindex="-1"></a>combined_data <span class="ot">=</span> dplyr<span class="sc">::</span><span class="fu">left_join</span>(</span>
+<span id="cb5-12"><a href="#cb5-12" tabindex="-1"></a>  comparison_data,</span>
+<span id="cb5-13"><a href="#cb5-13" tabindex="-1"></a>  crosswalked_data,</span>
+<span id="cb5-14"><a href="#cb5-14" tabindex="-1"></a>  <span class="at">by =</span> <span class="fu">c</span>(<span class="st">&quot;source_geoid&quot;</span> <span class="ot">=</span> <span class="st">&quot;geoid&quot;</span>)) </span>
+<span id="cb5-15"><a href="#cb5-15" tabindex="-1"></a>  </span>
+<span id="cb5-16"><a href="#cb5-16" tabindex="-1"></a>combined_data <span class="sc">%&gt;%</span></span>
+<span id="cb5-17"><a href="#cb5-17" tabindex="-1"></a>  dplyr<span class="sc">::</span><span class="fu">select</span>(source_geoid, dplyr<span class="sc">::</span><span class="fu">matches</span>(<span class="st">&quot;count&quot;</span>)) <span class="sc">%&gt;%</span></span>
+<span id="cb5-18"><a href="#cb5-18" tabindex="-1"></a>  dplyr<span class="sc">::</span><span class="fu">mutate</span>(<span class="at">difference_percent =</span> (count_below_poverty_level_acs <span class="sc">-</span> count_below_poverty_level) <span class="sc">/</span> count_below_poverty_level_acs) <span class="sc">%&gt;%</span></span>
+<span id="cb5-19"><a href="#cb5-19" tabindex="-1"></a>  ggplot2<span class="sc">::</span><span class="fu">ggplot</span>() <span class="sc">+</span></span>
+<span id="cb5-20"><a href="#cb5-20" tabindex="-1"></a>    ggplot2<span class="sc">::</span><span class="fu">geom_histogram</span>(ggplot2<span class="sc">::</span><span class="fu">aes</span>(<span class="at">x =</span> difference_percent)) <span class="sc">+</span></span>
+<span id="cb5-21"><a href="#cb5-21" tabindex="-1"></a>    ggplot2<span class="sc">::</span><span class="fu">theme_minimal</span>() <span class="sc">+</span></span>
+<span id="cb5-22"><a href="#cb5-22" tabindex="-1"></a>    ggplot2<span class="sc">::</span><span class="fu">theme</span>(<span class="at">panel.grid =</span> ggplot2<span class="sc">::</span><span class="fu">element_blank</span>()) <span class="sc">+</span></span>
+<span id="cb5-23"><a href="#cb5-23" tabindex="-1"></a>    ggplot2<span class="sc">::</span><span class="fu">scale_x_continuous</span>(<span class="at">labels =</span> scales<span class="sc">::</span>percent) <span class="sc">+</span></span>
+<span id="cb5-24"><a href="#cb5-24" tabindex="-1"></a>    ggplot2<span class="sc">::</span><span class="fu">labs</span>(</span>
+<span id="cb5-25"><a href="#cb5-25" tabindex="-1"></a>      <span class="at">title =</span> <span class="st">&quot;Crosswalked data approximates observed values&quot;</span>,</span>
+<span id="cb5-26"><a href="#cb5-26" tabindex="-1"></a>      <span class="at">subtitle =</span> <span class="st">&quot;Block group-level source data would produce more accurate crosswalked values&quot;</span>,</span>
+<span id="cb5-27"><a href="#cb5-27" tabindex="-1"></a>      <span class="at">y =</span> <span class="st">&quot;&quot;</span>,</span>
+<span id="cb5-28"><a href="#cb5-28" tabindex="-1"></a>      <span class="at">x =</span> <span class="st">&quot;Percent difference between observed and crosswalked values&quot;</span>)</span></code></pre></div>
+<h2 id="core-functions">Core Functions</h2>
+<p>The package has two main functions:</p>
+<table>
+<thead>
+<tr class="header">
+<th>Function</th>
+<th>Purpose</th>
+</tr>
+</thead>
+<tbody>
+<tr class="odd">
+<td><code>get_crosswalk()</code></td>
+<td>Fetch crosswalk(s)</td>
+</tr>
+<tr class="even">
+<td><code>crosswalk_data()</code></td>
+<td>Apply crosswalk(s) to interpolate data to the target
+geography-year</td>
+</tr>
+</tbody>
+</table>
+<h2 id="understanding-get_crosswalk-output">Understanding
+<code>get_crosswalk()</code> Output</h2>
+<p><code>get_crosswalk()</code> <strong>always returns a list</strong>
+structured as follows:</p>
+<div class="sourceCode" id="cb6"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb6-1"><a href="#cb6-1" tabindex="-1"></a>result <span class="ot">&lt;-</span> <span class="fu">get_crosswalk</span>(</span>
+<span id="cb6-2"><a href="#cb6-2" tabindex="-1"></a>  <span class="at">source_geography =</span> <span class="st">&quot;tract&quot;</span>,</span>
+<span id="cb6-3"><a href="#cb6-3" tabindex="-1"></a>  <span class="at">target_geography =</span> <span class="st">&quot;zcta&quot;</span>,</span>
+<span id="cb6-4"><a href="#cb6-4" tabindex="-1"></a>  <span class="at">source_year =</span> <span class="dv">2010</span>,</span>
+<span id="cb6-5"><a href="#cb6-5" tabindex="-1"></a>  <span class="at">target_year =</span> <span class="dv">2020</span>,</span>
+<span id="cb6-6"><a href="#cb6-6" tabindex="-1"></a>  <span class="at">weight =</span> <span class="st">&quot;population&quot;</span></span>
+<span id="cb6-7"><a href="#cb6-7" tabindex="-1"></a>)</span>
+<span id="cb6-8"><a href="#cb6-8" tabindex="-1"></a></span>
+<span id="cb6-9"><a href="#cb6-9" tabindex="-1"></a><span class="fu">names</span>(result)</span>
+<span id="cb6-10"><a href="#cb6-10" tabindex="-1"></a><span class="co">#&gt; [1] &quot;crosswalks&quot; &quot;plan&quot; &quot;message&quot;</span></span></code></pre></div>
+<p>The list contains three elements:</p>
+<table>
+<thead>
+<tr class="header">
+<th>Element</th>
+<th>Description</th>
+</tr>
+</thead>
+<tbody>
+<tr class="odd">
+<td><code>crosswalks</code></td>
+<td>A named list of crosswalks (<code>step_1</code>,
+<code>step_2</code>, etc.) of length one or greater</td>
+</tr>
+<tr class="even">
+<td><code>plan</code></td>
+<td>Details about what crosswalks are being fetched</td>
+</tr>
+<tr class="odd">
+<td><code>message</code></td>
+<td>A human-readable description of the crosswalk chain</td>
+</tr>
+</tbody>
+</table>
+<h3 id="single-step-vs-multi-step-crosswalks">Single-Step vs. Multi-Step
+Crosswalks</h3>
+<p><strong>Single-step crosswalks</strong> (same year, different
+geography OR same geography, different year):</p>
+<div class="sourceCode" id="cb7"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb7-1"><a href="#cb7-1" tabindex="-1"></a><span class="co"># Same year, different geography (Geocorr)</span></span>
+<span id="cb7-2"><a href="#cb7-2" tabindex="-1"></a>result <span class="ot">&lt;-</span> <span class="fu">get_crosswalk</span>(</span>
+<span id="cb7-3"><a href="#cb7-3" tabindex="-1"></a>  <span class="at">source_geography =</span> <span class="st">&quot;tract&quot;</span>,</span>
+<span id="cb7-4"><a href="#cb7-4" tabindex="-1"></a>  <span class="at">target_geography =</span> <span class="st">&quot;zcta&quot;</span>,</span>
+<span id="cb7-5"><a href="#cb7-5" tabindex="-1"></a>  <span class="at">weight =</span> <span class="st">&quot;population&quot;</span></span>
+<span id="cb7-6"><a href="#cb7-6" tabindex="-1"></a>)</span>
+<span id="cb7-7"><a href="#cb7-7" tabindex="-1"></a><span class="co"># result$crosswalks$step_1 contains one crosswalk</span></span>
+<span id="cb7-8"><a href="#cb7-8" tabindex="-1"></a></span>
+<span id="cb7-9"><a href="#cb7-9" tabindex="-1"></a><span class="co"># Same geography, different year (NHGIS)</span></span>
+<span id="cb7-10"><a href="#cb7-10" tabindex="-1"></a>result <span class="ot">&lt;-</span> <span class="fu">get_crosswalk</span>(</span>
+<span id="cb7-11"><a href="#cb7-11" tabindex="-1"></a>  <span class="at">source_geography =</span> <span class="st">&quot;tract&quot;</span>,</span>
+<span id="cb7-12"><a href="#cb7-12" tabindex="-1"></a>  <span class="at">target_geography =</span> <span class="st">&quot;tract&quot;</span>,</span>
+<span id="cb7-13"><a href="#cb7-13" tabindex="-1"></a>  <span class="at">source_year =</span> <span class="dv">2010</span>,</span>
+<span id="cb7-14"><a href="#cb7-14" tabindex="-1"></a>  <span class="at">target_year =</span> <span class="dv">2020</span></span>
+<span id="cb7-15"><a href="#cb7-15" tabindex="-1"></a>)</span>
+<span id="cb7-16"><a href="#cb7-16" tabindex="-1"></a><span class="co"># result$crosswalks$step_1 contains one crosswalk</span></span></code></pre></div>
+<p><strong>Multi-step crosswalks</strong> (different geography AND
+different year):</p>
+<p>When both geography and year change, no single crosswalk source
+provides this directly. The package automatically plans and fetches a
+two-step chain:</p>
+<ol style="list-style-type: decimal">
+<li><strong>Step 1 (NHGIS)</strong>: Change year, keep geography
+constant</li>
+<li><strong>Step 2 (Geocorr)</strong>: Change geography at target
+year</li>
+</ol>
+<div class="sourceCode" id="cb8"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb8-1"><a href="#cb8-1" tabindex="-1"></a>result <span class="ot">&lt;-</span> <span class="fu">get_crosswalk</span>(</span>
+<span id="cb8-2"><a href="#cb8-2" tabindex="-1"></a>  <span class="at">source_geography =</span> <span class="st">&quot;tract&quot;</span>,</span>
+<span id="cb8-3"><a href="#cb8-3" tabindex="-1"></a>  <span class="at">target_geography =</span> <span class="st">&quot;zcta&quot;</span>,</span>
+<span id="cb8-4"><a href="#cb8-4" tabindex="-1"></a>  <span class="at">source_year =</span> <span class="dv">2010</span>,</span>
+<span id="cb8-5"><a href="#cb8-5" tabindex="-1"></a>  <span class="at">target_year =</span> <span class="dv">2020</span>,</span>
+<span id="cb8-6"><a href="#cb8-6" tabindex="-1"></a>  <span class="at">weight =</span> <span class="st">&quot;population&quot;</span></span>
+<span id="cb8-7"><a href="#cb8-7" tabindex="-1"></a>)</span>
+<span id="cb8-8"><a href="#cb8-8" tabindex="-1"></a></span>
+<span id="cb8-9"><a href="#cb8-9" tabindex="-1"></a><span class="co"># Two crosswalks are returned</span></span>
+<span id="cb8-10"><a href="#cb8-10" tabindex="-1"></a><span class="fu">names</span>(result<span class="sc">$</span>crosswalks)</span>
+<span id="cb8-11"><a href="#cb8-11" tabindex="-1"></a><span class="co">#&gt; [1] &quot;step_1&quot; &quot;step_2&quot;</span></span>
+<span id="cb8-12"><a href="#cb8-12" tabindex="-1"></a></span>
+<span id="cb8-13"><a href="#cb8-13" tabindex="-1"></a><span class="co"># Step 1: 2010 tracts -&gt; 2020 tracts (NHGIS)</span></span>
+<span id="cb8-14"><a href="#cb8-14" tabindex="-1"></a><span class="co"># Step 2: 2020 tracts -&gt; 2020 ZCTAs (Geocorr)</span></span></code></pre></div>
+<h3 id="crosswalk-structure">Crosswalk Structure</h3>
+<p>Each crosswalk contains standardized columns:</p>
+<table>
+<thead>
+<tr class="header">
+<th>Column</th>
+<th>Description</th>
+</tr>
+</thead>
+<tbody>
+<tr class="odd">
+<td><code>source_geoid</code></td>
+<td>Identifier for source geography</td>
+</tr>
+<tr class="even">
+<td><code>target_geoid</code></td>
+<td>Identifier for target geography</td>
+</tr>
+<tr class="odd">
+<td><code>allocation_factor_source_to_target</code></td>
+<td>Weight for interpolating values</td>
+</tr>
+<tr class="even">
+<td><code>weighting_factor</code></td>
+<td>What attribute was used (population, housing, land)</td>
+</tr>
+</tbody>
+</table>
+<p>Additional columns may include <code>source_year</code>,
+<code>target_year</code>, <code>population_2020</code>,
+<code>housing_2020</code>, and <code>land_area_sqmi</code> depending on
+the source.</p>
+<h3 id="accessing-metadata">Accessing Metadata</h3>
+<p>Each crosswalk tibble has a <code>crosswalk_metadata</code> attribute
+that documents what the crosswalk represents and how it was created:</p>
+<div class="sourceCode" id="cb9"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb9-1"><a href="#cb9-1" tabindex="-1"></a>metadata <span class="ot">&lt;-</span> <span class="fu">attr</span>(result<span class="sc">$</span>crosswalks<span class="sc">$</span>step_1, <span class="st">&quot;crosswalk_metadata&quot;</span>)</span>
+<span id="cb9-2"><a href="#cb9-2" tabindex="-1"></a><span class="fu">names</span>(metadata)</span>
+<span id="cb9-3"><a href="#cb9-3" tabindex="-1"></a><span class="co">#&gt; [1] &quot;call_parameters&quot; &quot;data_source&quot; &quot;data_source_full_name&quot; &quot;download_url&quot; ...</span></span></code></pre></div>
+<h2 id="using-crosswalk_data-to-interpolate-data">Using
+<code>crosswalk_data()</code> to Interpolate Data</h2>
+<p><code>crosswalk_data()</code> applies crosswalk weights to transform
+your data. It automatically handles multi-step crosswalks.</p>
+<h3 id="column-naming-convention">Column Naming Convention</h3>
+<p>The function auto-detects columns based on prefixes:</p>
+<table>
+<thead>
+<tr class="header">
+<th>Prefix</th>
+<th>Treatment</th>
+</tr>
+</thead>
+<tbody>
+<tr class="odd">
+<td><code>count_</code></td>
+<td>Summed after weighting (for counts like population, housing
+units)</td>
+</tr>
+<tr class="even">
+<td><code>mean_</code>, <code>median_</code>, <code>percent_</code>,
+<code>ratio_</code></td>
+<td>Weighted mean (for rates, percentages, averages)</td>
+</tr>
+</tbody>
+</table>
+<p>You can also specify columns explicitly via
+<code>count_columns</code> and <code>non_count_columns</code>. All
+non-count variables are interpolated using weighted means, weighting by
+the allocation factor from the crosswalk.</p>
+<h2 id="supported-geography-and-year-combinations">Supported Geography
+and Year Combinations</h2>
+<h3 id="inter-geography-crosswalks-geocorr">Inter-Geography Crosswalks
+(Geocorr)</h3>
+<p>2022-vintage crosswalks between any of these geographies:</p>
+<ul>
+<li>block, block group, tract, county</li>
+<li>place, zcta, puma22</li>
+<li>cd118, cd119, urban_area, core_based_statistical_area</li>
+</ul>
+<h3 id="inter-temporal-crosswalks-nhgis">Inter-Temporal Crosswalks
+(NHGIS)</h3>
+<p>NHGIS provides cross-decade crosswalks with the following
+structure:</p>
+<p><strong>Source geographies:</strong> block, block_group, tract</p>
+<p><strong>Target geographies:</strong> - From blocks (decennial years
+only): block, block_group, tract, county, place, zcta, puma, urban_area,
+cbsa - From block_group or tract: block_group, tract, county</p>
+<table>
+<thead>
+<tr class="header">
+<th>Source Years</th>
+<th>Target Years</th>
+</tr>
+</thead>
+<tbody>
+<tr class="odd">
+<td>1990, 2000</td>
+<td>2010, 2014, 2015, 2020, 2022</td>
+</tr>
+<tr class="even">
+<td>2010, 2011, 2012, 2014, 2015</td>
+<td>1990, 2000, 2020, 2022</td>
+</tr>
+<tr class="odd">
+<td>2020, 2022</td>
+<td>1990, 2000, 2010, 2014, 2015</td>
+</tr>
+</tbody>
+</table>
+<p><strong>Notes:</strong> - Within-decade crosswalks (e.g., 2010→2014)
+are not available from NHGIS - Block→ZCTA, Block→PUMA, etc. are only
+available for decennial years (1990, 2000, 2010, 2020) - The package
+automatically uses direct NHGIS crosswalks when available (e.g.,
+<code>get_crosswalk(source_geography = &quot;block&quot;, target_geography = &quot;zcta&quot;, source_year = 2010, target_year = 2020)</code>
+returns a single-step NHGIS crosswalk)</p>
+<h3 id="id_20202022-crosswalks-ctdata">2020→2022 Crosswalks
+(CTData)</h3>
+<p>For 2020 to 2022 transformations, the package uses CT Data
+Collaborative crosswalks for Connecticut (where planning regions
+replaced counties) and identity mappings for other states (where no
+changes occurred).</p>
+<h2 id="api-keys">API Keys</h2>
+<p>NHGIS crosswalks require an IPUMS API key. Get one at <a href="https://account.ipums.org/api_keys">https://account.ipums.org/api_keys</a>
+and add to your <code>.Renviron</code>:</p>
+<div class="sourceCode" id="cb10"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb10-1"><a href="#cb10-1" tabindex="-1"></a>usethis<span class="sc">::</span><span class="fu">edit_r_environ</span>()</span>
+<span id="cb10-2"><a href="#cb10-2" tabindex="-1"></a><span class="co"># Add: IPUMS_API_KEY=your_key_here</span></span></code></pre></div>
+<h2 id="caching">Caching</h2>
+<p>Use the <code>cache</code> parameter to save crosswalks locally for
+ease:</p>
+<div class="sourceCode" id="cb11"><pre class="sourceCode r"><code class="sourceCode r"><span id="cb11-1"><a href="#cb11-1" tabindex="-1"></a>result <span class="ot">&lt;-</span> <span class="fu">get_crosswalk</span>(</span>
+<span id="cb11-2"><a href="#cb11-2" tabindex="-1"></a>  <span class="at">source_geography =</span> <span class="st">&quot;tract&quot;</span>,</span>
+<span id="cb11-3"><a href="#cb11-3" tabindex="-1"></a>  <span class="at">target_geography =</span> <span class="st">&quot;zcta&quot;</span>,</span>
+<span id="cb11-4"><a href="#cb11-4" tabindex="-1"></a>  <span class="at">weight =</span> <span class="st">&quot;population&quot;</span>,</span>
+<span id="cb11-5"><a href="#cb11-5" tabindex="-1"></a>  <span class="at">cache =</span> here<span class="sc">::</span><span class="fu">here</span>(<span class="st">&quot;crosswalks-cache&quot;</span>))</span></code></pre></div>
+<h2 id="citations">Citations</h2>
+<p>The intellectual credit for the underlying crosswalks belongs to the
+original developers.</p>
+<p><strong>For NHGIS</strong>, see citation requirements at: <a href="https://www.nhgis.org/citation-and-use-nhgis-data">https://www.nhgis.org/citation-and-use-nhgis-data</a></p>
+<p><strong>For Geocorr</strong>, a suggested citation:</p>
+<blockquote>
+<p>Missouri Census Data Center, University of Missouri. (2022). Geocorr
+2022: Geographic Correspondence Engine. Retrieved from: <a href="https://mcdc.missouri.edu/applications/geocorr2022.html">https://mcdc.missouri.edu/applications/geocorr2022.html</a></p>
+</blockquote>
+
+</body>
+</html>
diff --git a/README.md b/README.md
index 91bb023..7d6b80c 100644
--- a/README.md
+++ b/README.md
@@ -1,99 +1,349 @@
-# crosswalk
-
-An R interface to inter-geography and inter-temporal crosswalks.
-
-## Overview
-
-This project provides a consistent API and standardized versions of
-crosswalks to allow for programmatic interpolation over time and between
-geographies. Say goodbye to manual crosswalk downloads and hello to
-reproducible workflows.
-
-## Installation
-
-``` r
-# Install dependencies
-renv::install("UI-Research/crosswalks")
-```
-
-## Usage
-
-To get started with `library(crosswalk)`:
-
-``` r
-# Load the package
-library(crosswalk)
-
-## obtain a crosswalk to translate data in 2020-vintage place geographies
-## to 2020-vintage county geographies, weighted by population
-place_county_crosswalk = get_crosswalk(
-  source_geography = "place",
-  target_geography = "county",
-  weight = c("population"),
-  cache = here::here("data"))
-
-## obtain a crosswalk to translate data in 2000-vintage place geographies
-## to 2010-vintage place geographies. all available weighting options are
-## returned
-get_crosswalk(
-  source_year = 2000,
-  target_year = 2010,
-  source_geography = "place",
-  target_geography = "place",
-  cache = here::here("data"))
-```
-
-## Why Use `library(crosswalk)`?
-
-Crosswalks are a critical component of conducting social sciences
-research as they enable analysts to translate data from one geography
-and/or temporal vintage to another. For example, if source data are only
-available at the county level, a crosswalk can help to produce estimates
-of those source data at the city level, enabling analysis at a geography
-(the city) that may be either more relevant to target audience(s) and/or
-may align with the geography of other data that form part of the
-overarching analysis.
-
-There are excellent existing resources for crosswalks, including the
-University of Missouri - Missouri Census Data Center's Geocorr 2022
-crosswalking application and the IPUMS National Historical Geographic
-Information System (NHGIS). In fact, the crosswalks returned by using
-`library(crosswalk)` are those from Geocorr and NHGIS.
-
-So why use this package at all? It provides:
-
--   A consistent, programmatic approach to acquire crosswalks, rather
-    than ad-hoc manual downloads;
-
--   Standardized and clear crosswalk variable names so that you can
-    easily work with multiple crosswalks using the same workflow;
-
--   Crosswalk metadata stored within the returned crosswalk–no more
-    commenting in your script with the 15 options you configured prior
-    to clicking "Download";
-
--   The ability to easily "cache" crosswalks locally.
-
-In brief: this package facilitates a well documented and reproducible
-analysis workflow, building on top of the robust underlying resources
-already available for crosswalking.
-
-## Citations!
-
-The intellectual work and credit for the underlying crosswalks returned
-by this package belongs to the original developers of those crosswalks.
-You should (in the case of Geocorr crosswalks) and in some cases must
-(in the case of NHGIS crosswalks) appropriately cite the developers when
-you use these resources.
-
-**For NHGIS**, you should refer to the NHGIS website and terms of use,
-including the recommended citations provided at:
-<https://www.nhgis.org/citation-and-use-nhgis-data>.
-
-**For Geocorr**, the author of `library(crosswalk)` is unaware of a
-required or suggested citation format. An example citation might look
-like:
-
-> Missouri Census Data Center, University of Missouri. (2022). Geocorr
-> 2022: Geographic Correspondence Engine. Retrieved [202X-XX-XX] from:
-> <https://mcdc.missouri.edu/applications/geocorr2022.html>.
+
+# crosswalk
+
+An R interface to inter-geography and inter-temporal crosswalks.
+
+## Overview
+
+This package provides a consistent API and standardized versions of
+crosswalks to enable consistent approaches that work across different
+geography and year combinations. The package also facilitates
+interpolation–that is, adjusting source geography/year values by their
+crosswalk weights and translating these values to the desired target
+geography/year–including diagnostics of the joins between source data
+and crosswalks.
+
+The package sources crosswalks from:
+
+- **Geocorr 2022** (Missouri Census Data Center) - for same-year
+  crosswalks between geographies
+- **IPUMS NHGIS** - for inter-temporal crosswalks (across different
+  census years)
+- **CT Data Collaborative** - for Connecticut 2020→2022 crosswalks
+  (planning region changes)
+
+## Why Use `crosswalk`?
+
+- **Programmatic access**: No more manual downloads from web interfaces
+- **Standardized output**: Consistent column names across all crosswalk
+  sources
+- **Metadata tracking**: Full provenance stored as attributes
+- **Multi-step handling**: Automatic chaining when both geography and
+  year change
+- **Local caching**: Reproducible workflows with cached crosswalks
+
+## Installation
+
+``` r
+# Install from GitHub
+renv::install("UI-Research/crosswalk")
+```
+
+## Overview
+
+First we obtain a crosswalk and apply it to our data:
+
+``` r
+library(crosswalk)
+library(dplyr)
+library(stringr)
+library(sf)
+
+source_data = tidycensus::get_acs(
+    year = 2023,
+    geography = "zcta",
+    output = "wide",
+    variables = c(below_poverty_level = "B17001_002")) %>%
+  dplyr::select(
+    source_geoid = GEOID,
+    count_below_poverty_level = below_poverty_levelE)
+
+get_crosswalk(
+  source_geography = "block",
+  target_geography = "puma",
+  source_year = 2010,
+  target_year = 2020,
+  weight = "population")
+
+# Get a crosswalk from ZCTAs to PUMAs (same year, uses Geocorr (2022))
+zcta_puma_crosswalk <- get_crosswalk(
+  source_geography = "zcta",
+  target_geography = "puma22",
+  weight = "population")
+
+# Apply the crosswalk to your data
+crosswalked_data <- crosswalk_data(
+  geoid_column = "source_geoid",
+  data = source_data,
+  crosswalk = zcta_puma_crosswalk)
+```
+
+What does the crosswalk(s) reflect and how was it sourced?
+
+``` r
+attr(crosswalked_data, "crosswalk_metadata")
+```
+
+How well did the crosswalk join to our source data?
+
+``` r
+## look at all the characteristics of the join(s) between the source data
+## and the crosswalks
+join_quality = attr(crosswalked_data, "join_quality")
+
+## what share of records in the source data do not join to a crosswalk and
+## thus are dropped during the crosswalking process?
+join_quality$pct_data_unmatched
+
+## zctas aren't nested within states, otherwise join_quality$state_analysis_data 
+## would help us to ID whether non-joining source data were clustered within one
+## or a few states. instead we can join to spatial data to diagnose further:
+zctas_sf = tigris::zctas(year = 2023)
+states_sf = tigris::states(year = 2023, cb = TRUE)
+
+## apart from DC, which has a disproportionate number of non-joining ZCTAs--
+## seemingly corresponding to federal areas and buildings--the distribution of
+## non-joining ZCTAs appears proportionate to state-level populations and is 
+## distributed across many states:
+zctas_sf %>% 
+  dplyr::filter(GEOID20 %in% join_quality$data_geoids_unmatched) %>%
+  sf::st_intersection(states_sf %>% select(NAME)) %>%
+  sf::st_drop_geometry() %>%
+  dplyr::count(NAME, sort = TRUE)
+```
+
+And how accurate was the crosswalking process?
+
+``` r
+comparison_data = tidycensus::get_acs(
+    year = 2023,
+    geography = "puma",
+    output = "wide",
+    variables = c(
+      below_poverty_level = "B17001_002")) %>%
+  dplyr::select(
+    source_geoid = GEOID,
+    count_below_poverty_level_acs = below_poverty_levelE)
+
+combined_data = dplyr::left_join(
+  comparison_data,
+  crosswalked_data,
+  by = c("source_geoid" = "geoid")) 
+  
+combined_data %>%
+  dplyr::select(source_geoid, dplyr::matches("count")) %>%
+  dplyr::mutate(difference_percent = (count_below_poverty_level_acs - count_below_poverty_level) / count_below_poverty_level_acs) %>%
+  ggplot2::ggplot() +
+    ggplot2::geom_histogram(ggplot2::aes(x = difference_percent)) +
+    ggplot2::theme_minimal() +
+    ggplot2::theme(panel.grid = ggplot2::element_blank()) +
+    ggplot2::scale_x_continuous(labels = scales::percent) +
+    ggplot2::labs(
+      title = "Crosswalked data approximates observed values",
+      subtitle = "Block group-level source data would produce more accurate crosswalked values",
+      y = "",
+      x = "Percent difference between observed and crosswalked values")
+```
+
+## Core Functions
+
+The package has two main functions:
+
+| Function           | Purpose                                                             |
+|--------------------|---------------------------------------------------------------------|
+| `get_crosswalk()`  | Fetch crosswalk(s)                                                  |
+| `crosswalk_data()` | Apply crosswalk(s) to interpolate data to the target geography-year |
+
+## Understanding `get_crosswalk()` Output
+
+`get_crosswalk()` **always returns a list** structured as follows:
+
+``` r
+result <- get_crosswalk(
+  source_geography = "tract",
+  target_geography = "zcta",
+  source_year = 2010,
+  target_year = 2020,
+  weight = "population"
+)
+
+names(result)
+#> [1] "crosswalks" "plan" "message"
+```
+
+The list contains three elements:
+
+| Element      | Description                                                                    |
+|--------------|--------------------------------------------------------------------------------|
+| `crosswalks` | A named list of crosswalks (`step_1`, `step_2`, etc.) of length one or greater |
+| `plan`       | Details about what crosswalks are being fetched                                |
+| `message`    | A human-readable description of the crosswalk chain                            |
+
+### Single-Step vs. Multi-Step Crosswalks
+
+**Single-step crosswalks** (same year, different geography OR same
+geography, different year):
+
+``` r
+# Same year, different geography (Geocorr)
+result <- get_crosswalk(
+  source_geography = "tract",
+  target_geography = "zcta",
+  weight = "population"
+)
+# result$crosswalks$step_1 contains one crosswalk
+
+# Same geography, different year (NHGIS)
+result <- get_crosswalk(
+  source_geography = "tract",
+  target_geography = "tract",
+  source_year = 2010,
+  target_year = 2020
+)
+# result$crosswalks$step_1 contains one crosswalk
+```
+
+**Multi-step crosswalks** (different geography AND different year):
+
+When both geography and year change, no single crosswalk source provides
+this directly. The package automatically plans and fetches a two-step
+chain:
+
+1.  **Step 1 (NHGIS)**: Change year, keep geography constant
+2.  **Step 2 (Geocorr)**: Change geography at target year
+
+``` r
+result <- get_crosswalk(
+  source_geography = "tract",
+  target_geography = "zcta",
+  source_year = 2010,
+  target_year = 2020,
+  weight = "population"
+)
+
+# Two crosswalks are returned
+names(result$crosswalks)
+#> [1] "step_1" "step_2"
+
+# Step 1: 2010 tracts -> 2020 tracts (NHGIS)
+# Step 2: 2020 tracts -> 2020 ZCTAs (Geocorr)
+```
+
+### Crosswalk Structure
+
+Each crosswalk contains standardized columns:
+
+| Column                               | Description                                         |
+|--------------------------------------|-----------------------------------------------------|
+| `source_geoid`                       | Identifier for source geography                     |
+| `target_geoid`                       | Identifier for target geography                     |
+| `allocation_factor_source_to_target` | Weight for interpolating values                     |
+| `weighting_factor`                   | What attribute was used (population, housing, land) |
+
+Additional columns may include `source_year`, `target_year`,
+`population_2020`, `housing_2020`, and `land_area_sqmi` depending on the
+source.
+
+### Accessing Metadata
+
+Each crosswalk tibble has a `crosswalk_metadata` attribute that
+documents what the crosswalk represents and how it was created:
+
+``` r
+metadata <- attr(result$crosswalks$step_1, "crosswalk_metadata")
+names(metadata)
+#> [1] "call_parameters" "data_source" "data_source_full_name" "download_url" ...
+```
+
+## Using `crosswalk_data()` to Interpolate Data
+
+`crosswalk_data()` applies crosswalk weights to transform your data. It
+automatically handles multi-step crosswalks.
+
+### Column Naming Convention
+
+The function auto-detects columns based on prefixes:
+
+| Prefix                                   | Treatment                                                          |
+|------------------------------------------|--------------------------------------------------------------------|
+| `count_`                                 | Summed after weighting (for counts like population, housing units) |
+| `mean_`, `median_`, `percent_`, `ratio_` | Weighted mean (for rates, percentages, averages)                   |
+
+You can also specify columns explicitly via `count_columns` and
+`non_count_columns`. All non-count variables are interpolated using
+weighted means, weighting by the allocation factor from the crosswalk.
+
+## Supported Geography and Year Combinations
+
+### Inter-Geography Crosswalks (Geocorr)
+
+2022-vintage crosswalks between any of these geographies:
+
+- block, block group, tract, county
+- place, zcta, puma22
+- cd118, cd119, urban_area, core_based_statistical_area
+
+### Inter-Temporal Crosswalks (NHGIS)
+
+NHGIS provides cross-decade crosswalks with the following structure:
+
+**Source geographies:** block, block_group, tract
+
+**Target geographies:** - From blocks (decennial years only): block,
+block_group, tract, county, place, zcta, puma, urban_area, cbsa - From
+block_group or tract: block_group, tract, county
+
+| Source Years                 | Target Years                 |
+|------------------------------|------------------------------|
+| 1990, 2000                   | 2010, 2014, 2015, 2020, 2022 |
+| 2010, 2011, 2012, 2014, 2015 | 1990, 2000, 2020, 2022       |
+| 2020, 2022                   | 1990, 2000, 2010, 2014, 2015 |
+
+**Notes:** - Within-decade crosswalks (e.g., 2010→2014) are not
+available from NHGIS - Block→ZCTA, Block→PUMA, etc. are only available
+for decennial years (1990, 2000, 2010, 2020) - The package automatically
+uses direct NHGIS crosswalks when available (e.g.,
+`get_crosswalk(source_geography = "block", target_geography = "zcta", source_year = 2010, target_year = 2020)`
+returns a single-step NHGIS crosswalk)
+
+### 2020→2022 Crosswalks (CTData)
+
+For 2020 to 2022 transformations, the package uses CT Data Collaborative
+crosswalks for Connecticut (where planning regions replaced counties)
+and identity mappings for other states (where no changes occurred).
+
+## API Keys
+
+NHGIS crosswalks require an IPUMS API key. Get one at
+<https://account.ipums.org/api_keys> and add to your `.Renviron`:
+
+``` r
+usethis::edit_r_environ()
+# Add: IPUMS_API_KEY=your_key_here
+```
+
+## Caching
+
+Use the `cache` parameter to save crosswalks locally for ease:
+
+``` r
+result <- get_crosswalk(
+  source_geography = "tract",
+  target_geography = "zcta",
+  weight = "population",
+  cache = here::here("crosswalks-cache"))
+```
+
+## Citations
+
+The intellectual credit for the underlying crosswalks belongs to the
+original developers.
+
+**For NHGIS**, see citation requirements at:
+<https://www.nhgis.org/citation-and-use-nhgis-data>
+
+**For Geocorr**, a suggested citation:
+
+> Missouri Census Data Center, University of Missouri. (2022). Geocorr
+> 2022: Geographic Correspondence Engine. Retrieved from:
+> <https://mcdc.missouri.edu/applications/geocorr2022.html>
diff --git a/man/crosswalk_data.Rd b/man/crosswalk_data.Rd
index 1a7646b..d6d0e62 100644
--- a/man/crosswalk_data.Rd
+++ b/man/crosswalk_data.Rd
@@ -2,7 +2,7 @@
 % Please edit documentation in R/crosswalk_data.R
 \name{crosswalk_data}
 \alias{crosswalk_data}
-\title{Apply a Crosswalk to Transform Data}
+\title{Interpolate data using a crosswalk(s)}
 \usage{
 crosswalk_data(
   data,
@@ -10,7 +10,8 @@ crosswalk_data(
   geoid_column = "geoid",
   count_columns = NULL,
   non_count_columns = NULL,
-  return_intermediate = FALSE
+  return_intermediate = FALSE,
+  show_join_quality = TRUE
 )
 }
 \arguments{
@@ -40,6 +41,12 @@ detects columns with prefixes "mean_", "median_", "percent_", or "ratio_".}
 \item{return_intermediate}{Logical. If TRUE and crosswalk has multiple steps,
 returns a list containing both the final result and intermediate results
 from each step. Default is FALSE, which returns only the final result.}
+
+\item{show_join_quality}{Logical. If TRUE (default), prints diagnostic messages
+about join quality, including the number of data rows not matching the crosswalk
+and vice versa. For state-nested geographies (tract, county, block group, etc.),
+also reports state-level concentration of unmatched rows. Set to FALSE to
+suppress these messages.}
 }
 \value{
 If \code{return_intermediate = FALSE} (default), a tibble with data summarized
@@ -75,6 +82,11 @@ both NULL, the function will automatically detect columns based on naming prefix
 as non-count variables
 }
 
+\strong{Other columns}: Columns that are not the geoid column, count columns, or
+non-count columns (e.g., metadata like \code{data_year}) are preserved by taking
+the first non-missing value within each target geography group. If all values
+are missing, NA is returned.
+
 \strong{Multi-step crosswalks}: When \code{get_crosswalk()} returns multiple crosswalks
 (for transformations that change both geography and year), this function
 automatically applies them in sequence.
diff --git a/man/get_crosswalk.Rd b/man/get_crosswalk.Rd
index da12545..b524d39 100644
--- a/man/get_crosswalk.Rd
+++ b/man/get_crosswalk.Rd
@@ -2,7 +2,7 @@
 % Please edit documentation in R/get_crosswalk.R
 \name{get_crosswalk}
 \alias{get_crosswalk}
-\title{Get a Geographic Crosswalk}
+\title{Get a crosswalk(s) to translate data across time and geographies}
 \usage{
 get_crosswalk(
   source_geography,
diff --git a/man/get_crosswalk_chain.Rd b/man/get_crosswalk_chain.Rd
deleted file mode 100644
index d314cc4..0000000
--- a/man/get_crosswalk_chain.Rd
+++ /dev/null
@@ -1,63 +0,0 @@
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/get_crosswalk_chain.R
-\name{get_crosswalk_chain}
-\alias{get_crosswalk_chain}
-\title{Get a Chain of Crosswalks for Multi-Step Transformations}
-\usage{
-get_crosswalk_chain(
-  source_geography,
-  target_geography,
-  source_year = NULL,
-  target_year = NULL,
-  weight = "population",
-  cache = NULL
-)
-}
-\arguments{
-\item{source_geography}{Character. Source geography name.}
-
-\item{target_geography}{Character. Target geography name.}
-
-\item{source_year}{Numeric or NULL. Year of the source geography.}
-
-\item{target_year}{Numeric or NULL. Year of the target geography.}
-
-\item{weight}{Character or NULL. Weighting variable for Geocorr crosswalks.}
-
-\item{cache}{Directory path or NULL. Where to cache crosswalks.}
-}
-\value{
-A list with:
-\describe{
-\item{crosswalks}{A named list of crosswalk tibbles (step_1, step_2, etc.)}
-\item{plan}{The crosswalk plan from plan_crosswalk_chain()}
-\item{message}{A formatted message describing the crosswalk chain}
-}
-}
-\description{
-Retrieves a list of crosswalks needed to transform data from a source
-geography/year to a target geography/year. For multi-step transformations,
-users should apply each crosswalk sequentially using \code{crosswalk_data()}.
-}
-\examples{
-\dontrun{
-# Get crosswalks for 2010 tracts to 2020 ZCTAs (requires two steps)
-chain <- get_crosswalk_chain(
-  source_geography = "tract",
-  target_geography = "zcta",
-  source_year = 2010,
-  target_year = 2020,
-  weight = "population")
-
-# Apply crosswalks sequentially
-data_step1 <- crosswalk_data(
-  data = my_data,
-  crosswalk = chain$crosswalks$step_1,
-  count_columns = "count_population")
-
-data_final <- crosswalk_data(
-  data = data_step1,
-  crosswalk = chain$crosswalks$step_2,
-  count_columns = "count_population")
-}
-}
diff --git a/man/list_nhgis_crosswalks.Rd b/man/list_nhgis_crosswalks.Rd
index f5b1238..fd0d758 100644
--- a/man/list_nhgis_crosswalks.Rd
+++ b/man/list_nhgis_crosswalks.Rd
@@ -2,7 +2,7 @@
 % Please edit documentation in R/get_nhgis_crosswalk.R
 \name{list_nhgis_crosswalks}
 \alias{list_nhgis_crosswalks}
-\title{List Available NHGIS Crosswalks}
+\title{List supported NHGIS crosswalks}
 \usage{
 list_nhgis_crosswalks()
 }
diff --git a/tests/testthat/test-crosswalk_data.R b/tests/testthat/test-crosswalk_data.R
index b6ea1fd..b8e692f 100644
--- a/tests/testthat/test-crosswalk_data.R
+++ b/tests/testthat/test-crosswalk_data.R
@@ -612,3 +612,357 @@ test_that("crosswalk_data handles numeric GEOIDs", {
 
   expect_s3_class(result, "tbl_df")
 })
+
+# ==============================================================================
+# Join quality reporting tests
+# ==============================================================================
+
+test_that("crosswalk_data attaches join_quality attribute", {
+  mock_data <- tibble::tibble(
+    geoid = c("A", "B"),
+    count_population = c(1000, 2000))
+
+  mock_crosswalk <- tibble::tibble(
+    source_geoid = c("A", "B"),
+    target_geoid = c("X", "X"),
+    target_geography_name = c("test", "test"),
+    allocation_factor_source_to_target = c(0.5, 0.5))
+
+  result <- crosswalk_data(
+    data = mock_data,
+    crosswalk = mock_crosswalk,
+    geoid_column = "geoid",
+    count_columns = c("count_population"))
+
+  join_quality <- attr(result, "join_quality")
+  expect_type(join_quality, "list")
+  expect_true("n_data_total" %in% names(join_quality))
+  expect_true("n_data_unmatched" %in% names(join_quality))
+  expect_true("n_crosswalk_total" %in% names(join_quality))
+  expect_true("n_crosswalk_unmatched" %in% names(join_quality))
+})
+
+test_that("crosswalk_data reports correct join quality for perfect match", {
+  mock_data <- tibble::tibble(
+    geoid = c("A", "B"),
+    count_population = c(1000, 2000))
+
+  mock_crosswalk <- tibble::tibble(
+    source_geoid = c("A", "B"),
+    target_geoid = c("X", "X"),
+    target_geography_name = c("test", "test"),
+    allocation_factor_source_to_target = c(0.5, 0.5))
+
+  result <- crosswalk_data(
+    data = mock_data,
+    crosswalk = mock_crosswalk,
+    geoid_column = "geoid",
+    count_columns = c("count_population"))
+
+  join_quality <- attr(result, "join_quality")
+  expect_equal(join_quality$n_data_unmatched, 0)
+  expect_equal(join_quality$n_crosswalk_unmatched, 0)
+  expect_equal(join_quality$pct_data_unmatched, 0)
+})
+
+test_that("crosswalk_data reports unmatched data rows", {
+  mock_data <- tibble::tibble(
+    geoid = c("A", "B", "C"),  # C not in crosswalk
+    count_population = c(1000, 2000, 3000))
+
+  mock_crosswalk <- tibble::tibble(
+    source_geoid = c("A", "B"),
+    target_geoid = c("X", "X"),
+    target_geography_name = c("test", "test"),
+    allocation_factor_source_to_target = c(0.5, 0.5))
+
+  expect_message(
+    result <- crosswalk_data(
+      data = mock_data,
+      crosswalk = mock_crosswalk,
+      geoid_column = "geoid",
+      count_columns = c("count_population")),
+    regexp = "did not match the crosswalk")
+
+  join_quality <- attr(result, "join_quality")
+  expect_equal(join_quality$n_data_unmatched, 1)
+  expect_equal(join_quality$n_data_total, 3)
+  expect_true("C" %in% join_quality$data_geoids_unmatched)
+})
+
+test_that("crosswalk_data reports unmatched crosswalk rows", {
+  mock_data <- tibble::tibble(
+    geoid = c("A"),
+    count_population = c(1000))
+
+  mock_crosswalk <- tibble::tibble(
+    source_geoid = c("A", "B", "C"),  # B and C not in data
+    target_geoid = c("X", "Y", "Z"),
+    target_geography_name = c("test", "test", "test"),
+    allocation_factor_source_to_target = c(1.0, 1.0, 1.0))
+
+  expect_message(
+    result <- crosswalk_data(
+      data = mock_data,
+      crosswalk = mock_crosswalk,
+      geoid_column = "geoid",
+      count_columns = c("count_population")),
+    regexp = "were not in input data")
+
+  join_quality <- attr(result, "join_quality")
+  expect_equal(join_quality$n_crosswalk_unmatched, 2)
+  expect_equal(join_quality$n_crosswalk_total, 3)
+})
+
+test_that("crosswalk_data reports state concentration for unmatched data rows", {
+  # Create data with state-based GEOIDs where unmatched rows are concentrated in NY (36)
+  mock_data <- tibble::tibble(
+    geoid = c("01001", "01002", "36001", "36002", "36003", "36004", "36005"),
+    count_population = c(1000, 2000, 3000, 4000, 5000, 6000, 7000))
+
+  # Crosswalk only has AL (01) counties - include metadata for state analysis
+  mock_crosswalk <- tibble::tibble(
+    source_geoid = c("01001", "01002"),
+    target_geoid = c("X", "X"),
+    target_geography_name = c("test", "test"),
+    allocation_factor_source_to_target = c(0.5, 0.5))
+
+  # Add metadata indicating this is a county crosswalk (state-nested geography)
+  attr(mock_crosswalk, "crosswalk_metadata") <- list(source_geography = "county")
+
+  expect_message(
+    result <- crosswalk_data(
+      data = mock_data,
+      crosswalk = mock_crosswalk,
+      geoid_column = "geoid",
+      count_columns = c("count_population")),
+    regexp = "Top states with unmatched data rows")
+
+  join_quality <- attr(result, "join_quality")
+  expect_equal(join_quality$n_data_unmatched, 5)
+
+  # Check state analysis
+  expect_false(is.null(join_quality$state_analysis_data))
+  expect_true(join_quality$state_analysis_data$is_concentrated)
+
+  # NY should be top state
+  top_state <- join_quality$state_analysis_data$top_states$state_fips[1]
+  expect_equal(top_state, "36")
+})
+
+test_that("crosswalk_data does NOT report state concentration for ZCTAs (non-state-nested)", {
+  # ZCTAs cross state boundaries, so state analysis should not be performed
+  mock_data <- tibble::tibble(
+    geoid = c("01001", "01002", "36001", "36002", "36003"),
+    count_population = c(1000, 2000, 3000, 4000, 5000))
+
+  mock_crosswalk <- tibble::tibble(
+    source_geoid = c("01001", "01002"),
+    target_geoid = c("X", "X"),
+    allocation_factor_source_to_target = c(0.5, 0.5))
+
+  # Add metadata indicating this is a ZCTA crosswalk (NOT state-nested)
+  attr(mock_crosswalk, "crosswalk_metadata") <- list(source_geography = "zcta")
+
+  # Should report unmatched rows but NOT state concentration
+  expect_message(
+    result <- crosswalk_data(
+      data = mock_data,
+      crosswalk = mock_crosswalk,
+      geoid_column = "geoid",
+      count_columns = c("count_population")),
+    regexp = "did not match the crosswalk")
+
+  join_quality <- attr(result, "join_quality")
+  expect_equal(join_quality$n_data_unmatched, 3)
+
+  # State analysis should NOT be performed for ZCTAs
+  expect_true(is.null(join_quality$state_analysis_data))
+  expect_false(join_quality$state_analysis_applicable)
+})
+
+test_that("crosswalk_data show_join_quality=FALSE suppresses messages", {
+  mock_data <- tibble::tibble(
+    geoid = c("A", "B", "C"),  # C not in crosswalk
+    count_population = c(1000, 2000, 3000))
+
+  mock_crosswalk <- tibble::tibble(
+    source_geoid = c("A", "B"),
+    target_geoid = c("X", "X"),
+    allocation_factor_source_to_target = c(0.5, 0.5))
+
+  # With show_join_quality = FALSE, should NOT produce join quality messages
+  # (but will still produce "Applying crosswalk step..." message)
+  expect_no_message(
+    result <- crosswalk_data(
+      data = mock_data,
+      crosswalk = mock_crosswalk,
+      geoid_column = "geoid",
+      count_columns = c("count_population"),
+      show_join_quality = FALSE),
+    message = "did not match")
+
+  # join_quality attribute should be NULL when disabled
+  join_quality <- attr(result, "join_quality")
+  expect_null(join_quality)
+})
+
+test_that("crosswalk_data silent when join is perfect", {
+  mock_data <- tibble::tibble(
+    geoid = c("A", "B"),
+    count_population = c(1000, 2000))
+
+  mock_crosswalk <- tibble::tibble(
+    source_geoid = c("A", "B"),
+    target_geoid = c("X", "X"),
+    target_geography_name = c("test", "test"),
+    allocation_factor_source_to_target = c(0.5, 0.5))
+
+  # Should NOT produce any join quality messages (only "Applying crosswalk step..." message)
+  expect_message(
+    result <- crosswalk_data(
+      data = mock_data,
+      crosswalk = mock_crosswalk,
+      geoid_column = "geoid",
+      count_columns = c("count_population")),
+    regexp = "Applying crosswalk step")
+
+  # And should NOT have "did not match" message
+  expect_no_message(
+    result2 <- crosswalk_data(
+      data = mock_data,
+      crosswalk = mock_crosswalk,
+      geoid_column = "geoid",
+      count_columns = c("count_population")),
+    message = "did not match")
+})
+
+# ==============================================================================
+# Other column handling tests
+# ==============================================================================
+
+test_that("crosswalk_data preserves other columns using first non-missing value", {
+  mock_data <- tibble::tibble(
+    geoid = c("A", "B"),
+    count_population = c(1000, 2000),
+    data_year = c(2018, 2018),
+    vintage = c(2010, 2010))
+
+  mock_crosswalk <- tibble::tibble(
+    source_geoid = c("A", "B"),
+    target_geoid = c("X", "X"),
+    target_geography_name = c("test", "test"),
+    allocation_factor_source_to_target = c(0.5, 0.5))
+
+  result <- crosswalk_data(
+    data = mock_data,
+    crosswalk = mock_crosswalk,
+    geoid_column = "geoid",
+    count_columns = c("count_population"))
+
+  # Other columns should be preserved
+  expect_true("data_year" %in% colnames(result))
+  expect_true("vintage" %in% colnames(result))
+
+  # Should take first value (both are same in this case)
+  expect_equal(result$data_year[1], 2018)
+  expect_equal(result$vintage[1], 2010)
+})
+
+test_that("crosswalk_data takes first non-missing value for other columns", {
+  mock_data <- tibble::tibble(
+    geoid = c("A", "B", "C"),
+    count_population = c(1000, 2000, 3000),
+    data_year = c(NA, 2019, 2019))
+
+  mock_crosswalk <- tibble::tibble(
+    source_geoid = c("A", "B", "C"),
+    target_geoid = c("X", "X", "X"),
+    target_geography_name = c("test", "test", "test"),
+    allocation_factor_source_to_target = c(0.3, 0.3, 0.4))
+
+  result <- crosswalk_data(
+    data = mock_data,
+    crosswalk = mock_crosswalk,
+    geoid_column = "geoid",
+    count_columns = c("count_population"))
+
+  # Should take first non-missing value (2019 from B)
+  expect_equal(result$data_year[1], 2019)
+})
+
+test_that("crosswalk_data returns NA for other columns when all values missing", {
+  mock_data <- tibble::tibble(
+    geoid = c("A", "B"),
+    count_population = c(1000, 2000),
+    data_year = c(NA_integer_, NA_integer_))
+
+  mock_crosswalk <- tibble::tibble(
+    source_geoid = c("A", "B"),
+    target_geoid = c("X", "X"),
+    target_geography_name = c("test", "test"),
+    allocation_factor_source_to_target = c(0.5, 0.5))
+
+  result <- crosswalk_data(
+    data = mock_data,
+    crosswalk = mock_crosswalk,
+    geoid_column = "geoid",
+    count_columns = c("count_population"))
+
+  # Should return NA when all values are missing
+  expect_true(is.na(result$data_year[1]))
+})
+
+test_that("crosswalk_data handles multiple other columns with different types", {
+  mock_data <- tibble::tibble(
+    geoid = c("A", "B"),
+    count_population = c(1000, 2000),
+    data_year = c(2018L, 2018L),
+    source_name = c("Urban", "Urban"),
+    weight_factor = c(1.5, 1.5))
+
+  mock_crosswalk <- tibble::tibble(
+    source_geoid = c("A", "B"),
+    target_geoid = c("X", "X"),
+    target_geography_name = c("test", "test"),
+    allocation_factor_source_to_target = c(0.5, 0.5))
+
+  result <- crosswalk_data(
+    data = mock_data,
+    crosswalk = mock_crosswalk,
+    geoid_column = "geoid",
+    count_columns = c("count_population"))
+
+  # All other columns should be preserved with correct types
+
+  expect_true("data_year" %in% colnames(result))
+  expect_true("source_name" %in% colnames(result))
+  expect_true("weight_factor" %in% colnames(result))
+
+  expect_equal(result$data_year[1], 2018L)
+  expect_equal(result$source_name[1], "Urban")
+  expect_equal(result$weight_factor[1], 1.5)
+})
+
+test_that("crosswalk_data other columns work correctly with one-to-many mapping", {
+  # Source A maps to two targets (X and Y), B maps to X only
+  mock_data <- tibble::tibble(
+    geoid = c("A", "B"),
+    count_population = c(1000, 2000),
+    data_year = c(2020, 2020))
+
+  mock_crosswalk <- tibble::tibble(
+    source_geoid = c("A", "A", "B"),
+    target_geoid = c("X", "Y", "X"),
+    target_geography_name = c("test", "test", "test"),
+    allocation_factor_source_to_target = c(0.6, 0.4, 1.0))
+
+  result <- crosswalk_data(
+    data = mock_data,
+    crosswalk = mock_crosswalk,
+    geoid_column = "geoid",
+    count_columns = c("count_population"))
+
+  # Both target geographies should have data_year preserved
+  expect_true(all(result$data_year == 2020))
+})
diff --git a/tests/testthat/test-plan_crosswalk_chain.R b/tests/testthat/test-plan_crosswalk_chain.R
index bd37ae8..1f0bc8d 100644
--- a/tests/testthat/test-plan_crosswalk_chain.R
+++ b/tests/testthat/test-plan_crosswalk_chain.R
@@ -115,6 +115,98 @@ test_that("plan_crosswalk_chain multi-step works for block_group", {
   expect_equal(plan$intermediate_geography, "block_group")
 })
 
+# ==============================================================================
+# Direct NHGIS crosswalk tests (geography + year change with direct NHGIS crosswalk)
+# ==============================================================================
+
+test_that("plan_crosswalk_chain detects direct NHGIS crosswalk for block to zcta", {
+  # block 2010 -> zcta 2020 is available directly from NHGIS
+  plan <- plan_crosswalk_chain(
+    source_geography = "block",
+    target_geography = "zcta",
+    source_year = 2010,
+    target_year = 2020)
+
+  expect_false(plan$is_multi_step)
+  expect_equal(nrow(plan$steps), 1)
+  expect_equal(plan$steps$crosswalk_source[1], "nhgis")
+  expect_true(stringr::str_detect(plan$steps$description[1], "direct NHGIS"))
+})
+
+test_that("plan_crosswalk_chain detects direct NHGIS crosswalk for block to puma", {
+  # block 2010 -> puma 2020 is available directly from NHGIS
+  plan <- plan_crosswalk_chain(
+    source_geography = "block",
+    target_geography = "puma",
+    source_year = 2010,
+    target_year = 2020)
+
+  expect_false(plan$is_multi_step)
+  expect_equal(nrow(plan$steps), 1)
+  expect_equal(plan$steps$crosswalk_source[1], "nhgis")
+})
+
+test_that("plan_crosswalk_chain detects direct NHGIS crosswalk for block to place", {
+  # block 2010 -> place 2020 is available directly from NHGIS
+  plan <- plan_crosswalk_chain(
+    source_geography = "block",
+    target_geography = "place",
+    source_year = 2010,
+    target_year = 2020)
+
+  expect_false(plan$is_multi_step)
+  expect_equal(plan$steps$crosswalk_source[1], "nhgis")
+})
+
+test_that("plan_crosswalk_chain detects direct NHGIS crosswalk for block to urban_area", {
+  # block 2010 -> urban_area 2020 is available directly from NHGIS
+  plan <- plan_crosswalk_chain(
+    source_geography = "block",
+    target_geography = "urban_area",
+    source_year = 2010,
+    target_year = 2020)
+
+  expect_false(plan$is_multi_step)
+  expect_equal(plan$steps$crosswalk_source[1], "nhgis")
+})
+
+test_that("plan_crosswalk_chain detects direct NHGIS crosswalk for block to cbsa", {
+  # block 2010 -> cbsa 2020 is available directly from NHGIS
+  plan <- plan_crosswalk_chain(
+    source_geography = "block",
+    target_geography = "cbsa",
+    source_year = 2010,
+    target_year = 2020)
+
+  expect_false(plan$is_multi_step)
+  expect_equal(plan$steps$crosswalk_source[1], "nhgis")
+})
+
+test_that("plan_crosswalk_chain still uses multi-step for tract to zcta (no direct NHGIS)", {
+  # tract 2010 -> zcta 2020 is NOT available directly from NHGIS
+  # So it should still require multi-step
+  plan <- plan_crosswalk_chain(
+    source_geography = "tract",
+    target_geography = "zcta",
+    source_year = 2010,
+    target_year = 2020)
+
+  expect_true(plan$is_multi_step)
+  expect_equal(nrow(plan$steps), 2)
+})
+
+test_that("plan_crosswalk_chain detects direct NHGIS crosswalk for reverse direction", {
+  # block 2020 -> zcta 2010 is available directly from NHGIS
+  plan <- plan_crosswalk_chain(
+    source_geography = "block",
+    target_geography = "zcta",
+    source_year = 2020,
+    target_year = 2010)
+
+  expect_false(plan$is_multi_step)
+  expect_equal(plan$steps$crosswalk_source[1], "nhgis")
+})
+
 # ==============================================================================
 # Error handling tests
 # ==============================================================================
diff --git a/vignettes/standardizing-longitudinal-data.Rmd b/vignettes/standardizing-longitudinal-data.Rmd
new file mode 100644
index 0000000..af715eb
--- /dev/null
+++ b/vignettes/standardizing-longitudinal-data.Rmd
@@ -0,0 +1,176 @@
+---
+title: "Standardizing Longitudinal Data"
+output: rmarkdown::html_vignette
+vignette: >
+  %\VignetteIndexEntry{Standardizing Longitudinal Data}
+  %\VignetteEngine{knitr::rmarkdown}
+  %\VignetteEncoding{UTF-8}
+---
+
+```{r, include = FALSE}
+knitr::opts_chunk$set(
+  collapse = TRUE,
+  comment = "#>",
+  eval = FALSE
+)
+```
+
+## Overview
+
+A common challenge with longitudinal tract-level data is that census tract boundaries
+change between decennial censuses. Data from before 2020 typically uses 2010 tract
+definitions, while more recent data uses 2020 tract definitions. To analyze trends
+over time, you need to standardize all years to a consistent tract vintage.
+
+This vignette demonstrates how the `crosswalk` package efficiently handles this task
+using the Urban Institute's [HMDA Neighborhood Summary Files](https://datacatalog.urban.org/dataset/home-mortgage-disclosure-act-neighborhood-summary-files-census-tract-level),
+which provide tract-level mortgage lending data from 2018-2023. The 2018-2021 files
+use 2010 tract definitions, while 2022-2023 use 2020 tract definitions.
+
+## Setup
+
+```{r setup, message=FALSE}
+library(crosswalk)
+library(dplyr)
+library(purrr)
+library(readr)
+library(tibble)
+library(stringr)
+```
+
+## Step 1: Download the Data
+
+The Urban Institute publishes annual HMDA tract-level summary files. Let's download
+all six years (2018-2023):
+
+```{r download-data}
+## metadata object describing data year/vintage/url
+metadata = tibble::tribble(
+  ~ year, ~ vintage, ~ url,
+  2018, 2010, "https://urban-data-catalog.s3.amazonaws.com/drupal-root-live/2023/12/20/hmda_tract_2018.csv",
+  2019, 2010, "https://urban-data-catalog.s3.amazonaws.com/drupal-root-live/2023/12/20/hmda_tract_2019.csv",
+  2020, 2010, "https://urban-data-catalog.s3.amazonaws.com/drupal-root-live/2023/12/20/hmda_tract_2020.csv",
+  2021, 2010, "https://urban-data-catalog.s3.amazonaws.com/drupal-root-live/2023/12/20/hmda_tract_2021.csv",
+  2022, 2020, "https://urban-data-catalog.s3.amazonaws.com/drupal-root-live/2023/12/20/hmda_tract_2022.csv",
+  2023, 2020, "https://urban-data-catalog.s3.amazonaws.com/drupal-root-live/2024/12/17/hmda_tract_2023.csv")
+
+## iterate of the metadata object and read in data for each year
+hmda_data <- pmap(metadata, function(url, year, vintage) {
+  read_csv(url, show_col_types = FALSE) |>
+    mutate(
+      vintage = vintage,
+      data_year = as.integer(year)) })
+
+names(hmda_data) = metadata$year %>% as.character()
+```
+
+Let's inspect the structure of the data:
+
+```{r inspect-data}
+# Check dimensions of each year
+map(hmda_data, dim)
+
+# View the 2018 data (2010 tract vintage)
+glimpse(hmda_data[["2018"]])
+```
+
+## Step 2: Prepare Data for Crosswalking
+
+The HMDA data includes a `tractid` column that contains the 11-digit tract GEOID.
+Let's prepare a subset of variables for crosswalking. We'll focus on count variables
+(total applications by race/ethnicity) which require the `count_` prefix for automatic
+detection by `crosswalk_data()`:
+
+```{r prepare-data}
+prepare_hmda <- function(data) {
+  data |>
+    rename_with(.cols = matches("^geo20"), .fn = ~ "source_geoid") |>
+    select(
+      source_geoid,
+      vintage,
+      data_year,
+      # Count variables: rename with count_ prefix for automatic detection
+      count_race_white_purchase = race_white_purchase,
+      count_owner_purchase_originations = owner_purchase_originations,
+      median_owner_loan_amount = owner_loan_amount_median
+    ) |>
+    mutate(source_geoid = as.character(source_geoid))
+}
+
+hmda_prepared <- map(hmda_data, prepare_hmda)
+```
+
+## Step 3: Obtain the 2010→2020 Tract Crosswalk
+
+Here's where `crosswalk` shines. We need a single crosswalk to convert all 2010-vintage
+tracts to 2020-vintage tracts. The package fetches this from IPUMS NHGIS:
+
+```{r get-crosswalk}
+tract_crosswalk <- get_crosswalk(
+  source_geography = "tract",
+  target_geography = "tract",
+  source_year = 2010,
+  target_year = 2020,
+  weight = "population")
+
+# View the crosswalk plan
+tract_crosswalk$message
+```
+
+The crosswalk contains allocation factors that specify how to distribute values from
+2010 tracts to 2020 tracts. When tract boundaries changed, a single 2010 tract may
+map to multiple 2020 tracts (or vice versa), with allocation factors summing to 1.
+
+## Step 4: Apply the Crosswalk to 2018-2021 Data
+
+Now we apply the crosswalk to the four years of data that use 2010 tract definitions:
+
+```{r apply-crosswalk}
+# Years that need crosswalking (2010 vintage)
+years_to_crosswalk <- c("2018", "2019", "2020", "2021")
+
+# Apply crosswalk to each year
+hmda_crosswalked <- map_if(
+  .x = hmda_prepared, 
+  .p = names(hmda_prepared) %in% years_to_crosswalk,
+  .f = ~ crosswalk_data(
+      data = .x,
+      crosswalk = tract_crosswalk,
+      geoid_column = "source_geoid",
+      show_join_quality = TRUE)) 
+
+## how many source records are we unable to crosswalk each year?
+## excluding those with an "X"--which is used to retain valid observations
+## that, in the source data, do not have a valid tract identifier--very few:
+hmda_crosswalked |>
+  map(~ 
+    .x |> 
+    attr("join_quality") |> 
+    pluck("data_geoids_unmatched") %>%
+    .[!str_detect(., "X")] |>
+    length())
+
+hmda_combined <- bind_rows(hmda_crosswalked) |>
+  ## data for years that are crosswalked have slightly different/additional columsn
+  mutate(
+    geoid = if_else(is.na(geoid), source_geoid, geoid)) |>
+  select(-c(geography_name, source_geoid, vintage)) |>
+  arrange(geoid, data_year)
+
+# View the result
+glimpse(hmda_combined)
+```
+
+
+## Result: A Panel Dataset in 2020 Tract Definitions
+We now have a single dataframe with all six years of HMDA data standardized to 2020
+tract definitions:
+
+```{r final-summary}
+# Unique tracts per year
+hmda_combined |>
+  group_by(data_year) |>
+  summarize(n_tracts = n_distinct(geoid))
+```
+
+