diff --git a/DESCRIPTION b/DESCRIPTION index 179d915..15229ab 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -68,4 +68,3 @@ Suggests: tidyverse Config/testthat/edition: 3 VignetteBuilder: knitr -Config/testthat/edition: 3 diff --git a/NAMESPACE b/NAMESPACE index 0e38021..2103982 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -13,6 +13,7 @@ export(get_government_finances) export(get_hazard_mitigation_assistance) export(get_ihp_registrations) export(get_lodes) +export(get_naics_codes) export(get_nfip_claims) export(get_nfip_policies) export(get_preliminary_damage_assessments) diff --git a/R/get_business_patterns.R b/R/get_business_patterns.R index 8988c49..a1911c1 100644 --- a/R/get_business_patterns.R +++ b/R/get_business_patterns.R @@ -1,5 +1,61 @@ #' @importFrom magrittr %>% +#' @title Get NAICS Codes for County Business Patterns +#' +#' @description A utility function to programmatically identify and select NAICS +#' codes for use with `get_business_patterns()`. This is a wrapper around +#' `censusapi::listCensusMetadata(name = "cbp")`. +#' +#' @param year The vintage year for NAICS codes. Data are available from 1986 through 2023. +#' Default is 2022. +#' @param digits The number of digits for desired NAICS codes. Must be between 2 and 6. +#' Default is 3. Two-digit codes represent broad industry sectors (20 codes), +#' while six-digit codes represent detailed industries. +#' +#' @return A tibble with the following columns: +#' \describe{ +#' \item{naics_code}{The NAICS code (character)} +#' \item{naics_label}{The descriptive label for the NAICS code} +#' \item{year}{The vintage year of the NAICS codes} +#' } +#' +#' @export +#' +#' @examples +#' \dontrun{ +#' # Get all 2-digit NAICS codes +#' get_naics_codes(year = 2022, digits = 2) +#' +#' # Get all 3-digit NAICS codes (default) +#' get_naics_codes(year = 2022) +#' +#' # Get 4-digit NAICS codes for a specific year +#' get_naics_codes(year = 2020, digits = 4) +#' } +get_naics_codes <- function(year = 2022, digits = 3) { + if (year < 1986) { stop("Year must be 1986 or later.") } + if (year > 2023) { stop("Most recent year for data is 2023.") } + if (!digits %in% 2:6) { stop("`digits` must be between 2 and 6.") } + + censusapi::listCensusMetadata( + name = "cbp", + vintage = as.character(year), + type = "variables", + include_values = TRUE) |> + dplyr::filter( + !is.na(values_code), + !stringr::str_starts(values_code, "92|95"), + nchar(values_code) == digits) |> + dplyr::transmute( + naics_code = values_code, + naics_label = values_label, + year = year) |> + dplyr::distinct() |> + dplyr::arrange(naics_code) |> + tibble::as_tibble() + +} + #' @title Obtain County Business Patterns (CBP) Estimates per County #' #' @param year The vintage of CBP data desired. 
Data are available from 1986, @@ -234,4 +290,4 @@ get_business_patterns = function(year = 2022, geo = "county", naics_code_digits utils::globalVariables( c("EMP", "EMPSZES", "ESTAB", "NAICS2017_LABEL", "PAYANN", "annual_payroll", "employee_size_range", "employee_size_range_code", "employee_size_range_label", - "employees", "employers", "industry", "values_code", "naics_code")) + "employees", "employers", "industry", "values_code", "values_label", "naics_code")) diff --git a/man/cache_it.Rd b/man/cache_it.Rd index bd2b832..d683ece 100644 --- a/man/cache_it.Rd +++ b/man/cache_it.Rd @@ -4,7 +4,7 @@ \alias{cache_it} \title{Cache an object to a parquet file; optionally read from disk} \usage{ -cache_it(object, file_name = NULL, path = "/data", read = TRUE) +cache_it(object, file_name = NULL, path = ".", read = TRUE, keep_n = 5) } \arguments{ \item{object}{A dataframe, tibble, or sf object to cache. Can be provided as @@ -12,11 +12,13 @@ either a quoted or unquoted name. Optional when reading from cache - in this case, file_name must be provided.} \item{file_name}{File name (without extension). Optional when object is provided -(uses object's name). Required when object is missing and reading from cache.} +(uses object's name). Required when object is missing and reading from cache. +Must not contain path separators or invalid filename characters.} -\item{path}{Directory path where the file should be saved/read. Defaults to /data. -If the path does not exist, the user will be prompted to create it (in -interactive sessions) or an error will be thrown (in non-interactive sessions).} +\item{path}{Directory path where the file should be saved/read. Defaults to +current directory ("."). If the path does not exist, the user will be prompted +to create it (in interactive sessions) or an error will be thrown (in +non-interactive sessions).} \item{read}{Logical or character. TRUE by default. \itemize{ @@ -25,6 +27,10 @@ interactive sessions) or an error will be thrown (in non-interactive sessions).} \item Character: Read the specific file with this exact filename (including extension). Defaults to TRUE. }} + +\item{keep_n}{Integer. Maximum number of cached versions to keep. When writing +a new file, older versions beyond this limit are deleted. Defaults to 5. +Set to NULL or Inf to keep all versions.} } \value{ The object that was cached (either written or read) @@ -38,16 +44,16 @@ reading/writing and adds "_sf" to the filename to indicate the file format. 
\examples{ \dontrun{ ## Note: datestamps in filenames are illustrative; user results will -## vary depending on the the date at runtime +## vary depending on the date at runtime # Regular data frames my_data <- tibble(x = 1:10, y = letters[1:10]) -# Cache with automatic naming and datestamp -cache_it(my_data) # Creates: my_data_2025_12_07.parquet +# Cache with automatic naming and datestamp (writes to current directory) +cache_it(my_data) # Creates: ./my_data_2025_12_07.parquet -# Cache with custom filename -cache_it(my_data, file_name = "custom_name") +# Cache with custom filename and path +cache_it(my_data, file_name = "custom_name", path = "data") # Read most recent cached version if exists, otherwise write cached_data <- cache_it(my_data, read = TRUE) @@ -70,9 +76,15 @@ my_data <- cache_it(my_data, read = TRUE) # Read specific file when object doesn't exist old_data <- cache_it(read = "my_data_2025_12_01.parquet") +# Keep only the 3 most recent cached versions +cache_it(my_data, keep_n = 3) + +# Keep all cached versions (no cleanup) +cache_it(my_data, keep_n = NULL) + # SF objects (automatically uses sfarrow) my_sf <- sf::st_read(system.file("shape/nc.shp", package="sf")) -cache_it(my_sf) # Creates: my_sf_2025_12_07_sf.parquet +cache_it(my_sf) # Creates: ./my_sf_2025_12_07_sf.parquet # Read most recent sf cached file cached_sf <- cache_it(my_sf, read = TRUE) diff --git a/man/convert_delimited_to_parquet.Rd b/man/convert_delimited_to_parquet.Rd index 1579c71..519f8a4 100644 --- a/man/convert_delimited_to_parquet.Rd +++ b/man/convert_delimited_to_parquet.Rd @@ -24,7 +24,7 @@ convert_delimited_to_parquet( \item{dataset}{NULL by default. Alternately, one of c("nfip_policies", "ihp_registrations"). If not null, this will be used to select the columns that are returned.} } \value{ -Nothing. Parquet data are written to local path. +NULL (invisibly). This function is called for its side effect of writing a parquet file to disk at the specified \code{outpath} (or a path derived from \code{inpath} with a .parquet extension). The function reads the input file in chunks to handle large files efficiently, optionally subsets to specified columns, and writes the result in Apache Parquet format using \code{arrow::write_parquet()}. } \description{ Convert raw data to parquet to conserve memory / speed subsequent operations diff --git a/man/convert_table_text_to_dataframe.Rd b/man/convert_table_text_to_dataframe.Rd index 9cba577..6f8c0c3 100644 --- a/man/convert_table_text_to_dataframe.Rd +++ b/man/convert_table_text_to_dataframe.Rd @@ -30,7 +30,13 @@ convert_table_text_to_dataframe( \item{required}{Boolean; default is FALSE. If TRUE, the LLM will be instructed to return values for all columns. If FALSE, \code{NULL} values are allowed. Generally, NULL values should be allowed unless you are certain that every value in the inputted text-table has a non-NULL value.} } \value{ -A list of dataframes, with each item corresponding to one page of the inputted text. The dataframes have the same column names and types as specified in \code{column_types}. Use \code{purrr::bind_rows()} to consolidate results into a single dataframe, if needed. +A list of tibbles, where each list element corresponds to one item (typically one page) in the input \code{text} vector/list. Each tibble contains: +\describe{ +\item{Structure}{Columns match the names and types defined in \code{column_types}. 
Each row represents one record extracted from the table text by the LLM.} \item{NULL values}{When \code{required = FALSE} (default), columns may contain NULL/NA values if the LLM could not extract a value for that cell.} \item{Empty dataframes}{If the LLM encounters an error processing a page, that list element will be an empty \code{data.frame()}.} } Use \code{purrr::list_rbind()} or \code{dplyr::bind_rows()} to consolidate results into a single dataframe. A warning is issued reminding users to review AI-generated results for accuracy. } \description{ It is common to encounter valuable tabular data that is stored in a file type that does not codify tabular data as such, e.g., a table in a PDF or .docx file. This function uses a user-specified LLM (from OpenAI or Anthropic) to convert the text of a table into a dataframe. Note that users must have an API key with credits for the specified LLM. For a typical full-page PDF table, the LLM costs are roughly $.02-.05 USD per page. } diff --git a/man/estimate_units_per_parcel.Rd b/man/estimate_units_per_parcel.Rd index 188faaa..9634ed1 100644 --- a/man/estimate_units_per_parcel.Rd +++ b/man/estimate_units_per_parcel.Rd @@ -16,7 +16,19 @@ estimate_units_per_parcel(structures, parcels, zoning, acs = NULL) \item{acs}{Optionally, a non-spatial dataset, at the tract level, returned from \code{urbnindicators::compile_acs_data()}.} } \value{ -The inputted parcels datasets with attributes describing estimated unit counts by unit type. +An \code{sf} object (point geometry, representing parcel centroids) containing the input parcel data augmented with estimated residential unit information. The returned object includes: +\describe{ +\item{parcel_id}{Character or numeric. The unique parcel identifier from the input data.} +\item{tract_geoid}{Character. The 11-digit Census tract GEOID containing the parcel centroid.} +\item{jurisdiction}{Character. The jurisdiction name associated with the parcel.} +\item{municipality_name}{Character. The municipality name associated with the parcel.} +\item{residential_unit_count}{Numeric. The estimated number of residential units on the parcel, benchmarked against ACS estimates at the tract level.} +\item{residential_unit_categories}{Factor (ordered). Categorical classification of unit counts: "0", "1", "2", "3-4", "5-9", "10-19", "20-49", "50+".} +\item{median_value_improvement_sf}{Numeric. Tract-level median improvement value for single-family parcels.} +\item{median_value_improvement_mh}{Numeric. Tract-level median improvement value for manufactured home parcels.} +\item{acs_units_*}{Numeric. ACS-reported housing unit counts by units-in-structure category for the tract.} +\item{zone, zoned_housing_type, far, setback_*, height_maximum, ...}{Various zoning attributes joined from the zoning dataset.} +} } \description{ Estimate the number and types of structures per parcel } diff --git a/man/get_box_path.Rd b/man/get_box_path.Rd index 03f338e..8673df7 100644 --- a/man/get_box_path.Rd +++ b/man/get_box_path.Rd @@ -7,7 +7,10 @@ get_box_path() } \value{ -The filepath to the C&C Box folder +A character string containing the full file path to the Climate and Communities (C&C) Box folder. +On Windows, returns "C:/Users/{username}/Box/METRO Climate and Communities Practice Area/github-repository". +On Mac, checks for Box at "/Users/{username}/Box" or "/Users/{username}/Library/CloudStorage/Box-Box", +using whichever exists. Throws an error if the Box folder cannot be found. 
} \description{ Get the path to the C&C Box folder diff --git a/man/get_dataset_columns.Rd b/man/get_dataset_columns.Rd index 509120f..cc3b894 100644 --- a/man/get_dataset_columns.Rd +++ b/man/get_dataset_columns.Rd @@ -10,7 +10,7 @@ get_dataset_columns(dataset) \item{dataset}{The name of the dataset. One of c('nfip_policies', 'ihp_registrations').} } \value{ -A vector of raw column names to be selected from the specified dataset +A character vector containing the raw column names (in camelCase format as they appear in the source data) to be selected when reading the specified dataset. The columns returned are curated subsets of the full dataset columns, excluding administrative/metadata fields. For "nfip_policies": 20 columns including location, policy details, and building characteristics. For "ihp_registrations": ~20 columns including disaster info, geographic identifiers, and assistance amounts. } \description{ Get the raw column names for a specified dataset diff --git a/man/get_emergency_management_performance.Rd b/man/get_emergency_management_performance.Rd index b2aaae7..a1739b3 100644 --- a/man/get_emergency_management_performance.Rd +++ b/man/get_emergency_management_performance.Rd @@ -17,7 +17,15 @@ get_emergency_management_performance( \item{api}{Logical indicating whether to use the OpenFEMA API to retrieve the data. Default is TRUE.} } \value{ -A data frame containing emergency management performance grant (EMPG) data. +A tibble containing Emergency Management Performance Grant (EMPG) data with the following columns: +\describe{ +\item{state_name}{Character. The name of the state receiving the grant (renamed from original "state" column).} +\item{year_project_start}{Numeric. The year the project started, with corrections applied for known data entry errors in the source data.} +\item{state_code}{Character. Two-digit FIPS state code.} +\item{state_abbreviation}{Character. Two-letter USPS state abbreviation.} +\item{...}{Additional columns from the OpenFEMA EMPG dataset, cleaned via \code{janitor::clean_names()}.} +} +Data are filtered to records with \code{year_project_start > 2012}. A warning is issued noting data completeness concerns for 2024-2025. } \description{ Get EMPG data diff --git a/man/get_geography_metadata.Rd b/man/get_geography_metadata.Rd index 165f524..cfd576c 100644 --- a/man/get_geography_metadata.Rd +++ b/man/get_geography_metadata.Rd @@ -12,7 +12,12 @@ get_geography_metadata(geography_type = c("state", "county"), year = 2023) \item{year}{The year for which to obtain state/county metadata. Cannot be greater than the most recent year supported by \code{library(tidycensus)} for the 5-year ACS.} } \value{ -A data frame containing metadata about the specified geography type and area. +A tibble containing geographic metadata. The structure varies by \code{geography_type}: +\describe{ +\item{For "county"}{Returns county-level data with columns: \code{state_code} (2-digit FIPS), \code{state_name}, \code{state_abbreviation} (2-letter USPS), \code{state_population}, \code{county_code} (5-digit FIPS), \code{county_name}, \code{county_population}.} +\item{For "state"}{Returns state-level data with columns: \code{state_abbreviation}, \code{state_code}, \code{state_name} (one row per state, no county information).} +} +Population data are sourced from the ACS 5-year estimates for the specified \code{year}. 
} \description{ Get geography metadata about states or counties diff --git a/man/get_government_finances.Rd b/man/get_government_finances.Rd index a7fc04b..0dca1be 100644 --- a/man/get_government_finances.Rd +++ b/man/get_government_finances.Rd @@ -10,7 +10,20 @@ get_government_finances(year = 2022) \item{year}{A four-digit year. The default is 2022.} } \value{ -A dataframe containing government unit-level expenses for the specified year. +A tibble containing government unit-level financial data aggregated by unit, with the following columns: +\describe{ +\item{unit_id}{Character. Unique identifier for the government unit.} +\item{year_data}{Numeric. The year of the financial data.} +\item{amount_thousands}{Numeric. Total expenditure amount in thousands of dollars.} +\item{government_type}{Character. Type of government unit: "State", "County", "City", "Township", "Special District", or "School District/Educational Service Agency".} +\item{data_quality}{Numeric. Proportion of records that were reported (vs. imputed or from alternative sources), ranging from 0 to 1.} +\item{unit_name}{Character. Name of the government unit.} +\item{county_name}{Character. County name where the unit is located.} +\item{state_code}{Character. Two-digit state FIPS code.} +\item{population}{Numeric. Population served by the government unit.} +\item{enrollment}{Numeric. Student enrollment (for school districts; NA for other unit types).} +\item{amount_per_capita}{Numeric. Expenditure per capita (or per enrolled student for school districts).} +} } \description{ Get government unit-level expenses from the Census of Governments diff --git a/man/get_ihp_registrations.Rd b/man/get_ihp_registrations.Rd index a523847..0fd2b40 100644 --- a/man/get_ihp_registrations.Rd +++ b/man/get_ihp_registrations.Rd @@ -21,7 +21,19 @@ get_ihp_registrations( \item{outpath}{The path to save the parquet-formatted datafile. Applicable only when \code{api = FALSE}.} } \value{ -A dataframe comprising IHP registrations +A tibble containing Individual and Households Program (IHP) registration data at the household level, joined to county-level geography. Due to ZIP-to-county crosswalking, records may be duplicated across counties (see warning). The returned object includes: +\describe{ +\item{unique_id}{Character. A UUID uniquely identifying each original IHP registration.} +\item{allocation_factor_zcta_to_county}{Numeric. The proportion of the ZCTA's population in this county (0-1). Used to apportion registrations when a ZIP spans multiple counties.} +\item{geoid_county}{Character. Five-digit FIPS county code.} +\item{zcta_code}{Character. Five-digit ZCTA (ZIP Code Tabulation Area) code.} +\item{geoid_tract}{Character. 11-digit Census tract GEOID (may have missingness).} +\item{geoid_block_group}{Character. 12-digit Census block group GEOID (may have missingness).} +\item{disaster_number}{Character. FEMA disaster number associated with the registration.} +\item{amount_individual_housing_program, amount_housing_assistance, amount_other_needs_assistance, amount_rental_assistance, amount_repairs, amount_replacement, amount_personal_property}{Numeric. Various IHP assistance amounts in dollars.} +\item{amount_flood_insurance_premium_paid_by_fema}{Numeric. Flood insurance premium paid by FEMA in dollars.} +\item{state_name, state_abbreviation, state_code}{Character. 
State identifiers.} +} } \description{ Get Individuals and Households Program (IHP) registrations diff --git a/man/get_naics_codes.Rd b/man/get_naics_codes.Rd new file mode 100644 index 0000000..839c4d0 --- /dev/null +++ b/man/get_naics_codes.Rd @@ -0,0 +1,41 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/get_business_patterns.R +\name{get_naics_codes} +\alias{get_naics_codes} +\title{Get NAICS Codes for County Business Patterns} +\usage{ +get_naics_codes(year = 2022, digits = 3) +} +\arguments{ +\item{year}{The vintage year for NAICS codes. Data are available from 1986 through 2023. +Default is 2022.} + +\item{digits}{The number of digits for desired NAICS codes. Must be between 2 and 6. +Default is 3. Two-digit codes represent broad industry sectors (20 codes), +while six-digit codes represent detailed industries.} +} +\value{ +A tibble with the following columns: +\describe{ +\item{naics_code}{The NAICS code (character)} +\item{naics_label}{The descriptive label for the NAICS code} +\item{year}{The vintage year of the NAICS codes} +} +} +\description{ +A utility function to programmatically identify and select NAICS +codes for use with \code{get_business_patterns()}. This is a wrapper around +\code{censusapi::listCensusMetadata(name = "cbp")}. +} +\examples{ +\dontrun{ +# Get all 2-digit NAICS codes +get_naics_codes(year = 2022, digits = 2) + +# Get all 3-digit NAICS codes (default) +get_naics_codes(year = 2022) + +# Get 4-digit NAICS codes for a specific year +get_naics_codes(year = 2020, digits = 4) +} +} diff --git a/man/get_preliminary_damage_assessments.Rd b/man/get_preliminary_damage_assessments.Rd index 0dbb189..a165dde 100644 --- a/man/get_preliminary_damage_assessments.Rd +++ b/man/get_preliminary_damage_assessments.Rd @@ -20,7 +20,24 @@ get_preliminary_damage_assessments( \item{use_cache}{Boolean. Read the existing dataset stored at \code{file_path}? If FALSE, data will be generated anew. Else, if a file exists at \code{file_path}, this file will be returned.} } \value{ -A dataframe of preliminary damage assessment reports. +A tibble containing parsed data from FEMA Preliminary Damage Assessment (PDA) PDF reports. Each row represents one disaster declaration request. The returned object includes: +\describe{ +\item{disaster_number}{Character. Four-digit FEMA disaster number.} +\item{event_type}{Character. Declaration outcome: "approved", "denial", "appeal_approved", or "appeal_denial".} +\item{event_title}{Character. Title/description of the disaster event from the PDA.} +\item{event_date_determined}{Date. Date the declaration determination was made.} +\item{event_native_flag}{Integer. 1 if this is a tribal declaration, 0 otherwise.} +\item{ia_requested}{Integer. 1 if Individual Assistance was requested, 0 otherwise.} +\item{ia_residences_impacted, ia_residences_destroyed, ia_residences_major_damage, ia_residences_minor_damage, ia_residences_affected}{Numeric. Counts of affected residences by damage category.} +\item{ia_residences_insured_total_percent, ia_residences_insured_flood_percent}{Numeric. Insurance coverage percentages.} +\item{ia_cost_estimate_total}{Numeric. Total estimated Individual Assistance cost.} +\item{pa_requested}{Integer. 1 if Public Assistance was requested, 0 otherwise.} +\item{pa_cost_estimate_total}{Numeric. Total estimated Public Assistance cost.} +\item{pa_per_capita_impact_statewide, pa_per_capita_impact_countywide_max, pa_per_capita_impact_countywide_min}{Numeric. 
Per capita impact metrics.} +\item{pa_per_capita_impact_indicator_statewide, pa_per_capita_impact_indicator_countywide}{Character. Per capita impact indicator values.} +\item{text}{Character. Full extracted text from the PDA for reference.} +} +Note: Due to the unstructured nature of PDF source documents, some extracted values may be inaccurate and should be verified. } \description{ These data reflect extracted attributes from PDF preliminary damage assessments diff --git a/man/get_sba_loans.Rd b/man/get_sba_loans.Rd index e70fb7c..08b782c 100644 --- a/man/get_sba_loans.Rd +++ b/man/get_sba_loans.Rd @@ -7,7 +7,20 @@ get_sba_loans() } \value{ -A dataframe comprising city- and zip-level data on SBA loanmaking +A tibble containing SBA disaster loan data at the city/zip level, combining both home and business loan records. The returned object includes: +\describe{ +\item{disaster_number_fema}{Character. The FEMA disaster number associated with the loan.} +\item{disaster_number_sba_physical}{Character. SBA physical disaster declaration number.} +\item{disaster_number_sba_eidl}{Character. SBA Economic Injury Disaster Loan (EIDL) declaration number.} +\item{damaged_property_zip_code}{Character. ZIP code of the damaged property.} +\item{damaged_property_city_name}{Character. City name of the damaged property.} +\item{damaged_property_state_code}{Character. State code of the damaged property.} +\item{verified_loss_total}{Numeric. Total verified loss amount in dollars.} +\item{approved_amount_total}{Numeric. Total approved loan amount in dollars.} +\item{approved_amount_real_estate}{Numeric. Approved loan amount for real estate in dollars.} +\item{fiscal_year}{Character. Fiscal year of the loan (format: "20XX").} +\item{loan_type}{Character. Either "business" or "residential" indicating the loan category.} +} } \description{ Access SBA data on disaster loans diff --git a/man/get_sheldus.Rd b/man/get_sheldus.Rd index dddfdc9..7db61f0 100644 --- a/man/get_sheldus.Rd +++ b/man/get_sheldus.Rd @@ -14,7 +14,22 @@ get_sheldus( \item{file_path}{The path to the raw SHELDUS data.} } \value{ -A dataframe comprising hazard x month x year x county observations of hazard events. +A tibble containing SHELDUS (Spatial Hazard Events and Losses Database for the United States) data at the county-year-month-hazard level. The returned object includes: +\describe{ +\item{unique_id}{Character. A UUID uniquely identifying each observation.} +\item{GEOID}{Character. Five-digit FIPS county code. Connecticut counties are crosswalked to 2022 planning regions using population-weighted allocation factors.} +\item{state_name}{Character. State name (sentence case).} +\item{county_name}{Character. County name.} +\item{year}{Numeric. Year of the hazard event.} +\item{month}{Numeric. Month of the hazard event (1-12).} +\item{hazard}{Character. Type of hazard event.} +\item{damage_property}{Numeric. Property damage in 2023 inflation-adjusted dollars.} +\item{damage_crop}{Numeric. Crop damage in 2023 inflation-adjusted dollars.} +\item{injuries}{Numeric. Number of injuries.} +\item{fatalities}{Numeric. Number of fatalities.} +\item{records}{Numeric. Number of individual event records aggregated into this observation.} +} +Note: Only counties that existed in either 2010 or 2022 are included. } \description{ Access temporal county-level SHELDUS hazard damage data. 
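To make the documented get_sheldus() return structure concrete, a minimal usage sketch follows (illustrative only, not part of this patch): it assumes only the columns listed in the \value block above, and the file path passed to file_path is hypothetical.
# Illustrative sketch: summarize get_sheldus() output to county-level hazard
# totals using the columns documented in man/get_sheldus.Rd.
sheldus <- get_sheldus(file_path = "data/sheldus_raw.csv") # hypothetical path

county_hazard_totals <- sheldus |>
  dplyr::group_by(GEOID, state_name, county_name, hazard) |>
  dplyr::summarise(
    damage_property = sum(damage_property, na.rm = TRUE), # 2023 dollars
    damage_crop = sum(damage_crop, na.rm = TRUE),          # 2023 dollars
    injuries = sum(injuries, na.rm = TRUE),
    fatalities = sum(fatalities, na.rm = TRUE),
    .groups = "drop") |>
  dplyr::arrange(dplyr::desc(damage_property))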
diff --git a/man/get_spatial_extent_census.Rd b/man/get_spatial_extent_census.Rd index 376e030..c1356ea 100644 --- a/man/get_spatial_extent_census.Rd +++ b/man/get_spatial_extent_census.Rd @@ -14,7 +14,12 @@ get_spatial_extent_census(data, return_geometry = FALSE, projection = 5070) \item{projection}{The EPSG code of the desired projection. Default is 5070 (Albers Equal Area).} } \value{ -A dataframe (optionally, an sf-dataframe) comprising Census geographies +A tibble (or \code{sf} object if \code{return_geometry = TRUE}) containing Census geographies that overlap with the input spatial data. The structure depends on the geographic extent: +\describe{ +\item{When multiple states overlap}{Returns state-level data with columns: \code{state_geoid} (2-digit FIPS), \code{geography} ("state").} +\item{When a single state overlaps}{Returns tract-level data with columns: \code{state_geoid} (2-digit FIPS), \code{county_geoid} (5-digit FIPS), \code{geography} ("tract").} +} +If \code{return_geometry = TRUE}, the geometry column is retained; otherwise it is dropped. } \description{ Get the Census geographies that overlap with the input spatial dataset diff --git a/man/get_structures.Rd b/man/get_structures.Rd index 9d81a15..027e22c 100644 --- a/man/get_structures.Rd +++ b/man/get_structures.Rd @@ -14,7 +14,21 @@ get_structures(boundaries, geography = "county", keep_structures = FALSE) \item{keep_structures}{Logical. If TRUE, the raw structure data will be returned alongside the summarized data.} } \value{ -A dataframe comprising estimated counts of each structure type, at the specified \code{geography}, for all such geographic units intersecting the \code{boundaries} object. If keep_structure = TRUE, returns a list with two elements: the summarized data and the raw structure data. +Depends on the \code{keep_structures} parameter: + +\strong{When \code{keep_structures = FALSE} (default):} A tibble containing structure counts aggregated by geography and occupancy type, with columns: +\describe{ +\item{GEOID}{Character. Census geography identifier (county FIPS or tract GEOID depending on \code{geography} parameter).} +\item{primary_occupancy}{Character. The primary occupancy classification of the structures (e.g., "Single Family Dwelling", "Multi - Family Dwelling").} +\item{occupancy_class}{Character. Broad occupancy classification (e.g., "Residential", "Commercial").} +\item{count}{Integer. Number of structures of this occupancy type in the geography.} +} + +\strong{When \code{keep_structures = TRUE}:} A named list with two elements: +\describe{ +\item{structures_summarized}{The aggregated tibble described above.} +\item{structures_raw}{An \code{sf} object (POINT geometry) containing individual structure records with columns: \code{unique_id} (building ID), \code{occupancy_class}, \code{primary_occupancy}, \code{county_fips}, and geometry.} +} } \description{ Estimate counts of hazard-impacted structures by structure type diff --git a/man/get_system_username.Rd b/man/get_system_username.Rd index fe213d5..de6af47 100644 --- a/man/get_system_username.Rd +++ b/man/get_system_username.Rd @@ -7,7 +7,8 @@ get_system_username() } \value{ -The username of the user running the script +A character string containing the system username. Uses \code{Sys.info()["user"]} +which works reliably across Windows, Mac, and Linux. 
} \description{ Get the user's username diff --git a/man/inflation_adjust.Rd b/man/inflation_adjust.Rd index 535befe..121afe7 100644 --- a/man/inflation_adjust.Rd +++ b/man/inflation_adjust.Rd @@ -24,7 +24,7 @@ inflation_adjust( \item{base_year}{The year to use as the base for inflation adjustment. If NULL, defaults to the most recent year in the PCE index data.} } \value{ -A dataframe with inflation-adjusted values +A tibble identical to the input \code{df} with additional inflation-adjusted columns. For each column specified in \code{dollar_variables}, a new column is created with the same name plus \code{names_suffix} (default: "_{base_year}"). The adjusted values are calculated by multiplying original values by an inflation factor derived from the PCE Price Index ratio between the base year and each observation's year. Original columns are preserved unchanged. } \description{ The Personal Consumption Expenditures Price Index (PCE Index) is from the diff --git a/man/polygons_to_linestring.Rd b/man/polygons_to_linestring.Rd index 4eb2303..b4af43c 100644 --- a/man/polygons_to_linestring.Rd +++ b/man/polygons_to_linestring.Rd @@ -10,7 +10,14 @@ polygons_to_linestring(.sf) \item{.sf}{The spatial dataframe containing one or more polygons} } \value{ -A simple feature collection of linestrings derived from the inputted polygons; all attributes are retained, and two new attributes--\code{polygon_id} and \code{line_id}--are prepended to the output +An \code{sf} object (simple feature collection) with geometry type LINESTRING. The returned object contains: +\describe{ +\item{polygon_id}{Integer. The row index of the originating polygon from the input \code{.sf} object, enabling linkage back to the source polygon.} +\item{line_id}{Integer. A sequential identifier for each line segment within its originating polygon. Line segments are ordered according to the vertex sequence of the polygon boundary.} +\item{...}{All original attributes from the input \code{.sf} object are preserved and joined back via \code{polygon_id}.} +\item{geometry}{LINESTRING geometry. Each line segment represents one edge of the original polygon boundary.} +} +The CRS of the output matches the input \code{.sf} object (transformed to EPSG:5070 during processing). } \description{ Convert polygons into their component linestrings diff --git a/man/qualtrics_define_missing.Rd b/man/qualtrics_define_missing.Rd index 011bbec..e7412c1 100644 --- a/man/qualtrics_define_missing.Rd +++ b/man/qualtrics_define_missing.Rd @@ -27,7 +27,12 @@ qualtrics_define_missing( \item{predicate_question_negative_value}{If \code{predicate_question} is specified, provide the value that indicates a negative response to the predicate question. For responses where the predicate question has this value, this value will be imputed to the specified columns} } \value{ -The inputted \code{df} object with missing/non-missing values applied to specified columns +A tibble containing only the columns selected by \code{question_code_include} (excluding those matching \code{question_code_omit}), with missing values handled according to the following logic: +\describe{ +\item{Without predicate_question}{If all selected columns are NA for a row, values remain NA. If any selected column has a non-NA value, NA values in other selected columns are replaced with the appropriate default value from \code{default_values} based on column type.} +\item{With predicate_question}{If the predicate question is NA, all selected columns are set to NA. 
If the predicate question equals \code{predicate_question_negative_value}, all selected columns are set to the appropriate default value. Otherwise, original values are preserved.} } Column types and their default value mappings: character uses \code{default_values[[1]]}, numeric uses \code{default_values[[2]]}, and Date/POSIXct uses \code{default_values[[3]]}. } \description{ Fill in missing and non-missing values across interrelated survey questions } diff --git a/man/qualtrics_format_metadata.Rd b/man/qualtrics_format_metadata.Rd index 9d3ddfc..a6d6968 100644 --- a/man/qualtrics_format_metadata.Rd +++ b/man/qualtrics_format_metadata.Rd @@ -14,7 +14,15 @@ qualtrics_format_metadata(metadata, sections = c(), text_replace = "zzzzz") \item{text_replace}{A named character vector of regex patterns to replace in the metadata} } \value{ -A dataframe of formatted metadata +A tibble containing formatted Qualtrics survey metadata with the following columns: +\describe{ +\item{question_number}{Integer. The sequential position of the question in the survey (1-indexed).} +\item{question_name}{Character. The internal Qualtrics question identifier (e.g., "Q1", "Q2_1").} +\item{text_main}{Character. The primary question text, with any patterns specified in \code{text_replace} substituted.} +\item{text_sub}{Character. The sub-question or response option text, with any patterns specified in \code{text_replace} substituted.} +\item{survey_section}{Character. The name of the survey section to which the question belongs, as defined by the \code{sections} parameter. Filled upward from section boundaries. +} +} } \description{ Prep Qualtrics metadata } diff --git a/man/qualtrics_get_metadata.Rd b/man/qualtrics_get_metadata.Rd index a8650b2..5ccd64b 100644 --- a/man/qualtrics_get_metadata.Rd +++ b/man/qualtrics_get_metadata.Rd @@ -21,7 +21,7 @@ qualtrics_get_metadata( \item{return_values}{The name of the column (character) to be returned} } \value{ -A character vector of the requested metadata +A character vector containing the values from the column specified by \code{return_values} (default: "text_sub"), filtered to rows matching either the \code{question_name} or \code{survey_section} pattern. The length of the vector corresponds to the number of matching rows in the metadata. Returns an empty character vector if no matches are found. } \description{ Access Qualtrics metadata } diff --git a/man/qualtrics_plot_question.Rd b/man/qualtrics_plot_question.Rd index 1d8a308..2627cbe 100644 --- a/man/qualtrics_plot_question.Rd +++ b/man/qualtrics_plot_question.Rd @@ -43,7 +43,14 @@ qualtrics_plot_question( \item{omit_other}{Logical; whether to omit the "Other" response option. Default is TRUE.} } \value{ -A ggplot object +A \code{ggplot2} object representing a visualization of survey responses. The plot type varies based on \code{question_type}: +\describe{ +\item{For "continuous"}{A boxplot showing the distribution of numeric responses, with question sub-text on the y-axis and values on the x-axis. Multiple sub-questions are displayed as separate boxplots. +} +\item{For "checkbox_single" or "checkbox_multi"}{A horizontal bar chart showing response counts. Response options are ordered by total count (descending). 
For "checkbox_multi", bars are stacked by response type.} +\item{For "checkbox_factor"}{A stacked horizontal bar chart showing response counts by factor level, with response options ordered by total count.} +} +The plot uses Urban Institute theming via \code{urbnthemes::theme_urbn_print()} and includes the specified \code{title} and auto-generated or custom \code{subtitle}. } \description{ Plot responses to Qualtrics survey questions diff --git a/man/read_ipums_cached.Rd b/man/read_ipums_cached.Rd index 15c5838..14baf10 100644 --- a/man/read_ipums_cached.Rd +++ b/man/read_ipums_cached.Rd @@ -21,7 +21,12 @@ read_ipums_cached( \item{refresh}{If true, execute the API query, even if data are already stored locally. Defaults to FALSE.} } \value{ -A dataframe corresponding to the supplied \code{extract_definition} +A tibble containing IPUMS data corresponding to the supplied \code{extract_definition}. The structure varies by collection type: +\describe{ +\item{For microdata collections (e.g., "usa", "cps")}{Returns individual-level records with columns corresponding to the variables specified in the extract definition. Column names and types are determined by IPUMS variable specifications. The data are read via \code{ipumsr::read_ipums_micro()}.} +\item{For aggregate collections ("nhgis", "ihgis")}{Returns aggregate data (typically at geographic summary levels) with columns corresponding to the requested tables/variables. IPUMS variable attributes are applied via the collection's codebook. The data are read via \code{ipumsr::read_ipums_agg()}.} +} +If a cached file exists at the specified path and \code{refresh = FALSE}, the cached data are returned with a warning. Otherwise, the extract is submitted to IPUMS, downloaded, and cached for future use. } \description{ This script wraps a standard ipumsr::read_ipums*() query workflow, addressing diff --git a/man/read_xlsx_from_url.Rd b/man/read_xlsx_from_url.Rd index efb1811..2c2334e 100644 --- a/man/read_xlsx_from_url.Rd +++ b/man/read_xlsx_from_url.Rd @@ -16,7 +16,8 @@ read_xlsx_from_url(urls, directory, file_names = NULL, silent = TRUE) \item{silent}{If TRUE (default), files are saved silently. If FALSE, downloaded files are read and returned as a list.} } \value{ -Either nothing (silent == TRUE) or a list of dataframes from the specified URLs. +When \code{silent = TRUE} (default): Returns NULL invisibly. Files are downloaded and saved to \code{directory}. +When \code{silent = FALSE}: Returns a list of data frames, one per URL, containing the contents of each downloaded .xlsx file as read by \code{openxlsx::read.xlsx()}. List elements are in the same order as the input \code{urls}. } \description{ Download a .xlsx file(s) from a URL(s) diff --git a/man/subdivide_linestring.Rd b/man/subdivide_linestring.Rd index 0f3af6c..6eb0f22 100644 --- a/man/subdivide_linestring.Rd +++ b/man/subdivide_linestring.Rd @@ -14,7 +14,13 @@ subdivide_linestring(line, max_length, crs = 5070) \item{crs}{The coordinate reference system to which the linestring should be transformed. Default is 5070.} } \value{ -A spatial dataframe comprising linestrings below the \code{max_length} threshold, linked back to their input linestrings via a \code{line_id} attribute +An \code{sf} object (simple feature collection) with geometry type LINESTRING. The returned object contains: +\describe{ +\item{row_id}{Integer. 
The row index from the original input linestring, allowing linkage back to the input data.} \item{...}{All original attributes from the input \code{line} object are preserved and joined back via \code{row_id}.} \item{geometry}{LINESTRING geometry. Each segment is at most \code{max_length} units long (in the CRS units). Segments shorter than \code{max_length} in the input are returned unchanged.} } The CRS of the output is set to the value specified by the \code{crs} parameter (default: EPSG:5070). } \description{ Subdivide a linestring into segments of a specified length } diff --git a/tests/testthat/test-get_business_patterns.R b/tests/testthat/test-get_business_patterns.R index 586a5e7..c90b7f1 100644 --- a/tests/testthat/test-get_business_patterns.R +++ b/tests/testthat/test-get_business_patterns.R @@ -1,3 +1,4 @@ +# Tests for get_business_patterns() testthat::test_that("naics_code_digits errors clearly when not in c(2,3)", { testthat::expect_error({get_business_patterns(year = 2022, naics_code_digits = 4)}) @@ -29,3 +30,54 @@ testthat::test_that("employees has no negative values", { info = "Found negative values in `employees`." ) }) + +# Tests for get_naics_codes() + +testthat::test_that("get_naics_codes exists and is a function", { + testthat::expect_true(is.function(get_naics_codes)) +}) + +testthat::test_that("get_naics_codes validates year parameter", { + testthat::expect_error(get_naics_codes(year = 1985), "1986 or later") + testthat::expect_error(get_naics_codes(year = 2030), "2023") +}) + +testthat::test_that("get_naics_codes validates digits parameter", { + testthat::expect_error(get_naics_codes(year = 2022, digits = 1), "between 2 and 6") + testthat::expect_error(get_naics_codes(year = 2022, digits = 7), "between 2 and 6") +}) + +testthat::test_that("get_naics_codes returns tibble with expected columns", { + result <- get_naics_codes(year = 2022, digits = 2) + + testthat::expect_s3_class(result, "tbl_df") + testthat::expect_true("naics_code" %in% names(result)) + testthat::expect_true("naics_label" %in% names(result)) + testthat::expect_true("year" %in% names(result)) +}) + +testthat::test_that("get_naics_codes filters by digit count correctly", { + result_2 <- get_naics_codes(year = 2022, digits = 2) + result_3 <- get_naics_codes(year = 2022, digits = 3) + + # All 2-digit codes should have exactly 2 characters + testthat::expect_true(all(nchar(result_2$naics_code) == 2)) + + # All 3-digit codes should have exactly 3 characters + testthat::expect_true(all(nchar(result_3$naics_code) == 3)) + + # 3-digit should have more codes than 2-digit + testthat::expect_gt(nrow(result_3), nrow(result_2)) +}) + +testthat::test_that("get_naics_codes default digits is 3", { + result <- get_naics_codes(year = 2022) + + testthat::expect_true(all(nchar(result$naics_code) == 3)) +}) + +testthat::test_that("get_naics_codes year column matches requested year", { + result <- get_naics_codes(year = 2020) + + testthat::expect_true(all(result$year == 2020)) +})
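Reviewer note: the sketch below shows how the new get_naics_codes() helper composes with the other functions this PR touches. It is illustrative only; it assumes a Census API key is configured for censusapi, and it uses naics_code_digits = 3 because the tests above restrict that argument to c(2, 3).
# Illustrative end-to-end sketch (not part of the patch).
# 1. Inspect available 3-digit NAICS codes with the new helper.
codes <- get_naics_codes(year = 2022, digits = 3)

# 2. Pull county-level CBP estimates at the same digit level.
cbp <- get_business_patterns(year = 2022, geo = "county", naics_code_digits = 3)

# 3. Cache the result with the updated cache_it() defaults: path = "." writes
#    to the current directory, and keep_n = 3 retains only the three most
#    recent datestamped versions.
cache_it(cbp, file_name = "cbp_county_2022", keep_n = 3)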