diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json index 90df33d..c1b1326 100644 --- a/.devcontainer/devcontainer.json +++ b/.devcontainer/devcontainer.json @@ -1,22 +1,18 @@ { - "image": "rocker/r2u", + "name": "migrate R package", + "image": "rocker/r-ver:4", // Features to add to the dev container. More info: https://containers.dev/features. "features": { - // Install system library for `devtools::document()` - // More info: https://github.com/rocker-org/devcontainer-features/blob/main/src/quarto-cli/README.md#install-chromium - "ghcr.io/rocker-org/devcontainer-features/apt-packages:1": { + "ghcr.io/rocker-org/devcontainer-features/apt-packages:1": { "packages": "libxml2-dev, qpdf" }, - // Install pandoc (for building vignettes) - "ghcr.io/rocker-org/devcontainer-features/pandoc:1": {}, - // Install additional R package dependencies "ghcr.io/rocker-org/devcontainer-features/r-packages:1": { - "packages": "testthat, knitr, rmarkdown, dplyr, tidyr, tibble, rlang, utils, magrittr, devtools, usethis, testthat", - "additionalRepositories": "CRAN = 'https://packagemanager.posit.co/cran/__linux__/jammy/latest'" + "packages": "devtools, dplyr, github::nx10/httpgd, knitr, languageserver, magrittr, rlang, rmarkdown, testthat, tibble, tidyr, usethis", + "installSystemRequirements": true } }, @@ -25,13 +21,13 @@ // Settings for VS Code. "vscode": { "extensions": [ - "reditorsupport.r" + "reditorsupport.r", + "RDebugger.r-debugger" ], "settings": { + "editor.rulers": [80], "r.bracketedPaste": true, - "r.plot.useHttpgd": true, - "r.lsp.diagnostics": false, - "r.lsp.promptToInstall": false + "r.plot.useHttpgd": true } } } diff --git a/DESCRIPTION b/DESCRIPTION index dd077a4..c533c23 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Package: migrate Type: Package Title: Create Credit State Migration (Transition) Matrices -Version: 0.5.0 +Version: 0.5.1 Authors@R: c(person(given = "Michael", family = "Thomas", @@ -14,7 +14,11 @@ Authors@R: person(given = "Ivan", family = "Millanes", role = "ctb", - email = "imillanes@ketchbrookanalytics.com")) + email = "imillanes@ketchbrookanalytics.com"), + person(given = "Dylan", + family = "Hughes", + role = "ctb", + email = "dhughes@ketchbrookanalytics.com")) Description: Tools to help convert credit risk data at two timepoints into traditional credit state migration (aka, "transition") matrices. At a higher level, 'migrate' is intended to help an analyst understand @@ -44,5 +48,5 @@ Imports: VignetteBuilder: knitr Language: en-US -RoxygenNote: 7.3.1 +RoxygenNote: 7.3.3 Roxygen: list(markdown = TRUE) diff --git a/NEWS.md b/NEWS.md index 8ba537d..1dfcdb5 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,10 @@ +# migrate 0.5.1 + +## Bug Fix + +* Remediates issue where `migrate()` would fail if values of `time` argument had overlapping characters (i.e., "T1" and "T100") + + `migrate()` now throws a warning if the argument passed to `time` is a character-type column + # migrate 0.5.0 ## Enhancements diff --git a/R/migrate.R b/R/migrate.R index 07daa3b..98d55e5 100644 --- a/R/migrate.R +++ b/R/migrate.R @@ -72,8 +72,15 @@ coerce_factor <- function(data, state_name) { # to convert it to an ordered factor if (!is.ordered(state_vec)) { + msg <- glue::glue( + "Please consider converting `{ state_name }` to an ordered factor", + "before passing it to `migrate()` to ensure that the rank-ordering in", + "the final matrix displays correctly", + .sep = " " + ) + cli::cli_warn( - c("!" = glue::glue("Please consider converting `{ state_name }` to an ordered factor before passing it to `migrate()` to ensure that the rank-ordering in the final matrix displays correctly")) + c("!" = msg) ) } @@ -87,7 +94,12 @@ coerce_factor <- function(data, state_name) { cli::cli_warn( c( "!" = glue::glue("Converting `{ state_name }` to type `factor`"), - "!" = glue::glue("To ensure that your output is ordered correctly, convert the `{ state_name }` column variable in your data frame to an ordered factor before passing to `migrate()`") + "i" = glue::glue( + "To ensure that your output is ordered correctly, convert the", + "`{ state_name }` column variable in your data frame to an ordered", + "factor before passing to `migrate()`", + .sep = " " + ) ) ) @@ -103,6 +115,7 @@ coerce_factor <- function(data, state_name) { # Stop execution if there aren't exactly 2 unique time values in the data +# Warn that character time values may sort incorrectly (e.g., pre_, post_) check_times <- function(times, time_name) { if (length(times) != 2) { @@ -118,6 +131,19 @@ check_times <- function(times, time_name) { } + if (is.character(times)) { + + cli::cli_warn( + c("!" = glue::glue( + "Please consider converting `{ time_name }` to an ordered factor", + "before passing it to `migrate()` to ensure that transition is", + "appropriately chronological", + .sep = " " + )) + ) + + } + } @@ -144,7 +170,11 @@ drop_missing_timepoints <- function(data) { tidyr::drop_na() cli::cli_warn( - c("!" = glue::glue("Removed { (nrow(data) - nrow(out)) } observations due to missingness or IDs only existing at one `time` value")) + c("!" = glue::glue( + "Removed { (nrow(data) - nrow(out)) } observations due to missingness or", + "IDs only existing at one `time` value", + .sep = " " + )) ) return(out) @@ -186,7 +216,7 @@ migrate_percent <- function(data, state_start_name, metric_name) { "{metric_name}" := .data[[metric_name]] / sum(.data[[metric_name]]) ) |> dplyr::ungroup() |> - # Replace `NaN` values with `Inf` so that they are not dropped with `drop_na()` + # Replace `NaN` values with `Inf` so that they're not dropped by `drop_na()` dplyr::mutate( "{metric_name}" := ifelse( is.nan(.data[[metric_name]]), @@ -313,7 +343,9 @@ migrate <- function(data, id, time, state, # Stop if the `metric` variable isn't numeric if (!is.numeric(data[[metric_name]])) { - cli::cli_abort("`metric` argument must be a numeric type variable in `data`") + cli::cli_abort( + "`metric` argument must be a numeric type variable in `data`" + ) } @@ -360,10 +392,24 @@ migrate <- function(data, id, time, state, # Inform the user cli::cli_div(theme = list(ul = list(`margin-left` = 2, before = ""))) - cli::cli_alert_info(glue::glue("{ n_missing } IDs have a missing timepoint:")) + cli::cli_alert_info( + glue::glue("{ n_missing } IDs have a missing timepoint:") + ) cli::cli_ul(id = "ul_id") - cli::cli_li(glue::glue("Migrating { n_missing_end } IDs with missing end timepoint to { fill_state_class_type } class '{ fill_state }'")) - cli::cli_li(glue::glue("Migrating { n_missing_start } IDs with missing start timepoint from { fill_state_class_type } class '{ fill_state }'")) + cli::cli_li( + glue::glue( + "Migrating { n_missing_end } IDs with missing end timepoint to", + "{ fill_state_class_type } class '{ fill_state }'", + .sep = " " + ) + ) + cli::cli_li( + glue::glue( + "Migrating { n_missing_start } IDs with missing start timepoint from", + "{ fill_state_class_type } class '{ fill_state }'", + .sep = " " + ) + ) cli::cli_end(id = "ul_id") } @@ -416,14 +462,16 @@ migrate <- function(data, id, time, state, } # Replace the time values in the column names with "start" and "end" + # Anchoring each pattern to the end of the column names to + # accommodate overlapping patterns (e.g., M1 and M12) colnames(data) <- gsub( - pattern = as.character(times[1]), + pattern = paste0(as.character(times[1]), "$"), replacement = "start", x = colnames(data) ) colnames(data) <- gsub( - pattern = as.character(times[2]), + pattern = paste0(as.character(times[2]), "$"), replacement = "end", x = colnames(data) ) diff --git a/README.Rmd b/README.Rmd index 725faa2..76e8348 100644 --- a/README.Rmd +++ b/README.Rmd @@ -61,16 +61,19 @@ devtools::install_github("ketchbrookanalytics/migrate") ## Practical Usage -{migrate} currently only handles transitions between exactly two (2) timepoints. Under the hood, `migrate()` finds the earliest & latest dates in the given *time* variable, and filters out any observations where the *time* value does not match those two dates. +{migrate} currently only handles transitions between exactly two (2) timepoints. Under the hood, `migrate()` finds the earliest & latest timepoints in the given *time* variable, and filters out any observations where the *time* value does not match those two periods. -If you are writing a SQL query to get data to be used with `migrate()`, the query would likely look something like this: +`migrate()` identifies the desired timepoints in the *time* variable by isolating the unique values and sorting. As a result, `migrate()` can accommodate a variety of data types. However, `date` or `datetime` data types are likely the most convenient to work with. + +While most data types will sort appropriately, if the *time* variable is type `character`, it is recommended to convert to type `factor` (ordered) before passing to `migrate()` to ensure the *time* values are properly sequenced. `migrate()` will throw a warning if the *time* variable is type `character`. -```{r, eval = FALSE} -# -- Get the *State* risk status and *Balance* dollar amount for each ID, at two distinct dates +If you are writing a SQL query to get data to be used with `migrate()`, the query would likely look something like this: -# SELECT ID, Date, State, Balance -# FROM my_database -# WHERE Date IN ('2020-12-31', '2021-06-30') +```sql +-- Get the *State* risk status and *Balance* dollar amount for each ID, at two distinct dates +SELECT ID, Date, State, Balance +FROM my_database +WHERE Date IN ('2020-12-31', '2021-06-30') ``` By default, `migrate()` drops observations that belong to IDs found at a single timepoint. However, users can define a *filler state* so that IDs with a single timepoint are not removed but rather migrated from or to this *filler state*. This allows for more flexible handling of such data, ensuring that no information is lost during the migration process. Check [Handle IDs with observations at a single timepoint](https://ketchbrookanalytics.github.io/migrate/articles/migrate.html#handle-ids-with-observations-at-a-single-timepoint) for more information. diff --git a/README.md b/README.md index cd013a9..532fb47 100644 --- a/README.md +++ b/README.md @@ -68,18 +68,28 @@ devtools::install_github("ketchbrookanalytics/migrate") {migrate} currently only handles transitions between exactly two (2) timepoints. Under the hood, `migrate()` finds the earliest & latest -dates in the given *time* variable, and filters out any observations -where the *time* value does not match those two dates. +timepoints in the given *time* variable, and filters out any +observations where the *time* value does not match those two periods. + +`migrate()` identifies the desired timepoints in the *time* variable by +isolating the unique values and sorting. As a result, `migrate()` can +accommodate a variety of data types. However, `date` or `datetime` data +types are likely the most convenient to work with. + +While most data types will sort appropriately, if the *time* variable is +type `character`, it is recommended to convert to type `factor` +(ordered) before passing to `migrate()` to ensure the *time* values are +properly sequenced. `migrate()` will throw a warning if the *time* +variable is type `character`. If you are writing a SQL query to get data to be used with `migrate()`, the query would likely look something like this: -``` r -# -- Get the *State* risk status and *Balance* dollar amount for each ID, at two distinct dates - -# SELECT ID, Date, State, Balance -# FROM my_database -# WHERE Date IN ('2020-12-31', '2021-06-30') +``` sql +-- Get the *State* risk status and *Balance* dollar amount for each ID, at two distinct dates +SELECT ID, Date, State, Balance +FROM my_database +WHERE Date IN ('2020-12-31', '2021-06-30') ``` By default, `migrate()` drops observations that belong to IDs found at a @@ -137,9 +147,6 @@ migrated_df <- migrate( state = risk_rating, ) #> ℹ Migrating from 2020-06-30 to 2020-09-30 -``` - -``` r head(migrated_df) #> # A tibble: 6 × 3 #> risk_rating_start risk_rating_end prop diff --git a/tests/testthat/test-migrate.R b/tests/testthat/test-migrate.R index 95fd0d6..9533e6f 100644 --- a/tests/testthat/test-migrate.R +++ b/tests/testthat/test-migrate.R @@ -315,6 +315,7 @@ test_that("migrate() throws an error if `metric` argument is not numeric column" }) + test_that("migrate() correctly names third column based upon `metric` argument", { # when `percent = TRUE` (default) @@ -360,6 +361,7 @@ test_that("migrate() correctly names third column based upon `metric` argument", }) + test_that("migrate() coerces 'character'-type `state` columns to type 'factor'", { suppressWarnings({ @@ -389,13 +391,61 @@ test_that("migrate() coerces 'character'-type `state` columns to type 'factor'", }) + +# Mutate `date` to 'character'-type +mock_credit_time_character <- mock_credit |> + dplyr::mutate( + time_overlap_chars = dplyr::case_when( + date == as.Date("2020-06-30") ~ "M1", + date == as.Date("2020-09-30") ~ "M100" + ) + ) + + +testthat::test_that("migrate() names 'character'-type `time` columns correctly", { + + df_time_character <- suppressWarnings({ + migrate( + data = mock_credit_time_character, + time = time_overlap_chars, + state = risk_rating, + id = customer_id, + verbose = FALSE + ) + }) + + testthat::expect_identical( + raw_ct, + df_time_character + ) + +}) + + +testthat::test_that("migrate() throws a warning if `time` variable is 'character`-type", { + + # suggest converting character to ordered factor + testthat::expect_warning( + migrate( + data = mock_credit_time_character, + time = time_overlap_chars, + state = risk_rating, + id = customer_id, + verbose = FALSE + ), + regexp = "Please consider converting `time_overlap_chars` to an ordered factor" + ) + +}) + + ## Tests for `fill_state` argument --------------------------------------- # Create mock data with `customer_id` values that only exist at one timepoint. # In particular, `mock_credit_with_missing` has: # - 20 customers that have a value only in the first timepoint # - 10 customers that have a value only in the second timepoint -mock_credit_with_missing <- mock_credit |> +mock_credit_with_missing <- mock_credit |> # Remove the first 10 rows dplyr::slice(-(1:10)) |> # Remove the last 20 rows @@ -410,8 +460,8 @@ test_that("migrate() doesn't remove customers with missing timepoints when `fill id = customer_id, percent = FALSE, verbose = FALSE - ) |> - dplyr::pull(count) |> + ) |> + dplyr::pull(count) |> sum() migrate_counts_with_missing <- migrate( @@ -422,8 +472,8 @@ test_that("migrate() doesn't remove customers with missing timepoints when `fill percent = FALSE, fill_state = "NR", verbose = FALSE - ) |> - dplyr::pull(count) |> + ) |> + dplyr::pull(count) |> sum() expect_equal(migrate_counts_without_missing, migrate_counts_with_missing) @@ -440,10 +490,10 @@ test_that("migrate() removes customers with missing timepoints when `fill_state` id = customer_id, percent = FALSE, verbose = FALSE - ) |> - dplyr::pull(count) |> + ) |> + dplyr::pull(count) |> sum() - }) + }) migrate_counts_with_fill_state <- migrate( data = mock_credit_with_missing, @@ -453,8 +503,8 @@ test_that("migrate() removes customers with missing timepoints when `fill_state` percent = FALSE, fill_state = "NR", verbose = FALSE - ) |> - dplyr::pull(count) |> + ) |> + dplyr::pull(count) |> sum() expect_true(migrate_counts_without_fill_state < migrate_counts_with_fill_state) @@ -503,14 +553,14 @@ test_that("migrate() assigns filler state correctly when `fill_state` is not NUL verbose = FALSE ) - n_missing_start <- migrated_data |> - dplyr::count(risk_rating_start, wt = count) |> - dplyr::filter(risk_rating_start == "NR") |> + n_missing_start <- migrated_data |> + dplyr::count(risk_rating_start, wt = count) |> + dplyr::filter(risk_rating_start == "NR") |> dplyr::pull(n) - n_missing_end <- migrated_data |> - dplyr::count(risk_rating_end, wt = count) |> - dplyr::filter(risk_rating_end == "NR") |> + n_missing_end <- migrated_data |> + dplyr::count(risk_rating_end, wt = count) |> + dplyr::filter(risk_rating_end == "NR") |> dplyr::pull(n) # Recall that `mock_credit_with_missing` removed the first 10 and the last 20 rows