diff --git a/DESCRIPTION b/DESCRIPTION index 0b647a1..2b2e1c2 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -17,15 +17,19 @@ URL: https://important.tidymodels.org/, https://github.com/tidymodels/important BugReports: https://github.com/tidymodels/important/issues Depends: - R (>= 4.1.0) + R (>= 4.1.0), + recipes (>= 1.1.0) Imports: cli, + desirability2 (>= 0.2.0), dplyr, + filtro (>= 0.2.0), generics, ggplot2, hardhat (>= 1.4.1), purrr, - rlang, + rlang (>= 1.1.0), + S7, tibble, tidyr, tune, @@ -39,7 +43,7 @@ Suggests: mirai, modeldata, parsnip, - recipes, + ranger, spelling, survival, testthat (>= 3.0.0), diff --git a/NAMESPACE b/NAMESPACE index 63caeb2..e8190a3 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,14 +1,37 @@ # Generated by roxygen2: do not edit by hand S3method(autoplot,importance_perm) +S3method(bake,step_predictor_best) +S3method(bake,step_predictor_desirability) +S3method(bake,step_predictor_retain) +S3method(prep,step_predictor_best) +S3method(prep,step_predictor_desirability) +S3method(prep,step_predictor_retain) +S3method(print,step_predictor_best) +S3method(print,step_predictor_desirability) +S3method(print,step_predictor_retain) +S3method(required_pkgs,step_predictor_best) +S3method(required_pkgs,step_predictor_desirability) +S3method(tidy,step_predictor_best) +S3method(tidy,step_predictor_desirability) +S3method(tidy,step_predictor_retain) +S3method(tunable,step_predictor_best) +S3method(tunable,step_predictor_desirability) export(augment) export(autoplot) export(importance_perm) export(required_pkgs) +export(step_predictor_best) +export(step_predictor_desirability) +export(step_predictor_retain) +if (getRversion() < "4.3.0") importFrom("S7", "@") +import(recipes) +import(rlang) +importFrom(S7,check_is_S7) importFrom(generics,augment) importFrom(generics,required_pkgs) importFrom(ggplot2,autoplot) importFrom(hardhat,extract_fit_parsnip) -importFrom(hardhat,extract_postprocessor) +importFrom(stats,as.formula) importFrom(stats,predict) 
importFrom(stats,sd) diff --git a/R/import-standalone-obj-type.R b/R/import-standalone-obj-type.R new file mode 100644 index 0000000..c582ba0 --- /dev/null +++ b/R/import-standalone-obj-type.R @@ -0,0 +1,365 @@ +# Standalone file: do not edit by hand +# Source: https://github.com/r-lib/rlang/blob/HEAD/R/standalone-obj-type.R +# Generated by: usethis::use_standalone("r-lib/rlang", "obj-type") +# ---------------------------------------------------------------------- +# +# --- +# repo: r-lib/rlang +# file: standalone-obj-type.R +# last-updated: 2024-02-14 +# license: https://unlicense.org +# imports: rlang (>= 1.1.0) +# --- +# +# ## Changelog +# +# 2024-02-14: +# - `obj_type_friendly()` now works for S7 objects. +# +# 2023-05-01: +# - `obj_type_friendly()` now only displays the first class of S3 objects. +# +# 2023-03-30: +# - `stop_input_type()` now handles `I()` input literally in `arg`. +# +# 2022-10-04: +# - `obj_type_friendly(value = TRUE)` now shows numeric scalars +# literally. +# - `stop_friendly_type()` now takes `show_value`, passed to +# `obj_type_friendly()` as the `value` argument. +# +# 2022-10-03: +# - Added `allow_na` and `allow_null` arguments. +# - `NULL` is now backticked. +# - Better friendly type for infinities and `NaN`. +# +# 2022-09-16: +# - Unprefixed usage of rlang functions with `rlang::` to +# avoid onLoad issues when called from rlang (#1482). +# +# 2022-08-11: +# - Prefixed usage of rlang functions with `rlang::`. +# +# 2022-06-22: +# - `friendly_type_of()` is now `obj_type_friendly()`. +# - Added `obj_type_oo()`. +# +# 2021-12-20: +# - Added support for scalar values and empty vectors. +# - Added `stop_input_type()` +# +# 2021-06-30: +# - Added support for missing arguments. +# +# 2021-04-19: +# - Added support for matrices and arrays (#141). +# - Added documentation. +# - Added changelog. +# +# nocov start + +#' Return English-friendly type +#' @param x Any R object. +#' @param value Whether to describe the value of `x`. 
Special values +#' like `NA` or `""` are always described. +#' @param length Whether to mention the length of vectors and lists. +#' @return A string describing the type. Starts with an indefinite +#' article, e.g. "an integer vector". +#' @noRd +obj_type_friendly <- function(x, value = TRUE) { + if (is_missing(x)) { + return("absent") + } + + if (is.object(x)) { + if (inherits(x, "quosure")) { + type <- "quosure" + } else { + type <- class(x)[[1L]] + } + return(sprintf("a <%s> object", type)) + } + + if (!is_vector(x)) { + return(.rlang_as_friendly_type(typeof(x))) + } + + n_dim <- length(dim(x)) + + if (!n_dim) { + if (!is_list(x) && length(x) == 1) { + if (is_na(x)) { + return(switch( + typeof(x), + logical = "`NA`", + integer = "an integer `NA`", + double = if (is.nan(x)) { + "`NaN`" + } else { + "a numeric `NA`" + }, + complex = "a complex `NA`", + character = "a character `NA`", + .rlang_stop_unexpected_typeof(x) + )) + } + + show_infinites <- function(x) { + if (x > 0) { + "`Inf`" + } else { + "`-Inf`" + } + } + str_encode <- function(x, width = 30, ...) { + if (nchar(x) > width) { + x <- substr(x, 1, width - 3) + x <- paste0(x, "...") + } + encodeString(x, ...) 
+ } + + if (value) { + if (is.numeric(x) && is.infinite(x)) { + return(show_infinites(x)) + } + + if (is.numeric(x) || is.complex(x)) { + number <- as.character(round(x, 2)) + what <- if (is.complex(x)) "the complex number" else "the number" + return(paste(what, number)) + } + + return(switch( + typeof(x), + logical = if (x) "`TRUE`" else "`FALSE`", + character = { + what <- if (nzchar(x)) "the string" else "the empty string" + paste(what, str_encode(x, quote = "\"")) + }, + raw = paste("the raw value", as.character(x)), + .rlang_stop_unexpected_typeof(x) + )) + } + + return(switch( + typeof(x), + logical = "a logical value", + integer = "an integer", + double = if (is.infinite(x)) show_infinites(x) else "a number", + complex = "a complex number", + character = if (nzchar(x)) "a string" else "\"\"", + raw = "a raw value", + .rlang_stop_unexpected_typeof(x) + )) + } + + if (length(x) == 0) { + return(switch( + typeof(x), + logical = "an empty logical vector", + integer = "an empty integer vector", + double = "an empty numeric vector", + complex = "an empty complex vector", + character = "an empty character vector", + raw = "an empty raw vector", + list = "an empty list", + .rlang_stop_unexpected_typeof(x) + )) + } + } + + vec_type_friendly(x) +} + +vec_type_friendly <- function(x, length = FALSE) { + if (!is_vector(x)) { + abort("`x` must be a vector.") + } + type <- typeof(x) + n_dim <- length(dim(x)) + + add_length <- function(type) { + if (length && !n_dim) { + paste0(type, sprintf(" of length %s", length(x))) + } else { + type + } + } + + if (type == "list") { + if (n_dim < 2) { + return(add_length("a list")) + } else if (is.data.frame(x)) { + return("a data frame") + } else if (n_dim == 2) { + return("a list matrix") + } else { + return("a list array") + } + } + + type <- switch( + type, + logical = "a logical %s", + integer = "an integer %s", + numeric = , + double = "a double %s", + complex = "a complex %s", + character = "a character %s", + raw = "a raw %s", 
+ type = paste0("a ", type, " %s") + ) + + if (n_dim < 2) { + kind <- "vector" + } else if (n_dim == 2) { + kind <- "matrix" + } else { + kind <- "array" + } + out <- sprintf(type, kind) + + if (n_dim >= 2) { + out + } else { + add_length(out) + } +} + +.rlang_as_friendly_type <- function(type) { + switch( + type, + + list = "a list", + + NULL = "`NULL`", + environment = "an environment", + externalptr = "a pointer", + weakref = "a weak reference", + S4 = "an S4 object", + + name = , + symbol = "a symbol", + language = "a call", + pairlist = "a pairlist node", + expression = "an expression vector", + + char = "an internal string", + promise = "an internal promise", + ... = "an internal dots object", + any = "an internal `any` object", + bytecode = "an internal bytecode object", + + primitive = , + builtin = , + special = "a primitive function", + closure = "a function", + + type + ) +} + +.rlang_stop_unexpected_typeof <- function(x, call = caller_env()) { + abort( + sprintf("Unexpected type <%s>.", typeof(x)), + call = call + ) +} + +#' Return OO type +#' @param x Any R object. +#' @return One of `"bare"` (for non-OO objects), `"S3"`, `"S4"`, +#' `"R6"`, or `"S7"`. +#' @noRd +obj_type_oo <- function(x) { + if (!is.object(x)) { + return("bare") + } + + class <- inherits(x, c("R6", "S7_object"), which = TRUE) + + if (class[[1]]) { + "R6" + } else if (class[[2]]) { + "S7" + } else if (isS4(x)) { + "S4" + } else { + "S3" + } +} + +#' @param x The object type which does not conform to `what`. Its +#' `obj_type_friendly()` is taken and mentioned in the error message. +#' @param what The friendly expected type as a string. Can be a +#' character vector of expected types, in which case the error +#' message mentions all of them in an "or" enumeration. +#' @param show_value Passed to `value` argument of `obj_type_friendly()`. +#' @param ... Arguments passed to [abort()]. 
+#' @inheritParams args_error_context +#' @noRd +stop_input_type <- function( + x, + what, + ..., + allow_na = FALSE, + allow_null = FALSE, + show_value = TRUE, + arg = caller_arg(x), + call = caller_env() +) { + # From standalone-cli.R + cli <- env_get_list( + nms = c("format_arg", "format_code"), + last = topenv(), + default = function(x) sprintf("`%s`", x), + inherit = TRUE + ) + + if (allow_na) { + what <- c(what, cli$format_code("NA")) + } + if (allow_null) { + what <- c(what, cli$format_code("NULL")) + } + if (length(what)) { + what <- oxford_comma(what) + } + if (inherits(arg, "AsIs")) { + format_arg <- identity + } else { + format_arg <- cli$format_arg + } + + message <- sprintf( + "%s must be %s, not %s.", + format_arg(arg), + what, + obj_type_friendly(x, value = show_value) + ) + + abort(message, ..., call = call, arg = arg) +} + +oxford_comma <- function(chr, sep = ", ", final = "or") { + n <- length(chr) + + if (n < 2) { + return(chr) + } + + head <- chr[seq_len(n - 1)] + last <- chr[n] + + head <- paste(head, collapse = sep) + + # Write a or b. But a, b, or c. 
+ if (n > 2) { + paste0(head, sep, final, " ", last) + } else { + paste0(head, " ", final, " ", last) + } +} + +# nocov end diff --git a/R/import-standalone-types-check.R b/R/import-standalone-types-check.R new file mode 100644 index 0000000..5214a00 --- /dev/null +++ b/R/import-standalone-types-check.R @@ -0,0 +1,596 @@ +# Standalone file: do not edit by hand +# Source: https://github.com/r-lib/rlang/blob/HEAD/R/standalone-types-check.R +# Generated by: usethis::use_standalone("r-lib/rlang", "types-check") +# ---------------------------------------------------------------------- +# +# --- +# repo: r-lib/rlang +# file: standalone-types-check.R +# last-updated: 2023-03-13 +# license: https://unlicense.org +# dependencies: standalone-obj-type.R +# imports: rlang (>= 1.1.0) +# --- +# +# ## Changelog +# +# 2024-08-15: +# - `check_character()` gains an `allow_na` argument (@martaalcalde, #1724) +# +# 2023-03-13: +# - Improved error messages of number checkers (@teunbrand) +# - Added `allow_infinite` argument to `check_number_whole()` (@mgirlich). +# - Added `check_data_frame()` (@mgirlich). +# +# 2023-03-07: +# - Added dependency on rlang (>= 1.1.0). +# +# 2023-02-15: +# - Added `check_logical()`. +# +# - `check_bool()`, `check_number_whole()`, and +# `check_number_decimal()` are now implemented in C. +# +# - For efficiency, `check_number_whole()` and +# `check_number_decimal()` now take a `NULL` default for `min` and +# `max`. This makes it possible to bypass unnecessary type-checking +# and comparisons in the default case of no bounds checks. +# +# 2022-10-07: +# - `check_number_whole()` and `_decimal()` no longer treat +# non-numeric types such as factors or dates as numbers. Numeric +# types are detected with `is.numeric()`. +# +# 2022-10-04: +# - Added `check_name()` that forbids the empty string. +# `check_string()` allows the empty string by default. +# +# 2022-09-28: +# - Removed `what` arguments. +# - Added `allow_na` and `allow_null` arguments. 
+# - Added `allow_decimal` and `allow_infinite` arguments. +# - Improved errors with absent arguments. +# +# +# 2022-09-16: +# - Unprefixed usage of rlang functions with `rlang::` to +# avoid onLoad issues when called from rlang (#1482). +# +# 2022-08-11: +# - Added changelog. +# +# nocov start + +# Scalars ----------------------------------------------------------------- + +.standalone_types_check_dot_call <- .Call + +check_bool <- function( + x, + ..., + allow_na = FALSE, + allow_null = FALSE, + arg = caller_arg(x), + call = caller_env() +) { + if ( + !missing(x) && + .standalone_types_check_dot_call( + ffi_standalone_is_bool_1.0.7, + x, + allow_na, + allow_null + ) + ) { + return(invisible(NULL)) + } + + stop_input_type( + x, + c("`TRUE`", "`FALSE`"), + ..., + allow_na = allow_na, + allow_null = allow_null, + arg = arg, + call = call + ) +} + +check_string <- function( + x, + ..., + allow_empty = TRUE, + allow_na = FALSE, + allow_null = FALSE, + arg = caller_arg(x), + call = caller_env() +) { + if (!missing(x)) { + is_string <- .rlang_check_is_string( + x, + allow_empty = allow_empty, + allow_na = allow_na, + allow_null = allow_null + ) + if (is_string) { + return(invisible(NULL)) + } + } + + stop_input_type( + x, + "a single string", + ..., + allow_na = allow_na, + allow_null = allow_null, + arg = arg, + call = call + ) +} + +.rlang_check_is_string <- function(x, allow_empty, allow_na, allow_null) { + if (is_string(x)) { + if (allow_empty || !is_string(x, "")) { + return(TRUE) + } + } + + if (allow_null && is_null(x)) { + return(TRUE) + } + + if (allow_na && (identical(x, NA) || identical(x, na_chr))) { + return(TRUE) + } + + FALSE +} + +check_name <- function( + x, + ..., + allow_null = FALSE, + arg = caller_arg(x), + call = caller_env() +) { + if (!missing(x)) { + is_string <- .rlang_check_is_string( + x, + allow_empty = FALSE, + allow_na = FALSE, + allow_null = allow_null + ) + if (is_string) { + return(invisible(NULL)) + } + } + + stop_input_type( + x, + "a 
valid name", + ..., + allow_na = FALSE, + allow_null = allow_null, + arg = arg, + call = call + ) +} + +IS_NUMBER_true <- 0 +IS_NUMBER_false <- 1 +IS_NUMBER_oob <- 2 + +check_number_decimal <- function( + x, + ..., + min = NULL, + max = NULL, + allow_infinite = TRUE, + allow_na = FALSE, + allow_null = FALSE, + arg = caller_arg(x), + call = caller_env() +) { + if (missing(x)) { + exit_code <- IS_NUMBER_false + } else if ( + 0 == + (exit_code <- .standalone_types_check_dot_call( + ffi_standalone_check_number_1.0.7, + x, + allow_decimal = TRUE, + min, + max, + allow_infinite, + allow_na, + allow_null + )) + ) { + return(invisible(NULL)) + } + + .stop_not_number( + x, + ..., + exit_code = exit_code, + allow_decimal = TRUE, + min = min, + max = max, + allow_na = allow_na, + allow_null = allow_null, + arg = arg, + call = call + ) +} + +check_number_whole <- function( + x, + ..., + min = NULL, + max = NULL, + allow_infinite = FALSE, + allow_na = FALSE, + allow_null = FALSE, + arg = caller_arg(x), + call = caller_env() +) { + if (missing(x)) { + exit_code <- IS_NUMBER_false + } else if ( + 0 == + (exit_code <- .standalone_types_check_dot_call( + ffi_standalone_check_number_1.0.7, + x, + allow_decimal = FALSE, + min, + max, + allow_infinite, + allow_na, + allow_null + )) + ) { + return(invisible(NULL)) + } + + .stop_not_number( + x, + ..., + exit_code = exit_code, + allow_decimal = FALSE, + min = min, + max = max, + allow_na = allow_na, + allow_null = allow_null, + arg = arg, + call = call + ) +} + +.stop_not_number <- function( + x, + ..., + exit_code, + allow_decimal, + min, + max, + allow_na, + allow_null, + arg, + call +) { + if (allow_decimal) { + what <- "a number" + } else { + what <- "a whole number" + } + + if (exit_code == IS_NUMBER_oob) { + min <- min %||% -Inf + max <- max %||% Inf + + if (min > -Inf && max < Inf) { + what <- sprintf("%s between %s and %s", what, min, max) + } else if (x < min) { + what <- sprintf("%s larger than or equal to %s", what, min) + } 
else if (x > max) { + what <- sprintf("%s smaller than or equal to %s", what, max) + } else { + abort("Unexpected state in OOB check", .internal = TRUE) + } + } + + stop_input_type( + x, + what, + ..., + allow_na = allow_na, + allow_null = allow_null, + arg = arg, + call = call + ) +} + +check_symbol <- function( + x, + ..., + allow_null = FALSE, + arg = caller_arg(x), + call = caller_env() +) { + if (!missing(x)) { + if (is_symbol(x)) { + return(invisible(NULL)) + } + if (allow_null && is_null(x)) { + return(invisible(NULL)) + } + } + + stop_input_type( + x, + "a symbol", + ..., + allow_na = FALSE, + allow_null = allow_null, + arg = arg, + call = call + ) +} + +check_arg <- function( + x, + ..., + allow_null = FALSE, + arg = caller_arg(x), + call = caller_env() +) { + if (!missing(x)) { + if (is_symbol(x)) { + return(invisible(NULL)) + } + if (allow_null && is_null(x)) { + return(invisible(NULL)) + } + } + + stop_input_type( + x, + "an argument name", + ..., + allow_na = FALSE, + allow_null = allow_null, + arg = arg, + call = call + ) +} + +check_call <- function( + x, + ..., + allow_null = FALSE, + arg = caller_arg(x), + call = caller_env() +) { + if (!missing(x)) { + if (is_call(x)) { + return(invisible(NULL)) + } + if (allow_null && is_null(x)) { + return(invisible(NULL)) + } + } + + stop_input_type( + x, + "a defused call", + ..., + allow_na = FALSE, + allow_null = allow_null, + arg = arg, + call = call + ) +} + +check_environment <- function( + x, + ..., + allow_null = FALSE, + arg = caller_arg(x), + call = caller_env() +) { + if (!missing(x)) { + if (is_environment(x)) { + return(invisible(NULL)) + } + if (allow_null && is_null(x)) { + return(invisible(NULL)) + } + } + + stop_input_type( + x, + "an environment", + ..., + allow_na = FALSE, + allow_null = allow_null, + arg = arg, + call = call + ) +} + +check_function <- function( + x, + ..., + allow_null = FALSE, + arg = caller_arg(x), + call = caller_env() +) { + if (!missing(x)) { + if (is_function(x)) { + 
return(invisible(NULL)) + } + if (allow_null && is_null(x)) { + return(invisible(NULL)) + } + } + + stop_input_type( + x, + "a function", + ..., + allow_na = FALSE, + allow_null = allow_null, + arg = arg, + call = call + ) +} + +check_closure <- function( + x, + ..., + allow_null = FALSE, + arg = caller_arg(x), + call = caller_env() +) { + if (!missing(x)) { + if (is_closure(x)) { + return(invisible(NULL)) + } + if (allow_null && is_null(x)) { + return(invisible(NULL)) + } + } + + stop_input_type( + x, + "an R function", + ..., + allow_na = FALSE, + allow_null = allow_null, + arg = arg, + call = call + ) +} + +check_formula <- function( + x, + ..., + allow_null = FALSE, + arg = caller_arg(x), + call = caller_env() +) { + if (!missing(x)) { + if (is_formula(x)) { + return(invisible(NULL)) + } + if (allow_null && is_null(x)) { + return(invisible(NULL)) + } + } + + stop_input_type( + x, + "a formula", + ..., + allow_na = FALSE, + allow_null = allow_null, + arg = arg, + call = call + ) +} + + +# Vectors ----------------------------------------------------------------- + +# TODO: Figure out what to do with logical `NA` and `allow_na = TRUE` + +check_character <- function( + x, + ..., + allow_na = TRUE, + allow_null = FALSE, + arg = caller_arg(x), + call = caller_env() +) { + if (!missing(x)) { + if (is_character(x)) { + if (!allow_na && any(is.na(x))) { + abort( + sprintf("`%s` can't contain NA values.", arg), + arg = arg, + call = call + ) + } + + return(invisible(NULL)) + } + + if (allow_null && is_null(x)) { + return(invisible(NULL)) + } + } + + stop_input_type( + x, + "a character vector", + ..., + allow_null = allow_null, + arg = arg, + call = call + ) +} + +check_logical <- function( + x, + ..., + allow_null = FALSE, + arg = caller_arg(x), + call = caller_env() +) { + if (!missing(x)) { + if (is_logical(x)) { + return(invisible(NULL)) + } + if (allow_null && is_null(x)) { + return(invisible(NULL)) + } + } + + stop_input_type( + x, + "a logical vector", + ..., + 
allow_na = FALSE, + allow_null = allow_null, + arg = arg, + call = call + ) +} + +check_data_frame <- function( + x, + ..., + allow_null = FALSE, + arg = caller_arg(x), + call = caller_env() +) { + if (!missing(x)) { + if (is.data.frame(x)) { + return(invisible(NULL)) + } + if (allow_null && is_null(x)) { + return(invisible(NULL)) + } + } + + stop_input_type( + x, + "a data frame", + ..., + allow_null = allow_null, + arg = arg, + call = call + ) +} + +# nocov end diff --git a/R/importance_perm.R b/R/importance_perm.R index 798afe7..df134f3 100644 --- a/R/importance_perm.R +++ b/R/importance_perm.R @@ -368,11 +368,19 @@ predictions <- function(wflow, new_data, type, eval_time) { wflow |> extract_fit_parsnip() |> augment(new_data = new_data, eval_time = eval_time) - use_post <- has_postprocessor(wflow) - if (use_post) { - post_proc <- extract_postprocessor(wflow) - preds <- predict(post_proc, preds) - } + check_post(wflow) + # For once tailor and tune 2.0 are rleeased + # if (FALSE) { + # post_proc <- extract_postprocessor(wflow) + # preds <- predict(post_proc, preds) + # } } preds } + +check_post <- function(x) { + if (!identical(names(x$post$actions), character(0))) { + cli::cli_abort("{.pkg important} does not currently support postprocessors.") + } + invisible(NULL) +} diff --git a/R/important-package.R b/R/important-package.R index a3a200e..3f465c9 100644 --- a/R/important-package.R +++ b/R/important-package.R @@ -2,8 +2,11 @@ "_PACKAGE" ## usethis namespace: start -#' @importFrom stats sd predict -#' @importFrom hardhat extract_fit_parsnip extract_postprocessor +#' @import recipes +#' @import rlang +#' @importFrom stats sd predict as.formula +#' @importFrom hardhat extract_fit_parsnip +#' @importFrom S7 check_is_S7 #' @importFrom ggplot2 autoplot #' @export @@ -27,14 +30,19 @@ utils::globalVariables( "permuted", "predictor", "ranking", - "std_err" + "std_err", + "score", + ".d_overall", + "outcome", + "score_objs", + "removed", + "terms" ) ) -## usethis 
namespace: end + +# enable usage of @name in package code +#' @rawNamespace if (getRversion() < "4.3.0") importFrom("S7", "@") NULL -## From workflows -# nocov start -has_postprocessor <- function(x) has_postprocessor_tailor(x) -has_postprocessor_tailor <- function(x) "tailor" %in% names(x$post$actions) -# nocov end +## usethis namespace: end +NULL diff --git a/R/recipe_utils.R b/R/recipe_utils.R new file mode 100644 index 0000000..4b5ff7c --- /dev/null +++ b/R/recipe_utils.R @@ -0,0 +1,100 @@ +# For an expression or quosure, list the user-specificied score functions +extract_score_names <- function(x, call = rlang::caller_env()) { + if (rlang::is_quosure(x)) { + if (rlang::quo_is_missing(x)) { + cli::cli_abort("{.arg score} cannot be empty.", call = call) + } + x <- rlang::quo_get_expr(x) + } + res <- all.vars(x) + check_score_names(res, call) +} + +check_score_names <- function(x, call) { + x <- unique(x) + res <- grep("^score_", x, value = TRUE) + if (length(res) == 0) { + cli::cli_abort("No score objects were found in {.arg score}.", call = call) + } + res +} + +# TODO this should go into recipes +pull_outcome_column_name <- function(x) { + outcome_name <- x$variable[x$role == "outcome"] + num_outcomes <- length(outcome_name) + if (num_outcomes != 1) { + cli::cli_abort("One column should have a role of {.val outcome}.") + } + outcome_name +} + +check_weights <- function(object, weights) { + if (object@case_weights & !is.null(weights)) { + res <- weights + } else { + res <- NULL + } + res +} + +compute_score <- function(score, args, form, data, weights = NULL) { + score_obj <- find_score_object(score) + + # Process case weights + weights <- check_weights(score_obj, weights) + if (!is.null(weights)) { + args$case_weights <- weights + } + + cl <- rlang::call2( + "fit", + .ns = "generics", + object = quote(score_obj), + formula = quote(form), + data = quote(data) + ) + cl <- rlang::call_modify(cl, !!!args) + + res <- try(rlang::eval_tidy(cl), silent = TRUE) + + # 
if error return all NA + res +} + +# Temporary solution +find_score_object <- function(x) { + utils::getFromNamespace(x, "filtro") +} + +update_prop <- function(num_cols, prop) { + min_prop <- 1 / num_cols + if (prop < min_prop) { + prop <- min_prop + 2 * .Machine$double.eps + } + prop +} + +tidy_filtro_rec <- function(x, ...) { + if (is_trained(x)) { + if (is.null(x$results)) { + res <- tibble::tibble(terms = character(), id = character()) + } else { + res <- + x$results |> + dplyr::select(-outcome, terms = predictor) |> + dplyr::relocate(dplyr::any_of("removed"), .after = c(terms)) + } + } else { + term_names <- sel2char(x$terms) + res <- tibble::tibble(terms = term_names) + } + res$id <- x$id + res +} + +all_scores_missing <- function(x) { + scores <- dplyr::select(x, -outcome, -predictor) + all_missing <- purrr::map_lgl(scores, ~ all(is.na(.x))) + all(all_missing) +} diff --git a/R/step_predictor_best.R b/R/step_predictor_best.R new file mode 100644 index 0000000..718d8fe --- /dev/null +++ b/R/step_predictor_best.R @@ -0,0 +1,319 @@ +#' Supervised Feature Selection via Choosing the Top Predictors +#' +#' `step_predictor_best()` creates a *specification* of a recipe step that uses +#' a single scoring function to measure how much each predictor is related to +#' the outcome value. This step retains a proportion of the most important +#' predictors, and this proportion can be tuned. +#' +#' @inheritParams step_predictor_desirability +#' +#' @param score The name of a single score function from the \pkg{filtro} +#' package, such as `"imp_rf"` (for [filtro::score_imp_rf()]), etc. +#' See the Details and Examples sections below. This argument *should be named* +#' when used. +#' +#' @export +#' +#' @details +#' +#' ```{r child = "man/rmd/filtro-scores.Rmd"} +#' ``` +#' +#' Some important notes: +#' +#' - Scores that are p-values are automatically transformed by \pkg{filtro} to +#' be in the format `-log10(pvalue)` so that a p-value of 0.1 is converted to +#' 1.0. 
For these, use the `maximize()` goal. +#' +#' - Other scores are also transformed in the data. For example, the correlation +#' scores given to the recipe step are in absolute value format. See the +#' \pkg{filtro} documentation for each score. +#' +#' - You can use some in-line functions using base R functions. For example, +#' `maximize(max(score_cor_spearman))`. +#' +#' - If a predictor cannot be computed for all scores, it is given a "fallback +#' value" that will prevent it from being excluded for this reason. +#' +#' This step can potentially remove columns from the data set. This may cause +#' issues for subsequent steps in your recipe if the missing columns are +#' specifically referenced by name. To avoid this, see the advice in the _Tips +#' for saving recipes and filtering columns_ section of [recipes::selections]. +#' +#' +#' ## Ties +#' +#' Note that [dplyr::slice_max()] with the argument `with_ties = TRUE ` is used +#' to select predictors. If there are many ties in overall desirability, the +#' proportion selected can be larger than the value given to `prep_terms()`. +#' +#' ## Case Weights +#' +#' Case weights can be used by some scoring functions. To learn more, load the +#' \pkg{filtro} package and check the `case_weights` property of the score object +#' (see Examples below). For a recipe, use one of the tidymodels case weight +#' functions such as [hardhat::importance_weights()] or +#' [hardhat::frequency_weights], to assign the correct data type to the vector of case +#' weights. A recipe will then interpret that class to be a case weight (and no +#' other role). A full example is below. +#' +#' ## Tidy method +#' +#' For a trained recipe, the `tidy()` method will return a tibble with columns +#' `terms` (the predictor names), `id`, and columns for the estimated scores. +#' The score columns are the raw values, before being filled with "safe values" +#' or transformed. 
+#' +#' There is an additional local column called `removed` that notes whether the +#' predictor failed the filter and was removed after this step is executed. +#' +#' @return An updated version of `recipe` with the new step added to the +#' sequence of any existing operations. When you +#' [`tidy()`][recipes::tidy.recipe] this step, a tibble::tibble is returned +#' with columns `terms` and `id`: +#' +#' \describe{ +#' \item{terms}{character, the selectors or variables selected to be removed} +#' \item{id}{character, id of this step} +#' } +#' Once trained, additional columns are included (see Details section). +#' @examples +#' library(recipes) +#' +#' rec <- recipe(mpg ~ ., data = mtcars) |> +#' step_predictor_best( +#' all_predictors(), +#' score = "cor_spearman" +#' ) +#' +#' prepped <- prep(rec) +#' +#' bake(prepped, mtcars) +#' +#' tidy(prepped, 1) +step_predictor_best <- function( + recipe, + ..., + score, + role = NA, + trained = FALSE, + prop_terms = 0.5, + update_prop = TRUE, + results = NULL, + removals = NULL, + skip = FALSE, + id = rand_id("predictor_best") +) { + add_step( + recipe, + step_predictor_best_new( + terms = enquos(...), + score = score, + role = role, + trained = trained, + prop_terms = prop_terms, + update_prop = update_prop, + results = results, + removals = removals, + skip = skip, + id = id, + case_weights = NULL + ) + ) +} + +step_predictor_best_new <- + function( + terms, + score, + role, + trained, + prop_terms, + update_prop = update_prop, + results, + removals, + skip, + id, + case_weights + ) { + step( + subclass = "predictor_best", + terms = terms, + score = score, + role = role, + trained = trained, + prop_terms = prop_terms, + update_prop = update_prop, + results = results, + removals = removals, + skip = skip, + id = id, + case_weights = case_weights + ) + } + +#' @export +prep.step_predictor_best <- function(x, training, info = NULL, ...) 
{ + col_names <- recipes_eval_select(x$terms, training, info) + check_type(training[, col_names], types = c("double", "integer", "factor")) + + bottom <- ifelse(x$update_prop, .Machine$double.eps, 0.0) + check_number_decimal( + x$prop_terms, + min = bottom, + max = 1, + arg = "prop_terms" + ) + + if (x$update_prop) { + x$prop_terms <- update_prop(length(col_names), x$prop_terms) + } + + # First we check the _type_ of weight to see if it is used. Later, in + # `compute_score()`, we check to see if the score supports case weights. + wts <- get_case_weights(info, training) + were_weights_used <- are_weights_used(wts, unsupervised = FALSE) + if (isFALSE(were_weights_used)) { + wts <- NULL + } + + outcome_name <- pull_outcome_column_name(info) + + if (length(col_names) > 1) { + filter_res <- calculate_predictor_best( + score = x$score, + prop_terms = x$prop_terms, + outcome = outcome_name, + data = training[, c(outcome_name, col_names)], + weights = wts + ) + } else { + filter_res <- list( + tibble::tibble( + outcome = character(0), + predictor = character(0), + score = double(0), + removed = logical(0) + ), + removals = character(0) + ) + } + + step_predictor_best_new( + terms = x$terms, + score = x$score, + role = x$role, + trained = TRUE, + results = filter_res$raw, + prop_terms = x$prop_terms, + update_prop = x$update_prop, + removals = filter_res$removals, + skip = x$skip, + id = x$id, + case_weights = were_weights_used + ) +} + +calculate_predictor_best <- function( + score, + prop_terms, + outcome = character(0), + data, + weights +) { + score_function <- paste0("score_", score) + + fm <- stats::as.formula(paste(outcome, "~ .")) + + score_res <- compute_score( + score_function, + args = list(), + form = fm, + data = data, + weights = weights + ) + + # ------------------------------------------------------------------------------ + # Fill in missings + + # The current filtro::fill_safe_value() only applies to class_score, not df nor tibble. 
+ + score_df <- + score_res |> + filtro::fill_safe_value(return_results = TRUE, transform = TRUE) + + # ------------------------------------------------------------------------------ + # filter predictors + + if (score_res@direction == "maximize") { + keepers <- score_df |> + dplyr::slice_max(score, prop = prop_terms, with_ties = TRUE) + } else { + keepers <- score_df |> + dplyr::slice_min(score, prop = prop_terms, with_ties = TRUE) + } + keepers <- keepers |> dplyr::pull(predictor) + + removals <- setdiff(score_df$predictor, keepers) + + raw_res <- score_res@results |> dplyr::select(outcome, predictor, score) + raw_res$removed <- raw_res$predictor %in% removals + + list( + raw = raw_res, + removals = removals + ) +} + +#' @export +bake.step_predictor_best <- function(object, new_data, ...) { + new_data <- recipes_remove_cols(new_data, object) + new_data +} + +#' @export +print.step_predictor_best <- function( + x, + width = max(20, options()$width - 36), + ... +) { + if (identical(x$score, rlang::enexpr())) { + title <- cli::format_inline("Feature selection on") + } else { + scores <- unique(x$score) + title <- cli::format_inline("Feature selection via {.code {scores}} on") + } + + print_step( + x$removals, + x$terms, + x$trained, + title, + width, + case_weights = x$case_weights + ) + invisible(x) +} + +#' @usage NULL +#' @export +tidy.step_predictor_best <- tidy_filtro_rec + +#' @export +tunable.step_predictor_best <- function(x, ...) { + tibble::tibble( + name = "prop_terms", + call_info = list( + list(pkg = "dials", fun = "prop_terms") + ), + source = "recipe", + component = "step_predictor_best", + component_id = x$id + ) +} + +#' @rdname required_pkgs.important +#' @export +required_pkgs.step_predictor_best <- function(x, ...) 
{ + c("important", "filtro") +} diff --git a/R/step_predictor_desirability.R b/R/step_predictor_desirability.R new file mode 100644 index 0000000..f0c7ecc --- /dev/null +++ b/R/step_predictor_desirability.R @@ -0,0 +1,430 @@ +#' Supervised Multivariate Feature Selection via Desirability Functions +#' +#' `step_predictor_desirability()` creates a *specification* of a recipe step +#' that uses one or more "score" functions to measure how much each predictor +#' is related to the outcome value. These scores are combined into a composite +#' value using user-specified _desirability_ functions and a proportion of the +#' most desirable predictors are retained. +#' +#' @inheritParams recipes::step_center +#' @param score An object produced by [desirability2::desirability()] that uses +#' one or more score functions from the \pkg{filtro} package. See the Details +#' and Examples sections below. This argument *should be named* when used. +#' @param prop_terms The proportion of predictors that should be retained when +#' ordered by overall desirability. A value of [hardhat::tune()] can also be +#' used. +#' @param update_prop A logical: should `prop_terms` be updated so that at least +#' one predictor will be retained? +#' @param removals A character string that contains the names of predictors that +#' should be removed. These values are not determined until [recipes::prep()] +#' is called. +#' @param results A data frame of score and desirability values for each +#' predictor evaluated. These values are not determined until [recipes::prep()] +#' is called. +#' @export +#' +#' @details +#' +#' This recipe step can compute one or more scores and conduct a simultaneous +#' selection of the top predictors using _desirability functions_. These are +#' functions that, for some type of goal, translate the score's values to a +#' scale of `[0, 1]`, where 1.0 is the best result and 0.0 is unacceptable. 
+#' Once we have these for each score, the overall desirability is computed
+#' using the geometric mean of the individual desirabilities. See the examples
+#' in [desirability2::d_overall()] and [desirability2::d_max()].
+#'
+#' To define desirabilities, use [desirability2::desirability()] function to
+#' define _goals_ for each score and pass that to the recipe in the `score`
+#' argument.
+#'
+#' ```{r child = "man/rmd/filtro-scores.Rmd"}
+#' ```
+#' Some important notes:
+#'
+#' - Scores that are p-values are automatically transformed by \pkg{filtro} to
+#'   be in the format `-log10(pvalue)` so that a p-value of 0.1 is converted to
+#'   1.0. For these, use the `maximize()` goal.
+#'
+#' - Other scores are also transformed in the data. For example, the correlation
+#'   scores given to the recipe step are in absolute value format. See the
+#'   \pkg{filtro} documentation for each score.
+#'
+#' - You can use some in-line functions using base R functions. For example,
+#'   `maximize(max(cor_spearman))`.
+#'
+#' - If a predictor cannot be computed for all scores, it is given a "fallback
+#'   value" that will prevent it from being excluded for this reason.
+#'
+#' This step can potentially remove columns from the data set. This may cause
+#' issues for subsequent steps in your recipe if the missing columns are
+#' specifically referenced by name. To avoid this, see the advice in the _Tips
+#' for saving recipes and filtering columns_ section of [recipes::selections].
+#'
+#' ## Ties
+#'
+#' Note that [dplyr::slice_max()] with the argument `with_ties = TRUE` is used
+#' to select predictors. If there are many ties in overall desirability, the
+#' proportion selected can be larger than the value given to `prop_terms`.
+#'
+#' ## Case Weights
+#'
+#' Case weights can be used by some scoring functions. To learn more, load the
+#' \pkg{filtro} package and check the `case_weights` property of the score object
+#' (see Examples below). 
For a recipe, use one of the tidymodels case weight +#' functions such as [hardhat::importance_weights()] or +#' [hardhat::frequency_weights], to assign the correct data type to the vector of case +#' weights. A recipe will then interpret that class to be a case weight (and no +#' other role). A full example is below. +#' +#' ## Tidy method +#' +#' For a trained recipe, the `tidy()` method will return a tibble with columns +#' `terms` (the predictor names), `id`, columns for the estimated scores, and +#' the desirability results. The score columns are the raw values, before being +#' filled with "safe values" or transformed. +#' +#' The desirability columns will have the same name as the scores with an +#' additional prefix of `.d_`. The overall desirability column is called +#' `.d_overall`. +#' +#' There is an additional local column called `removed` that notes whether the +#' predictor failed the filter and was removed after this step is executed. +#' +#' @return An updated version of `recipe` with the new step added to the +#' sequence of any existing operations. When you +#' [`tidy()`][recipes::tidy.recipe] this step, a tibble::tibble is returned +#' with columns `terms` and `id`: +#' +#' \describe{ +#' \item{terms}{character, the selectors or variables selected to be removed} +#' \item{id}{character, id of this step} +#' } +#' Once trained, additional columns are included (see Details section). +#' +#' @seealso [desirability2::desirability()] +#' @references Derringer, G. and Suich, R. (1980), Simultaneous Optimization of +#' Several Response Variables. _Journal of Quality Technology_, 12, 214-219. +#' +#' [https://desirability2.tidymodels.org/reference/inline_desirability.html](https://desirability2.tidymodels.org/reference/inline_desirability.html) +#' @examples +#' library(recipes) +#' library(desirability2) +#' +#' if (rlang::is_installed("modeldata")) { +#' # The `ad_data` has a binary outcome column ("Class") and mostly numeric +#' # predictors. 
For these, we score the predictors using an analysis of +#' # variance model where the predicor is the outcome and the outcome class +#' # defines the groups. +#' # There is also a single factor predictor ("Genotype") and we'll use +#' # Fisher's Exact test to score it. NOTE that for scores using hypothesis +#' # tests, the -log10(pvalue) is returned so that larger values are more +#' # important. +#' +#' # The score_* objects here are from the filtro package. See Details above. +#' goals <- +#' desirability( +#' maximize(xtab_pval_fisher), +#' maximize(aov_pval) +#' ) +#' +#' example_data <- modeldata::ad_data +#' rec <- +#' recipe(Class ~ ., data = example_data) |> +#' step_predictor_desirability( +#' all_predictors(), +#' score = goals, +#' prop_terms = 1 / 2 +#' ) +#' rec +#' +#' # Now evaluate the predictors and rank them via desirability: +#' prepped <- prep(rec) +#' prepped +#' +#' # Use the tidy() method to get the results: +#' predictor_scores <- tidy(prepped, number = 1) +#' mean(predictor_scores$removed) +#' predictor_scores +#' +#' # -------------------------------------------------------------------------- +#' +#' # Case-weight example: use the hardhat package to create the appropriate type +#' # of case weights. Here, we'll increase the weights for the minority class and +#' # add them to the data frame. +#' +#' library(hardhat) +#' +#' example_weights <- example_data +#' weights <- ifelse(example_data$Class == "Impaired", 5, 1) +#' example_weights$weights <- importance_weights(weights) +#' +#' # To see if the scores can use case weights, load the filtro package and +#' # check the `case_weights` property: +#' +#' library(filtro) +#' +#' score_xtab_pval_fisher@case_weights +#' score_aov_pval@case_weights +#' +#' # The recipe will automatically find the case weights and will +#' # not treat them as predictors. 
+#' rec_wts <- +#' recipe(Class ~ ., data = example_weights) |> +#' step_predictor_desirability( +#' all_predictors(), +#' score = goals, +#' prop_terms = 1 / 2 +#' ) |> +#' prep() +#' rec_wts +#' +#' predictor_scores_wts <- +#' tidy(rec_wts, number = 1) |> +#' select(terms, .d_overall_weighted = .d_overall) +#' +#' library(dplyr) +#' library(ggplot2) +#' +#' # The selection did not substantially change with these case weights +#' full_join(predictor_scores, predictor_scores_wts, by = "terms") |> +#' ggplot(aes(.d_overall, .d_overall_weighted)) + +#' geom_abline(col = "darkgreen", lty = 2) + +#' geom_point(alpha = 1 / 2) + +#' coord_fixed(ratio = 1) + +#' labs(x = "Unweighted", y = "Class Weighted") +#' } +#' +step_predictor_desirability <- function( + recipe, + ..., + score, + role = NA, + trained = FALSE, + prop_terms = 0.5, + update_prop = TRUE, + results = NULL, + removals = NULL, + skip = FALSE, + id = rand_id("predictor_desirability") +) { + S7::check_is_S7(score) + if (getRversion() >= "4.3.0") { + if (!inherits(score, "desirability2::desirability_set")) { + cli::cli_abort( + "Please use the {.fn desirability} function in the {.pkg desirability2} + package to create an object to pass to {.arg score}." 
+      )
+    }
+  }
+
+  add_step(
+    recipe,
+    step_predictor_desirability_new(
+      terms = enquos(...),
+      score = score,
+      role = role,
+      trained = trained,
+      prop_terms = prop_terms,
+      update_prop = update_prop,
+      results = results,
+      removals = removals,
+      skip = skip,
+      id = id,
+      case_weights = NULL
+    )
+  )
+}
+
+step_predictor_desirability_new <-
+  function(
+    terms,
+    score,
+    role,
+    trained,
+    prop_terms,
+    update_prop,
+    results,
+    removals,
+    skip,
+    id,
+    case_weights
+  ) {
+    step(
+      subclass = "predictor_desirability",
+      terms = terms,
+      score = score,
+      role = role,
+      trained = trained,
+      results = results,
+      prop_terms = prop_terms,
+      update_prop = update_prop,
+      removals = removals,
+      skip = skip,
+      id = id,
+      case_weights = case_weights
+    )
+  }
+
+#' @export
+prep.step_predictor_desirability <- function(x, training, info = NULL, ...) {
+  rlang::check_installed("desirability2")
+  col_names <- recipes_eval_select(x$terms, training, info)
+  check_type(training[, col_names], types = c("double", "integer", "factor"))
+
+  bottom <- ifelse(x$update_prop, .Machine$double.eps, 0.0)
+  check_number_decimal(
+    x$prop_terms,
+    min = bottom,
+    max = 1,
+    arg = "prop_terms"
+  )
+
+  if (length(col_names) < 2) {
+    res <-
+      step_predictor_desirability_new(
+        terms = x$terms,
+        score = x$score,
+        role = x$role,
+        trained = TRUE,
+        prop_terms = x$prop_terms,
+        update_prop = x$update_prop,
+        results = NULL,
+        removals = character(0),
+        skip = x$skip,
+        id = x$id,
+        case_weights = FALSE
+      )
+    return(res)
+  }
+
+  if (x$update_prop) {
+    x$prop_terms <- update_prop(length(col_names), x$prop_terms)
+  }
+
+  # First we check the _type_ of weight to see if it is used. Later, in
+  # `compute_score()`, we check to see if the score supports case weights.
+ wts <- get_case_weights(info, training) + were_weights_used <- are_weights_used(wts, unsupervised = FALSE) + if (isFALSE(were_weights_used)) { + wts <- NULL + } + + var_names <- unique(unlist(x$score@variables)) + score_names <- paste0("score_", var_names) + score_names <- check_score_names(score_names) + outcome_name <- pull_outcome_column_name(info) + fm <- paste(outcome_name, "~ .") + fm <- stats::as.formula(fm) + + score_objs <- + purrr::map( + score_names, + ~ compute_score(.x, list(), fm, training[c(outcome_name, col_names)], wts) + ) + raw_scores <- filtro::bind_scores(score_objs) + + score_df <- + score_objs |> + filtro::fill_safe_values(transform = TRUE) + + if (all_scores_missing(raw_scores)) { + cli::cli_warn("All score computations failed; skipping feature selection.") + keep_list <- score_df + } else { + # make desirability expression/eval quosure + score_df <- desirability2::make_desirability_cols(x$score, score_df) + + bad_news <- purrr::map_lgl(score_df$.d_overall, ~ identical(.x, 0.0)) + if (all(bad_news)) { + keep_list <- score_df[0, ] + } else { + keep_list <- + score_df |> + dplyr::slice_max(.d_overall, prop = x$prop_terms, with_ties = TRUE) + } + } + + rm_list <- + dplyr::anti_join(score_df, keep_list[, "predictor"], by = "predictor") |> + dplyr::pull(predictor) + + score_df$removed <- score_df$predictor %in% rm_list + + score_df <- score_df |> + dplyr::select(outcome, predictor, removed, dplyr::starts_with(".d_")) |> + dplyr::full_join(raw_scores, by = c("outcome", "predictor")) |> + dplyr::relocate(removed, .after = "predictor") |> + dplyr::relocate(dplyr::starts_with(".d_"), .after = dplyr::everything()) + + step_predictor_desirability_new( + terms = x$terms, + score = x$score, + role = x$role, + trained = TRUE, + results = score_df, + prop_terms = x$prop_terms, + update_prop = x$update_prop, + removals = rm_list, + skip = x$skip, + id = x$id, + case_weights = were_weights_used + ) +} + +#' @export +bake.step_predictor_desirability <- 
function(object, new_data, ...) { + new_data <- recipes_remove_cols(new_data, object) + new_data +} + +#' @export +print.step_predictor_desirability <- function( + x, + width = max(20, options()$width - 36), + ... +) { + scores <- unique(x$score@variables) + + title <- cli::format_inline( + "Feature selection via desirability functions ({.code {scores}}) on" + ) + print_step( + x$removals, + x$terms, + x$trained, + title, + width, + case_weights = x$case_weights + ) + invisible(x) +} + +#' @usage NULL +#' @export +tidy.step_predictor_desirability <- tidy_filtro_rec + +#' @export +tunable.step_predictor_desirability <- function(x, ...) { + tibble::tibble( + name = "prop_terms", + call_info = list( + list(pkg = "dials", fun = "prop_terms") + ), + source = "recipe", + component = "step_predictor_desirability", + component_id = x$id + ) +} + +#' S3 methods for tracking which additional packages are needed for steps. +#' +#' Recipe-adjacent packages always list themselves as a required package so that +#' the steps can function properly within parallel processing schemes. +#' @param x A recipe step +#' @return A character vector +#' @name required_pkgs.important +#' @keywords internal +#' @export +required_pkgs.step_predictor_desirability <- function(x, ...) { + c("important", "filtro", "desirability2") +} diff --git a/R/step_predictor_retain.R b/R/step_predictor_retain.R new file mode 100644 index 0000000..878b832 --- /dev/null +++ b/R/step_predictor_retain.R @@ -0,0 +1,299 @@ +#' Supervised Feature Selection via A Single Filter +#' +#' `step_predictor_retain()` creates a *specification* of a recipe step that +#' uses a logical statement that includes one or more scoring functions to +#' measure how much each predictor is related to the outcome value. This step +#' retains the predictors that pass the logical statement. +#' +#' @inheritParams step_predictor_desirability +#' @param score A valid R expression that produces a logical result. 
The +#' equation can contain the names of one or more score functions from the +#' \pkg{filtro} package, such as [filtro::score_imp_rf()], +#' [filtro:: score_roc_auc()]. See the Details and Examples sections below. +#' This argument *should be named* when used. +#' +#' @export +#' +#' @details +#' +#' The `score` should be valid R syntax that produces a logical result and +#' should not use external data. The list of variables that can be used is in +#' the section below. +#' +#' ```{r child = "man/rmd/filtro-scores.Rmd"} +#' ``` +#' +#' Some important notes: +#' +#' - Scores that are p-values are automatically transformed by \pkg{filtro} to +#' be in the format `-log10(pvalue)` so that a p-value of 0.1 is converted to +#' 1.0. For these, use the `maximize()` goal. +#' +#' - Other scores are also transformed in the data. For example, the correlation +#' scores given to the recipe step are in absolute value format. See the +#' \pkg{filtro} documentation for each score. +#' +#' - You can use some in-line functions using base R functions. For example, +#' `maximize(max(score_cor_spearman))`. +#' +#' - If a predictor cannot be computed for all scores, it is given a "fallback +#' value" that will prevent it from being excluded for this reason. +#' +#' This step can potentially remove columns from the data set. This may cause +#' issues for subsequent steps in your recipe if the missing columns are +#' specifically referenced by name. To avoid this, see the advice in the _Tips +#' for saving recipes and filtering columns_ section of [recipes::selections]. +#' +#' ## Case Weights +#' +#' Case weights can be used by some scoring functions. To learn more, load the +#' \pkg{filtro} package and check the `case_weights` property of the score object +#' (see Examples below). 
For a recipe, use one of the tidymodels case weight +#' functions such as [hardhat::importance_weights()] or +#' [hardhat::frequency_weights], to assign the correct data type to the vector of case +#' weights. A recipe will then interpret that class to be a case weight (and no +#' other role). A full example is below. +#' +#' ## Tidy method +#' +#' For a trained recipe, the `tidy()` method will return a tibble with columns +#' `terms` (the predictor names), `id`, and columns for the estimated scores. +#' The score columns are the raw values, before being filled with "safe values" +#' or transformed. +#' +#' There is an additional local column called `removed` that notes whether the +#' predictor failed the filter and was removed after this step is executed. +#' +#' @return An updated version of `recipe` with the new step added to the +#' sequence of any existing operations. When you +#' [`tidy()`][recipes::tidy.recipe] this step, a tibble::tibble is returned +#' with columns `terms` and `id`: +#' +#' \describe{ +#' \item{terms}{character, the selectors or variables selected to be removed} +#' \item{id}{character, id of this step} +#' } +#' Once trained, additional columns are included (see Details section). 
+#' +#' @examples +#' library(recipes) +#' +#' rec <- recipe(mpg ~ ., data = mtcars) |> +#' step_predictor_retain( +#' all_predictors(), +#' score = cor_pearson >= 0.75 | cor_spearman >= 0.75 +#' ) +#' +#' prepped <- prep(rec) +#' +#' bake(prepped, mtcars) +#' +#' tidy(prepped, 1) +step_predictor_retain <- function( + recipe, + ..., + score, + role = NA, + trained = FALSE, + results = NULL, + removals = NULL, + skip = FALSE, + id = rand_id("predictor_retain") +) { + add_step( + recipe, + step_predictor_retain_new( + terms = enquos(...), + role = role, + trained = trained, + score = rlang::enexpr(score), + results = results, + removals = removals, + skip = skip, + id = id, + case_weights = NULL + ) + ) +} + +step_predictor_retain_new <- + function( + terms, + role, + trained, + score, + results, + removals, + skip, + id, + case_weights + ) { + step( + subclass = "predictor_retain", + terms = terms, + role = role, + trained = trained, + score = score, + results = results, + removals = removals, + skip = skip, + id = id, + case_weights = case_weights + ) + } + +#' @export +prep.step_predictor_retain <- function(x, training, info = NULL, ...) { + col_names <- recipes_eval_select(x$terms, training, info) + check_type(training[, col_names], types = c("double", "integer")) + + # First we check the _type_ of weight to see if it is used. Later, in + # `compute_score()`, we check to see if the score supports case weights. 
+ wts <- get_case_weights(info, training) + were_weights_used <- are_weights_used(wts, unsupervised = FALSE) + if (isFALSE(were_weights_used)) { + wts <- NULL + } + + outcome_name <- pull_outcome_column_name(info) + + if (length(col_names) > 1) { + filter_res <- calculate_predictor_retain( + xpr = x$score, + outcome = outcome_name, + data = training[, c(outcome_name, col_names)], + weights = wts + ) + } else { + filter_res <- list( + raw = tibble::tibble( + outcome = character(0), + predictor = character(0), + removed = logical(0) + ), + removals = character(0) + ) + } + + step_predictor_retain_new( + terms = x$terms, + role = x$role, + trained = TRUE, + score = x$score, + results = filter_res$raw, + removals = filter_res$removals, + skip = x$skip, + id = x$id, + case_weights = were_weights_used + ) +} + +# TODO: how to pass opts and or tune objects? Internal score set? + +# Assumes `data` has all predictors and the outcome columns (only) +calculate_predictor_retain <- function( + xpr, + outcome = character(0), + data, + weights, + opts = list() +) { + all_scores <- unique(all.vars(xpr)) + all_score_functions <- paste0("score_", all_scores) + + # ------------------------------------------------------------------------------ + # Find all known class_{score} and check against list + + # ------------------------------------------------------------------------------ + # Process any options + + opts <- make_opt_list(opts, all_scores) + + # ------------------------------------------------------------------------------ + + fm <- as.formula(paste(outcome, "~ .")) + + # ------------------------------------------------------------------------------ + + # Get list of args for each scoring method and use map2() + score_res <- purrr::map2( + all_score_functions, + opts, + compute_score, + form = fm, + data = data, + weights = weights + ) + names(score_res) <- all_scores + + # ------------------------------------------------------------------------------ + # Fill in missings + + 
score_df <- # save for tidy method
+    score_res |>
+    filtro::fill_safe_values(transform = TRUE)
+
+  # ------------------------------------------------------------------------------
+  # filter predictors
+
+  keepers <- score_df |> dplyr::filter(!!xpr) |> dplyr::pull(predictor)
+  removals <- setdiff(score_df$predictor, keepers)
+
+  raw_res <- filtro::bind_scores(score_res)
+  raw_res$removed <- raw_res$predictor %in% removals
+
+  list(
+    raw = raw_res,
+    removals = removals
+  )
+}
+
+make_opt_list <- function(opts, scores) {
+  res <- purrr::map(scores, ~ list())
+  names(res) <- scores
+  score_opts <- intersect(scores, names(opts))
+  for (i in score_opts) {
+    res[[i]] <- opts[[i]]
+  }
+  res
+}
+
+#' @export
+bake.step_predictor_retain <- function(object, new_data, ...) {
+  new_data <- recipes_remove_cols(new_data, object)
+  new_data
+}
+
+#' @export
+print.step_predictor_retain <- function(
+  x,
+  width = max(20, options()$width - 36),
+  ...
+) {
+  scores <- unique(all.vars(x$score))
+
+  word <- ifelse(x$trained, "removing", "for")
+
+  title <- cli::format_inline(
+    "Feature selection using {.and {.code {scores}}} {word}"
+  )
+  print_step(
+    x$removals,
+    x$terms,
+    x$trained,
+    title,
+    width,
+    case_weights = x$case_weights
+  )
+  invisible(x)
+}
+
+#' @usage NULL
+#' @export
+tidy.step_predictor_retain <- tidy_filtro_rec
+
+#' @rdname required_pkgs.important
+#' @export
+required_pkgs.step_predictor_retain <- function(x, ...) {
+  c("important", "filtro")
+}
diff --git a/README.Rmd b/README.Rmd
index 268f6d7..670a7ea 100644
--- a/README.Rmd
+++ b/README.Rmd
@@ -26,7 +26,14 @@ The important package has a succinct interface for obtaining estimates of predic
 - Importance can be calculated for either the original columns or at the level of any derived model terms created during feature engineering.
 - The computations that loop across permutation iterations and predictors columns are easily parallelized.
 - The results are returned in a tidy format.
-
+
+There are also recipe steps for supervised feature selection:
+
+- `step_predictor_retain()` can filter the predictors using a single conditional statement (e.g., absolute correlation with the outcome > 0.75, etc).
+- `step_predictor_best()` can retain the most important predictors for the outcome using a single scoring function.
+- `step_predictor_desirability()` retains the most important predictors for the outcome using multiple scoring functions, blended using desirability functions.
+
+The latter two steps can be tuned over the proportion of predictors to be retained.
 
 ## Installation
 
@@ -42,7 +49,7 @@ pak::pak("tidymodels/important")
 
 The main reason for making important is censored regression models. tidymodels released tools for fitting and qualifying models that have censored outcomes. This included some dynamic performance metrics that were evaluated at different time points. This was a substantial change for us, and it would have been even more challenging to add to other packages.
 
-## Example
+## Variable importance Example
 
 Let's look at an analysis that models [food delivery times](https://aml4td.org/chapters/whole-game.html#sec-delivery-times). The outcome is the time between an order being placed and the delivery (all data are complete - there is no censoring). We model this in terms of the order day/time, the distance to the restaurant, and which items are contained in the order. Exploratory data analysis shows several nonlinear trends in the data and some interactions between these trends.
 
@@ -152,6 +159,49 @@ lm_orig_imp |>
 autoplot(lm_orig_imp)
 ```
 
+## Supervised Feature Selection Example
+
+Using the same dataset, let's illustrate the most common tool for filtering predictors: using random forest importance scores.
+
+important can use any of the "scoring functions" from the [filtro](https://filtro.tidymodels.org/) package. 
You can supply one, and the proportion of the predictors to retain: + +```{r} +#| label: select-top +set.seed(491) +selection_rec <- + recipe(time_to_delivery ~ ., data = delivery_train) |> + step_predictor_best(all_predictors(), score = "imp_rf", prop_terms = 1/4) |> + step_dummy(all_factor_predictors()) |> + step_zv(all_predictors()) |> + step_spline_natural(any_of(c("hour", "distance")), deg_free = 10) |> + step_interact(~ starts_with("hour_"):starts_with("day_")) |> + prep() +selection_rec +``` + +A list of possible scores is contained in the help page for the recipe steps. + +Note that we changed selectors in `step_spline_natural()` to use `any_of()` instead of specific names. Any step downstream of any filtering steps should be generalized so that there is no failure if the columns were removed. Using `any_of()` selects these two columns _if they still remain in the data_. + +Which were removed? + +```{r} +#| label: tidy-filter +selection_res <- + tidy(selection_rec, number = 1) |> + arrange(desc(score)) + +selection_res + +mean(selection_res$removed) +``` + +This example shows the basic usage of the recipe. In practice, we would probably do things differently: + + - This step would be included in a workflow so that it is coupled to a model. + - It would be a good idea to optimize how much selection is done by setting `prop_terms = tune()` in the step and using one of the tuning functions to find a good proportion. + +*Inappropriate* use of these selection steps occurs when it is used before the data are split or outside of a resampling step. ## Code of Conduct diff --git a/README.md b/README.md index 18f7948..b132bbf 100644 --- a/README.md +++ b/README.md @@ -21,6 +21,20 @@ features: predictors columns are easily parallelized. - The results are returned in a tidy format. 
+There are also recipe steps for supervised feature selection: + +- `step_predictors_retain()` can filter the predictors using a single + conditional statement (e.g., absolute correlation with the outcome \> + 0.75, etc). +- `step_predictors_best()` can retain the most important predictors for + the outcome using a single scoring function. +- `step_predictors_desirability()` retains the most important predictors + for the outcome using multiple scoring functions, blended using + desirability functions. + +The latter two steps can be tuned over the proportion of predictors to +be retained. + ## Installation You can install the development version of important from @@ -41,7 +55,7 @@ were evaluated at different time points. This was a substantial change for us, and it would have been even more challenging to add to other packages. -## Example +## Variable importance Example Let’s look at an analysis that models [food delivery times](https://aml4td.org/chapters/whole-game.html#sec-delivery-times). @@ -183,6 +197,93 @@ autoplot(lm_orig_imp) +## Supervised Feature Selection Example + +Using the same dataset, let’s illustrate the most common tool for +filtering predictors: using random forest importance scores. + +important can use any of the “scoring functions” from the +[filtro](https://filtro.tidymodels.org/) package. 
You can supply one, +and the proportion of the predictors to retain: + +``` r +set.seed(491) +selection_rec <- + recipe(time_to_delivery ~ ., data = delivery_train) |> + step_predictor_best(all_predictors(), score = "imp_rf", prop_terms = 1/4) |> + step_dummy(all_factor_predictors()) |> + step_zv(all_predictors()) |> + step_spline_natural(any_of(c("hour", "distance")), deg_free = 10) |> + step_interact(~ starts_with("hour_"):starts_with("day_")) |> + prep() +selection_rec +#> +#> ── Recipe ────────────────────────────────────────────────────────────────────── +#> +#> ── Inputs +#> Number of variables by role +#> outcome: 1 +#> predictor: 30 +#> +#> ── Training information +#> Training data contained 6004 data points and no incomplete rows. +#> +#> ── Operations +#> • Feature selection via `imp_rf` on: item_03 item_04, ... | Trained +#> • Dummy variables from: day | Trained +#> • Zero variance filter removed: | Trained +#> • Natural spline expansion: hour distance | Trained +#> • Interactions with: hour_01:day_Tue hour_01:day_Wed, ... | Trained +``` + +A list of possible scores is contained in the help page for the recipe +steps. + +Note that we changed selectors in `step_spline_natural()` to use +`any_of()` instead of specific names. Any step downstream of any +filtering steps should be generalized so that there is no failure if the +columns were removed. Using `any_of()` selects these two columns *if +they still remain in the data*. + +Which were removed? 
+ +``` r +selection_res <- + tidy(selection_rec, number = 1) |> + arrange(desc(score)) + +selection_res +#> # A tibble: 30 × 4 +#> terms removed score id +#> +#> 1 hour FALSE 48.5 predictor_best_rCIMa +#> 2 day FALSE 13.5 predictor_best_rCIMa +#> 3 distance FALSE 13.3 predictor_best_rCIMa +#> 4 item_10 FALSE 1.18 predictor_best_rCIMa +#> 5 item_01 FALSE 1.01 predictor_best_rCIMa +#> 6 item_24 FALSE 0.160 predictor_best_rCIMa +#> 7 item_02 FALSE 0.0676 predictor_best_rCIMa +#> 8 item_26 TRUE 0.0666 predictor_best_rCIMa +#> 9 item_03 TRUE 0.0593 predictor_best_rCIMa +#> 10 item_22 TRUE 0.0565 predictor_best_rCIMa +#> # ℹ 20 more rows + +mean(selection_res$removed) +#> [1] 0.7666667 +``` + +This example shows the basic usage of the recipe. In practice, we would +probably do things differently: + +- This step would be included in a workflow so that it is coupled to a + model. +- It would be a good idea to optimize how much selection is done by + setting `prop_terms = tune()` in the step and using one of the tuning + functions to find a good proportion. + +*Inappropriate* use of these selection steps occurs when it is used +before the data are split or outside of a resampling step. 
+ ## Code of Conduct Please note that the important project is released with a [Contributor diff --git a/important.Rproj b/important.Rproj index e65709f..792a875 100644 --- a/important.Rproj +++ b/important.Rproj @@ -19,3 +19,4 @@ StripTrailingWhitespace: Yes BuildType: Package PackageUseDevtools: Yes PackageInstallArgs: --no-multiarch --with-keep.source +PackageRoxygenize: rd,collate,namespace diff --git a/inst/WORDLIST b/inst/WORDLIST index 8e746a7..8fc125b 100644 --- a/inst/WORDLIST +++ b/inst/WORDLIST @@ -4,7 +4,10 @@ CMD Codecov ORCID PBC +Suich +desirabilities doi +filtro funder importances mirai diff --git a/man/required_pkgs.important.Rd b/man/required_pkgs.important.Rd new file mode 100644 index 0000000..cf62557 --- /dev/null +++ b/man/required_pkgs.important.Rd @@ -0,0 +1,26 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/step_predictor_best.R, +% R/step_predictor_desirability.R, R/step_predictor_retain.R +\name{required_pkgs.step_predictor_best} +\alias{required_pkgs.step_predictor_best} +\alias{required_pkgs.important} +\alias{required_pkgs.step_predictor_desirability} +\title{S3 methods for tracking which additional packages are needed for steps.} +\usage{ +\method{required_pkgs}{step_predictor_best}(x, ...) + +\method{required_pkgs}{step_predictor_desirability}(x, ...) + +\method{required_pkgs}{step_predictor_desirability}(x, ...) +} +\arguments{ +\item{x}{A recipe step} +} +\value{ +A character vector +} +\description{ +Recipe-adjacent packages always list themselves as a required package so that +the steps can function properly within parallel processing schemes. 
+} +\keyword{internal} diff --git a/man/rmd/filtro-scores.Rmd b/man/rmd/filtro-scores.Rmd new file mode 100644 index 0000000..48c76fb --- /dev/null +++ b/man/rmd/filtro-scores.Rmd @@ -0,0 +1,34 @@ +```{r} +#| include: false +library(filtro) +scores <- ls(pattern = "^score_", envir = asNamespace("filtro")) +scores <- sort(scores) +no_prefix <- gsub("^score_", "", scores) + +f_ver <- packageDescription("filtro")$Version +``` + +## Scoring Functions + +As of version `r f_ver` of the \pkg{filtro} package, the following score functions are available: + +\itemize{ + +```{r} +#| echo: FALSE +#| results: asis + +cat( + paste0( + " \\item \\code{", + no_prefix, + "} (\\code{\\link[filtro:score_", + no_prefix, + "]{documentation}})", + collapse = "\n" +) +) +cat("\n\n") +``` + +} diff --git a/man/step_predictor_best.Rd b/man/step_predictor_best.Rd new file mode 100644 index 0000000..1620e59 --- /dev/null +++ b/man/step_predictor_best.Rd @@ -0,0 +1,165 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/step_predictor_best.R +\name{step_predictor_best} +\alias{step_predictor_best} +\title{Supervised Feature Selection via Choosing the Top Predictors} +\usage{ +step_predictor_best( + recipe, + ..., + score, + role = NA, + trained = FALSE, + prop_terms = 0.5, + update_prop = TRUE, + results = NULL, + removals = NULL, + skip = FALSE, + id = rand_id("predictor_best") +) +} +\arguments{ +\item{recipe}{A recipe object. The step will be added to the sequence of +operations for this recipe.} + +\item{...}{One or more selector functions to choose variables for this step. +See \code{\link[recipes:selections]{selections()}} for more details.} + +\item{score}{The name of a single score function from the \pkg{filtro} +package, such as \code{"imp_rf"} (for \code{\link[filtro:score_imp_rf]{filtro::score_imp_rf()}}), etc. +See the Details and Examples sections below. 
This argument \emph{should be named} +when used.} + +\item{role}{Not used by this step since no new variables are created.} + +\item{trained}{A logical to indicate if the quantities for preprocessing have +been estimated.} + +\item{prop_terms}{The proportion of predictors that should be retained when +ordered by overall desirability. A value of \code{\link[hardhat:tune]{hardhat::tune()}} can also be +used.} + +\item{update_prop}{A logical: should \code{prop_terms} be updated so that at least +one predictor will be retained?} + +\item{results}{A data frame of score and desirability values for each +predictor evaluated. These values are not determined until \code{\link[recipes:prep]{recipes::prep()}} +is called.} + +\item{removals}{A character string that contains the names of predictors that +should be removed. These values are not determined until \code{\link[recipes:prep]{recipes::prep()}} +is called.} + +\item{skip}{A logical. Should the step be skipped when the recipe is baked by +\code{\link[recipes:bake]{bake()}}? While all operations are baked when \code{\link[recipes:prep]{prep()}} is run, some +operations may not be able to be conducted on new data (e.g. processing the +outcome variable(s)). Care should be taken when using \code{skip = TRUE} as it +may affect the computations for subsequent operations.} + +\item{id}{A character string that is unique to this step to identify it.} +} +\value{ +An updated version of \code{recipe} with the new step added to the +sequence of any existing operations. When you +\code{\link[recipes:tidy.recipe]{tidy()}} this step, a tibble::tibble is returned +with columns \code{terms} and \code{id}: + +\describe{ +\item{terms}{character, the selectors or variables selected to be removed} +\item{id}{character, id of this step} +} +Once trained, additional columns are included (see Details section). 
+} +\description{ +\code{step_predictor_best()} creates a \emph{specification} of a recipe step that uses +a single scoring function to measure how much each predictor is related to +the outcome value. This step retains a proportion of the most important +predictors, and this proportion can be tuned. +} +\details{ +\subsection{Scoring Functions}{ + +As of version 0.2.0 of the \pkg{filtro} package, the following score functions are available: + +\itemize{ + +\item \code{aov_fstat} (\code{\link[filtro:score_aov_fstat]{documentation}}) +\item \code{aov_pval} (\code{\link[filtro:score_aov_pval]{documentation}}) +\item \code{cor_pearson} (\code{\link[filtro:score_cor_pearson]{documentation}}) +\item \code{cor_spearman} (\code{\link[filtro:score_cor_spearman]{documentation}}) +\item \code{gain_ratio} (\code{\link[filtro:score_gain_ratio]{documentation}}) +\item \code{imp_rf} (\code{\link[filtro:score_imp_rf]{documentation}}) +\item \code{imp_rf_conditional} (\code{\link[filtro:score_imp_rf_conditional]{documentation}}) +\item \code{imp_rf_oblique} (\code{\link[filtro:score_imp_rf_oblique]{documentation}}) +\item \code{info_gain} (\code{\link[filtro:score_info_gain]{documentation}}) +\item \code{roc_auc} (\code{\link[filtro:score_roc_auc]{documentation}}) +\item \code{sym_uncert} (\code{\link[filtro:score_sym_uncert]{documentation}}) +\item \code{xtab_pval_chisq} (\code{\link[filtro:score_xtab_pval_chisq]{documentation}}) +\item \code{xtab_pval_fisher} (\code{\link[filtro:score_xtab_pval_fisher]{documentation}}) + +} + +Some important notes: +\itemize{ +\item Scores that are p-values are automatically transformed by \pkg{filtro} to +be in the format \code{-log10(pvalue)} so that a p-value of 0.1 is converted to +1.0. For these, use the \code{maximize()} goal. +\item Other scores are also transformed in the data. For example, the correlation +scores given to the recipe step are in absolute value format. See the +\pkg{filtro} documentation for each score. 
+\item You can use some in-line functions using base R functions. For example,
+\code{maximize(max(score_cor_spearman))}.
+\item If a predictor cannot be computed for all scores, it is given a "fallback
+value" that will prevent it from being excluded for this reason.
+}
+
+This step can potentially remove columns from the data set. This may cause
+issues for subsequent steps in your recipe if the missing columns are
+specifically referenced by name. To avoid this, see the advice in the \emph{Tips
+for saving recipes and filtering columns} section of \link[recipes:selections]{recipes::selections}.
+}
+
+\subsection{Ties}{
+
+Note that \code{\link[dplyr:slice]{dplyr::slice_max()}} with the argument \code{with_ties = TRUE} is used
+to select predictors. If there are many ties in overall desirability, the
+proportion selected can be larger than the value given to \code{prop_terms}.
+}
+
+\subsection{Case Weights}{
+
+Case weights can be used by some scoring functions. To learn more, load the
+\pkg{filtro} package and check the \code{case_weights} property of the score object
+(see Examples below). For a recipe, use one of the tidymodels case weight
+functions such as \code{\link[hardhat:importance_weights]{hardhat::importance_weights()}} or
+\link[hardhat:frequency_weights]{hardhat::frequency_weights}, to assign the correct data type to the vector of case
+weights. A recipe will then interpret that class to be a case weight (and no
+other role). A full example is below.
+}
+
+\subsection{Tidy method}{
+
+For a trained recipe, the \code{tidy()} method will return a tibble with columns
+\code{terms} (the predictor names), \code{id}, and columns for the estimated scores.
+The score columns are the raw values, before being filled with "safe values"
+or transformed.
+
+There is an additional logical column called \code{removed} that notes whether the
+predictor failed the filter and was removed after this step is executed.
+} +} +\examples{ +library(recipes) + +rec <- recipe(mpg ~ ., data = mtcars) |> + step_predictor_best( + all_predictors(), + score = "cor_spearman" + ) + +prepped <- prep(rec) + +bake(prepped, mtcars) + +tidy(prepped, 1) +} diff --git a/man/step_predictor_desirability.Rd b/man/step_predictor_desirability.Rd new file mode 100644 index 0000000..7dc37cb --- /dev/null +++ b/man/step_predictor_desirability.Rd @@ -0,0 +1,262 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/step_predictor_desirability.R +\name{step_predictor_desirability} +\alias{step_predictor_desirability} +\title{Supervised Multivariate Feature Selection via Desirability Functions} +\usage{ +step_predictor_desirability( + recipe, + ..., + score, + role = NA, + trained = FALSE, + prop_terms = 0.5, + update_prop = TRUE, + results = NULL, + removals = NULL, + skip = FALSE, + id = rand_id("predictor_desirability") +) +} +\arguments{ +\item{recipe}{A recipe object. The step will be added to the sequence of +operations for this recipe.} + +\item{...}{One or more selector functions to choose variables for this step. +See \code{\link[recipes:selections]{selections()}} for more details.} + +\item{score}{An object produced by \code{\link[desirability2:desirability]{desirability2::desirability()}} that uses +one or more score functions from the \pkg{filtro} package. See the Details +and Examples sections below. This argument \emph{should be named} when used.} + +\item{role}{Not used by this step since no new variables are created.} + +\item{trained}{A logical to indicate if the quantities for preprocessing have +been estimated.} + +\item{prop_terms}{The proportion of predictors that should be retained when +ordered by overall desirability. 
A value of \code{\link[hardhat:tune]{hardhat::tune()}} can also be +used.} + +\item{update_prop}{A logical: should \code{prop_terms} be updated so that at least +one predictor will be retained?} + +\item{results}{A data frame of score and desirability values for each +predictor evaluated. These values are not determined until \code{\link[recipes:prep]{recipes::prep()}} +is called.} + +\item{removals}{A character string that contains the names of predictors that +should be removed. These values are not determined until \code{\link[recipes:prep]{recipes::prep()}} +is called.} + +\item{skip}{A logical. Should the step be skipped when the recipe is baked by +\code{\link[recipes:bake]{bake()}}? While all operations are baked when \code{\link[recipes:prep]{prep()}} is run, some +operations may not be able to be conducted on new data (e.g. processing the +outcome variable(s)). Care should be taken when using \code{skip = TRUE} as it +may affect the computations for subsequent operations.} + +\item{id}{A character string that is unique to this step to identify it.} +} +\value{ +An updated version of \code{recipe} with the new step added to the +sequence of any existing operations. When you +\code{\link[recipes:tidy.recipe]{tidy()}} this step, a tibble::tibble is returned +with columns \code{terms} and \code{id}: + +\describe{ +\item{terms}{character, the selectors or variables selected to be removed} +\item{id}{character, id of this step} +} +Once trained, additional columns are included (see Details section). +} +\description{ +\code{step_predictor_desirability()} creates a \emph{specification} of a recipe step +that uses one or more "score" functions to measure how much each predictor +is related to the outcome value. These scores are combined into a composite +value using user-specified \emph{desirability} functions and a proportion of the +most desirable predictors are retained. 
+} +\details{ +This recipe step can compute one or more scores and conduct a simultaneous +selection of the top predictors using \emph{desirability functions}. These are +functions that, for some type of goal, translate the score's values to a +scale of \verb{[0, 1]}, where 1.0 is the best result and 0.0 is unacceptable. +Once we have these for each score, the overall desirability is computed +using the geometric mean of the individual desirabilities. See the examples +in \code{\link[desirability2:d_overall]{desirability2::d_overall()}} and \code{\link[desirability2:inline_desirability]{desirability2::d_max()}}. + +To define desirabilities, use \code{\link[desirability2:desirability]{desirability2::desirability()}} function to +define \emph{goals} for each score and pass that to the recipe in the \code{score} +argument. +\subsection{Scoring Functions}{ + +As of version 0.2.0 of the \pkg{filtro} package, the following score functions are available: + +\itemize{ + +\item \code{aov_fstat} (\code{\link[filtro:score_aov_fstat]{documentation}}) +\item \code{aov_pval} (\code{\link[filtro:score_aov_pval]{documentation}}) +\item \code{cor_pearson} (\code{\link[filtro:score_cor_pearson]{documentation}}) +\item \code{cor_spearman} (\code{\link[filtro:score_cor_spearman]{documentation}}) +\item \code{gain_ratio} (\code{\link[filtro:score_gain_ratio]{documentation}}) +\item \code{imp_rf} (\code{\link[filtro:score_imp_rf]{documentation}}) +\item \code{imp_rf_conditional} (\code{\link[filtro:score_imp_rf_conditional]{documentation}}) +\item \code{imp_rf_oblique} (\code{\link[filtro:score_imp_rf_oblique]{documentation}}) +\item \code{info_gain} (\code{\link[filtro:score_info_gain]{documentation}}) +\item \code{roc_auc} (\code{\link[filtro:score_roc_auc]{documentation}}) +\item \code{sym_uncert} (\code{\link[filtro:score_sym_uncert]{documentation}}) +\item \code{xtab_pval_chisq} (\code{\link[filtro:score_xtab_pval_chisq]{documentation}}) +\item \code{xtab_pval_fisher} 
 (\code{\link[filtro:score_xtab_pval_fisher]{documentation}})
+
+}
+Some important notes:
+\itemize{
+\item Scores that are p-values are automatically transformed by \pkg{filtro} to
+be in the format \code{-log10(pvalue)} so that a p-value of 0.1 is converted to
+1.0. For these, use the \code{maximize()} goal.
+\item Other scores are also transformed in the data. For example, the correlation
+scores given to the recipe step are in absolute value format. See the
+\pkg{filtro} documentation for each score.
+\item You can use some in-line functions using base R functions. For example,
+\code{maximize(max(cor_spearman))}.
+\item If a predictor cannot be computed for all scores, it is given a "fallback
+value" that will prevent it from being excluded for this reason.
+}
+
+This step can potentially remove columns from the data set. This may cause
+issues for subsequent steps in your recipe if the missing columns are
+specifically referenced by name. To avoid this, see the advice in the \emph{Tips
+for saving recipes and filtering columns} section of \link[recipes:selections]{recipes::selections}.
+}
+
+\subsection{Ties}{
+
+Note that \code{\link[dplyr:slice]{dplyr::slice_max()}} with the argument \code{with_ties = TRUE} is used
+to select predictors. If there are many ties in overall desirability, the
+proportion selected can be larger than the value given to \code{prop_terms}.
+}
+
+\subsection{Case Weights}{
+
+Case weights can be used by some scoring functions. To learn more, load the
+\pkg{filtro} package and check the \code{case_weights} property of the score object
+(see Examples below). For a recipe, use one of the tidymodels case weight
+functions such as \code{\link[hardhat:importance_weights]{hardhat::importance_weights()}} or
+\link[hardhat:frequency_weights]{hardhat::frequency_weights}, to assign the correct data type to the vector of case
+weights. A recipe will then interpret that class to be a case weight (and no
+other role). A full example is below.
+}
+
+\subsection{Tidy method}{
+
+For a trained recipe, the \code{tidy()} method will return a tibble with columns
+\code{terms} (the predictor names), \code{id}, columns for the estimated scores, and
+the desirability results. The score columns are the raw values, before being
+filled with "safe values" or transformed.
+
+The desirability columns will have the same name as the scores with an
+additional prefix of \code{.d_}. The overall desirability column is called
+\code{.d_overall}.
+
+There is an additional logical column called \code{removed} that notes whether the
+predictor failed the filter and was removed after this step is executed.
+}
+}
+\examples{
+library(recipes)
+library(desirability2)
+
+if (rlang::is_installed("modeldata")) {
+  # The `ad_data` has a binary outcome column ("Class") and mostly numeric
+  # predictors. For these, we score the predictors using an analysis of
+  # variance model where the predictor is the outcome and the outcome class
+  # defines the groups.
+  # There is also a single factor predictor ("Genotype") and we'll use
+  # Fisher's Exact test to score it. NOTE that for scores using hypothesis
+  # tests, the -log10(pvalue) is returned so that larger values are more
+  # important.
+
+  # The score_* objects here are from the filtro package. See Details above.
+ goals <- + desirability( + maximize(xtab_pval_fisher), + maximize(aov_pval) + ) + + example_data <- modeldata::ad_data + rec <- + recipe(Class ~ ., data = example_data) |> + step_predictor_desirability( + all_predictors(), + score = goals, + prop_terms = 1 / 2 + ) + rec + + # Now evaluate the predictors and rank them via desirability: + prepped <- prep(rec) + prepped + + # Use the tidy() method to get the results: + predictor_scores <- tidy(prepped, number = 1) + mean(predictor_scores$removed) + predictor_scores + + # -------------------------------------------------------------------------- + + # Case-weight example: use the hardhat package to create the appropriate type + # of case weights. Here, we'll increase the weights for the minority class and + # add them to the data frame. + + library(hardhat) + + example_weights <- example_data + weights <- ifelse(example_data$Class == "Impaired", 5, 1) + example_weights$weights <- importance_weights(weights) + + # To see if the scores can use case weights, load the filtro package and + # check the `case_weights` property: + + library(filtro) + + score_xtab_pval_fisher@case_weights + score_aov_pval@case_weights + + # The recipe will automatically find the case weights and will + # not treat them as predictors. + rec_wts <- + recipe(Class ~ ., data = example_weights) |> + step_predictor_desirability( + all_predictors(), + score = goals, + prop_terms = 1 / 2 + ) |> + prep() + rec_wts + + predictor_scores_wts <- + tidy(rec_wts, number = 1) |> + select(terms, .d_overall_weighted = .d_overall) + + library(dplyr) + library(ggplot2) + + # The selection did not substantially change with these case weights + full_join(predictor_scores, predictor_scores_wts, by = "terms") |> + ggplot(aes(.d_overall, .d_overall_weighted)) + + geom_abline(col = "darkgreen", lty = 2) + + geom_point(alpha = 1 / 2) + + coord_fixed(ratio = 1) + + labs(x = "Unweighted", y = "Class Weighted") +} + +} +\references{ +Derringer, G. and Suich, R. 
(1980), Simultaneous Optimization of
+Several Response Variables. \emph{Journal of Quality Technology}, 12, 214-219.
+
+\url{https://desirability2.tidymodels.org/reference/inline_desirability.html}
+}
+\seealso{
+\code{\link[desirability2:desirability]{desirability2::desirability()}}
+}
diff --git a/man/step_predictor_retain.Rd b/man/step_predictor_retain.Rd
new file mode 100644
index 0000000..1f704b5
--- /dev/null
+++ b/man/step_predictor_retain.Rd
@@ -0,0 +1,153 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/step_predictor_retain.R
+\name{step_predictor_retain}
+\alias{step_predictor_retain}
+\title{Supervised Feature Selection via A Single Filter}
+\usage{
+step_predictor_retain(
+  recipe,
+  ...,
+  score,
+  role = NA,
+  trained = FALSE,
+  results = NULL,
+  removals = NULL,
+  skip = FALSE,
+  id = rand_id("predictor_retain")
+)
+}
+\arguments{
+\item{recipe}{A recipe object. The step will be added to the sequence of
+operations for this recipe.}
+
+\item{...}{One or more selector functions to choose variables for this step.
+See \code{\link[recipes:selections]{selections()}} for more details.}
+
+\item{score}{A valid R expression that produces a logical result. The
+equation can contain the names of one or more score functions from the
+\pkg{filtro} package, such as \code{\link[filtro:score_imp_rf]{filtro::score_imp_rf()}},
+\code{\link[filtro:score_roc_auc]{filtro::score_roc_auc()}}. See the Details and Examples sections below.
+This argument \emph{should be named} when used.}
+
+\item{role}{Not used by this step since no new variables are created.}
+
+\item{trained}{A logical to indicate if the quantities for preprocessing have
+been estimated.}
+
+\item{results}{A data frame of score and desirability values for each
+predictor evaluated.
These values are not determined until \code{\link[recipes:prep]{recipes::prep()}} +is called.} + +\item{removals}{A character string that contains the names of predictors that +should be removed. These values are not determined until \code{\link[recipes:prep]{recipes::prep()}} +is called.} + +\item{skip}{A logical. Should the step be skipped when the recipe is baked by +\code{\link[recipes:bake]{bake()}}? While all operations are baked when \code{\link[recipes:prep]{prep()}} is run, some +operations may not be able to be conducted on new data (e.g. processing the +outcome variable(s)). Care should be taken when using \code{skip = TRUE} as it +may affect the computations for subsequent operations.} + +\item{id}{A character string that is unique to this step to identify it.} +} +\value{ +An updated version of \code{recipe} with the new step added to the +sequence of any existing operations. When you +\code{\link[recipes:tidy.recipe]{tidy()}} this step, a tibble::tibble is returned +with columns \code{terms} and \code{id}: + +\describe{ +\item{terms}{character, the selectors or variables selected to be removed} +\item{id}{character, id of this step} +} +Once trained, additional columns are included (see Details section). +} +\description{ +\code{step_predictor_retain()} creates a \emph{specification} of a recipe step that +uses a logical statement that includes one or more scoring functions to +measure how much each predictor is related to the outcome value. This step +retains the predictors that pass the logical statement. +} +\details{ +The \code{score} should be valid R syntax that produces a logical result and +should not use external data. The list of variables that can be used is in +the section below. 
+\subsection{Scoring Functions}{ + +As of version 0.2.0 of the \pkg{filtro} package, the following score functions are available: + +\itemize{ + +\item \code{aov_fstat} (\code{\link[filtro:score_aov_fstat]{documentation}}) +\item \code{aov_pval} (\code{\link[filtro:score_aov_pval]{documentation}}) +\item \code{cor_pearson} (\code{\link[filtro:score_cor_pearson]{documentation}}) +\item \code{cor_spearman} (\code{\link[filtro:score_cor_spearman]{documentation}}) +\item \code{gain_ratio} (\code{\link[filtro:score_gain_ratio]{documentation}}) +\item \code{imp_rf} (\code{\link[filtro:score_imp_rf]{documentation}}) +\item \code{imp_rf_conditional} (\code{\link[filtro:score_imp_rf_conditional]{documentation}}) +\item \code{imp_rf_oblique} (\code{\link[filtro:score_imp_rf_oblique]{documentation}}) +\item \code{info_gain} (\code{\link[filtro:score_info_gain]{documentation}}) +\item \code{roc_auc} (\code{\link[filtro:score_roc_auc]{documentation}}) +\item \code{sym_uncert} (\code{\link[filtro:score_sym_uncert]{documentation}}) +\item \code{xtab_pval_chisq} (\code{\link[filtro:score_xtab_pval_chisq]{documentation}}) +\item \code{xtab_pval_fisher} (\code{\link[filtro:score_xtab_pval_fisher]{documentation}}) + +} + +Some important notes: +\itemize{ +\item Scores that are p-values are automatically transformed by \pkg{filtro} to +be in the format \code{-log10(pvalue)} so that a p-value of 0.1 is converted to +1.0. For these, use the \code{maximize()} goal. +\item Other scores are also transformed in the data. For example, the correlation +scores given to the recipe step are in absolute value format. See the +\pkg{filtro} documentation for each score. +\item You can use some in-line functions using base R functions. For example, +\code{maximize(max(score_cor_spearman))}. +\item If a predictor cannot be computed for all scores, it is given a "fallback +value" that will prevent it from being excluded for this reason. +} + +This step can potentially remove columns from the data set. 
This may cause +issues for subsequent steps in your recipe if the missing columns are +specifically referenced by name. To avoid this, see the advice in the \emph{Tips +for saving recipes and filtering columns} section of \link[recipes:selections]{recipes::selections}. +} + +\subsection{Case Weights}{ + +Case weights can be used by some scoring functions. To learn more, load the +\pkg{filtro} package and check the \code{case_weights} property of the score object +(see Examples below). For a recipe, use one of the tidymodels case weight +functions such as \code{\link[hardhat:importance_weights]{hardhat::importance_weights()}} or +\link[hardhat:frequency_weights]{hardhat::frequency_weights}, to assign the correct data type to the vector of case +weights. A recipe will then interpret that class to be a case weight (and no +other role). A full example is below. +} + +\subsection{Tidy method}{ + +For a trained recipe, the \code{tidy()} method will return a tibble with columns +\code{terms} (the predictor names), \code{id}, and columns for the estimated scores. +The score columns are the raw values, before being filled with "safe values" +or transformed. + +There is an additional local column called \code{removed} that notes whether the +predictor failed the filter and was removed after this step is executed. +} +} +\examples{ +library(recipes) + +rec <- recipe(mpg ~ ., data = mtcars) |> + step_predictor_retain( + all_predictors(), + score = cor_pearson >= 0.75 | cor_spearman >= 0.75 + ) + +prepped <- prep(rec) + +bake(prepped, mtcars) + +tidy(prepped, 1) +} diff --git a/tests/testthat/_snaps/recipes_utils.md b/tests/testthat/_snaps/recipes_utils.md new file mode 100644 index 0000000..4dbcbfa --- /dev/null +++ b/tests/testthat/_snaps/recipes_utils.md @@ -0,0 +1,16 @@ +# Find score objects in an expression or quosure + + Code + important:::extract_score_names(obj_4) + Condition + Error: + ! No score objects were found in `score`. 
+ +--- + + Code + important:::extract_score_names(obj_5) + Condition + Error: + ! `score` cannot be empty. + diff --git a/tests/testthat/_snaps/step_predictor_best.md b/tests/testthat/_snaps/step_predictor_best.md new file mode 100644 index 0000000..58cf779 --- /dev/null +++ b/tests/testthat/_snaps/step_predictor_best.md @@ -0,0 +1,109 @@ +# case weights work + + Code + print(weighted) + Message + + -- Recipe ---------------------------------------------------------------------- + + -- Inputs + Number of variables by role + outcome: 1 + predictor: 10 + case_weights: 1 + + -- Training information + Training data contained 32 data points and no incomplete rows. + + -- Operations + * Feature selection via `cor_pearson` on: drat qsec, ... | Trained, weighted + +# missing score arg + + Code + step_predictor_best(recipe(mpg ~ ., data = mtcars), all_predictors(), + prop_terms = 1 / 2) + Condition + Error in `step_predictor_best()`: + ! argument "score" is missing, with no default + +# empty printing + + Code + rec + Message + + -- Recipe ---------------------------------------------------------------------- + + -- Inputs + Number of variables by role + outcome: 1 + predictor: 10 + + -- Operations + * Feature selection via `cor_pearson` on: + +--- + + Code + rec + Message + + -- Recipe ---------------------------------------------------------------------- + + -- Inputs + Number of variables by role + outcome: 1 + predictor: 10 + + -- Training information + Training data contained 32 data points and no incomplete rows. 
+ + -- Operations + * Feature selection via `cor_pearson` on: | Trained + +# printing + + Code + print(rec) + Message + + -- Recipe ---------------------------------------------------------------------- + + -- Inputs + Number of variables by role + outcome: 1 + predictor: 10 + + -- Operations + * Feature selection via `cor_pearson` on: all_predictors() + +--- + + Code + prep(rec) + Message + + -- Recipe ---------------------------------------------------------------------- + + -- Inputs + Number of variables by role + outcome: 1 + predictor: 10 + + -- Training information + Training data contained 32 data points and no incomplete rows. + + -- Operations + * Feature selection via `cor_pearson` on: qsec, vs, am, gear, carb | Trained + +# bad args + + Code + prep(step_predictor_best(recipe(mpg ~ ., mtcars), all_predictors(), prop_terms = 2, + score = "cor_pearson")) + Condition + Error in `step_predictor_best()`: + Caused by error in `prep()`: + ! `prop_terms` must be a number between 2.22044604925031e-16 and 1, not the number 2. + diff --git a/tests/testthat/_snaps/step_predictor_desirability.md b/tests/testthat/_snaps/step_predictor_desirability.md new file mode 100644 index 0000000..340b840 --- /dev/null +++ b/tests/testthat/_snaps/step_predictor_desirability.md @@ -0,0 +1,81 @@ +# wrong score type + + All score computations failed; skipping feature selection. + +# case weights work + + Code + print(weighted) + Message + + -- Recipe ---------------------------------------------------------------------- + + -- Inputs + Number of variables by role + outcome: 1 + predictor: 10 + case_weights: 1 + + -- Training information + Training data contained 32 data points and no incomplete rows. 
+ + -- Operations + * Feature selection via desirability functions (`cor_pearson` and + `cor_spearman`) on: disp, hp, drat, qsec, vs, am, gear, carb | Trained, + weighted + +# empty printing + + Code + step_predictor_desirability(rec) + Condition + Error in `step_predictor_desirability()`: + ! argument "score" is missing, with no default + +# printing + + Code + print(rec) + Message + + -- Recipe ---------------------------------------------------------------------- + + -- Inputs + Number of variables by role + outcome: 1 + predictor: 10 + + -- Operations + * Feature selection via desirability functions (`cor_pearson` and + `cor_spearman`) on: all_predictors() + +--- + + Code + prep(rec) + Message + + -- Recipe ---------------------------------------------------------------------- + + -- Inputs + Number of variables by role + outcome: 1 + predictor: 10 + + -- Training information + Training data contained 32 data points and no incomplete rows. + + -- Operations + * Feature selection via desirability functions (`cor_pearson` and + `cor_spearman`) on: drat, qsec, am, gear, carb | Trained + +# bad args + + Code + prep(step_predictor_desirability(recipe(mpg ~ ., mtcars), all_predictors(), + score = goals, prop_terms = 2)) + Condition + Error in `step_predictor_desirability()`: + Caused by error in `prep()`: + ! `prop_terms` must be a number between 2.22044604925031e-16 and 1, not the number 2. + diff --git a/tests/testthat/_snaps/step_predictor_retain.md b/tests/testthat/_snaps/step_predictor_retain.md new file mode 100644 index 0000000..ad38311 --- /dev/null +++ b/tests/testthat/_snaps/step_predictor_retain.md @@ -0,0 +1,101 @@ +# case weights work + + Code + print(weighted) + Message + + -- Recipe ---------------------------------------------------------------------- + + -- Inputs + Number of variables by role + outcome: 1 + predictor: 10 + case_weights: 1 + + -- Training information + Training data contained 32 data points and no incomplete rows. 
+ + -- Operations + * Feature selection using `cor_pearson` and `cor_spearman` removing: drat, + qsec, vs, am, gear, carb | Trained, weighted + +# empty printing + + Code + rec + Message + + -- Recipe ---------------------------------------------------------------------- + + -- Inputs + Number of variables by role + outcome: 1 + predictor: 10 + + -- Operations + * Feature selection using for: + +--- + + Code + rec + Message + + -- Recipe ---------------------------------------------------------------------- + + -- Inputs + Number of variables by role + outcome: 1 + predictor: 10 + + -- Training information + Training data contained 32 data points and no incomplete rows. + + -- Operations + * Feature selection using removing: | Trained + +# printing + + Code + print(rec) + Message + + -- Recipe ---------------------------------------------------------------------- + + -- Inputs + Number of variables by role + outcome: 1 + predictor: 10 + + -- Operations + * Feature selection using `cor_pearson` for: all_predictors() + +--- + + Code + prep(rec) + Message + + -- Recipe ---------------------------------------------------------------------- + + -- Inputs + Number of variables by role + outcome: 1 + predictor: 10 + + -- Training information + Training data contained 32 data points and no incomplete rows. + + -- Operations + * Feature selection using `cor_pearson` removing: drat qsec, ... | Trained + +# bad args + + Code + prep(step_predictor_retain(recipe(mpg ~ ., mtcars), all_predictors(), + threshold = 2)) + Condition + Error in `step_predictor_retain()`: + Caused by error in `prep()`: + ! The following argument was specified but does not exist: `threshold`. 
+ diff --git a/tests/testthat/helper-objects.R b/tests/testthat/helper-objects.R index 20e4b0a..e1dfdac 100644 --- a/tests/testthat/helper-objects.R +++ b/tests/testthat/helper-objects.R @@ -4,6 +4,9 @@ suppressPackageStartupMessages(library(dplyr)) suppressPackageStartupMessages(library(purrr)) suppressPackageStartupMessages(library(parsnip)) # imported by tune suppressPackageStartupMessages(library(yardstick)) # imported by tune +suppressPackageStartupMessages(library(filtro)) +suppressPackageStartupMessages(library(desirability2)) +suppressPackageStartupMessages(library(S7)) # ------------------------------------------------------------------------------ # regression examples @@ -11,14 +14,14 @@ suppressPackageStartupMessages(library(yardstick)) # imported by tune CO2_ex <- CO2 |> dplyr::select(-Plant, -Treatment) co2_rec <- recipes::recipe(uptake ~ ., data = CO2_ex) |> - recipes::step_dummy(recipes::all_factor_predictors()) + recipes::step_dummy(recipes::all_factor_predictors()) reg_f_wflow <- workflows::workflow(uptake ~ ., parsnip::linear_reg()) reg_r_wflow <- workflows::workflow(co2_rec, parsnip::linear_reg()) reg_v_wflow <- - workflows::workflow() |> - workflows::add_model(parsnip::linear_reg()) |> - workflows::add_variables(outcomes = uptake, predictors = c(everything())) + workflows::workflow() |> + workflows::add_model(parsnip::linear_reg()) |> + workflows::add_variables(outcomes = uptake, predictors = c(everything())) reg_1d_wflow <- workflows::workflow(uptake ~ conc, parsnip::linear_reg()) reg_f_fit <- workflows:::fit.workflow(reg_f_wflow, CO2_ex) @@ -33,9 +36,9 @@ reg_mtr <- yardstick::metric_set(yardstick::rsq, yardstick::mae) cls_f_wflow <- workflows::workflow(Class ~ ., parsnip::logistic_reg()) cls_v_wflow <- - workflows::workflow() |> - workflows::add_model(parsnip::logistic_reg()) |> - workflows::add_variables(outcomes = Class, predictors = c(everything())) + workflows::workflow() |> + workflows::add_model(parsnip::logistic_reg()) |> + 
workflows::add_variables(outcomes = Class, predictors = c(everything())) cls_1d_wflow <- workflows::workflow(Class ~ tau, parsnip::logistic_reg()) if (rlang::is_installed("modeldata")) { @@ -46,9 +49,9 @@ if (rlang::is_installed("modeldata")) { dplyr::select(Class, tau, p_tau, VEGF, MMP10, Genotype, male) ad_rec <- - recipes::recipe(Class ~ ., data = ad_data_small) |> - recipes::step_pca(tau, p_tau, VEGF, MMP10, num_comp = 2) |> - recipes::step_dummy(recipes::all_factor_predictors()) + recipes::recipe(Class ~ ., data = ad_data_small) |> + recipes::step_pca(tau, p_tau, VEGF, MMP10, num_comp = 2) |> + recipes::step_dummy(recipes::all_factor_predictors()) cls_r_wflow <- workflows::workflow(ad_rec, parsnip::logistic_reg()) @@ -58,7 +61,11 @@ if (rlang::is_installed("modeldata")) { cls_1d_fit <- workflows:::fit.workflow(cls_1d_wflow, ad_data_small) } -cls_mtr <- yardstick::metric_set(yardstick::brier_class, yardstick::kap, yardstick::mcc) +cls_mtr <- yardstick::metric_set( + yardstick::brier_class, + yardstick::kap, + yardstick::mcc +) # ------------------------------------------------------------------------------ # survival examples @@ -77,7 +84,10 @@ if (rlang::is_installed("censored")) { srv_times <- (1:4) / 4 } -srv_mtr <- yardstick::metric_set(yardstick::concordance_survival, yardstick::roc_auc_survival) +srv_mtr <- yardstick::metric_set( + yardstick::concordance_survival, + yardstick::roc_auc_survival +) # ------------------------------------------------------------------------------ @@ -91,4 +101,19 @@ ex_seed <- id = 548676L ) +# ------------------------------------------------------------------------------ +# for recipes + +POTATO <- function(x) { + rlang::enquo(x) +} + +goals <- + desirability2::desirability( + maximize(cor_pearson), + constrain(cor_spearman, low = 0.7, high = 1) + ) +mtcars_wts <- mtcars +.wts <- seq(0, 1, length.out = 32) +mtcars_wts$case_weights <- hardhat::importance_weights(.wts) diff --git a/tests/testthat/test-recipes_utils.R 
b/tests/testthat/test-recipes_utils.R new file mode 100644 index 0000000..2dfc914 --- /dev/null +++ b/tests/testthat/test-recipes_utils.R @@ -0,0 +1,49 @@ +test_that("Find score objects in an expression or quosure", { + + obj_1 <- POTATO(c(minimize(score_a), constrain(score_b))) + res_1 <- important:::extract_score_names(obj_1) + expect_equal(res_1, c("score_a", "score_b")) + expect_equal( + obj_1 |> rlang::quo_get_expr() |> important:::extract_score_names(), + c("score_a", "score_b") + ) + + obj_2 <- POTATO(c(minimize(score_a), constrain(score_a))) + res_2 <- important:::extract_score_names(obj_2) + expect_equal(res_2, c("score_a")) + expect_equal( + obj_2 |> rlang::quo_get_expr() |> important:::extract_score_names(), + c("score_a") + ) + + obj_3 <- POTATO(score_a < 2 & score_b > 3) + res_3 <- important:::extract_score_names(obj_3) + expect_equal(res_3, c("score_a", "score_b")) + expect_equal( + obj_3 |> rlang::quo_get_expr() |> important:::extract_score_names(), + c("score_a", "score_b") + ) + + obj_4 <- POTATO(a < 2 & b > 3) + expect_snapshot(important:::extract_score_names(obj_4), error = TRUE) + + obj_5 <- POTATO() + expect_snapshot(important:::extract_score_names(obj_5), error = TRUE) + + obj_6 <- POTATO(I(score_a < 2) & predictor == "another potato") + res_6 <- important:::extract_score_names(obj_6) + expect_equal(res_6, c("score_a")) + expect_equal( + obj_6 |> rlang::quo_get_expr() |> important:::extract_score_names(), + c("score_a") + ) + + obj_7 <- POTATO(score_a / c < 2) + res_7 <- important:::extract_score_names(obj_7) + expect_equal(res_7, c("score_a")) + expect_equal( + obj_7 |> rlang::quo_get_expr() |> important:::extract_score_names(), + c("score_a") + ) + +}) diff --git a/tests/testthat/test-step_predictor_best.R b/tests/testthat/test-step_predictor_best.R new file mode 100644 index 0000000..f0df075 --- /dev/null +++ b/tests/testthat/test-step_predictor_best.R @@ -0,0 +1,226 @@ +test_that("step works", { + rec <- recipe(mpg ~ ., data = mtcars) |> + 
step_predictor_best( + all_predictors(), + score = "cor_pearson", + prop_terms = 1 / 2 + ) + + prepped <- prep(rec) + res_bake <- bake(prepped, mtcars) + res_tidy <- tidy(prepped, 1) + + cor_pearson_res <- filtro::score_cor_pearson |> + filtro::fit(mpg ~ ., data = mtcars) + + cor_pearson_res <- cor_pearson_res |> + filtro::fill_safe_value(transform = TRUE) + exp <- cor_pearson_res@results |> + dplyr::slice_max(score, prop = 1 / 2, with_ties = TRUE) |> + dplyr::pull(predictor) + + expect_identical( + sort(setdiff(names(mtcars), names(res_bake))), + sort(setdiff(names(mtcars)[-1], exp)) + ) + + expect_identical( + sort(res_tidy$terms[res_tidy$removed]), + sort(setdiff(names(mtcars)[-1], exp)) + ) + expect_named( + res_tidy, + c("terms", "removed", "score", "id") + ) +}) + +test_that("EVERYTHING MUST GO", { + rec <- recipe(mpg ~ ., data = mtcars) |> + step_predictor_best( + all_predictors(), + score = "cor_pearson", + prop_terms = 0.0, + update_prop = FALSE + ) + + prepped <- prep(rec) + res_bake <- bake(prepped, mtcars) + res_tidy <- tidy(prepped, 1) + + expect_identical( + sort(setdiff(names(mtcars), names(res_bake))), + sort(names(mtcars)[-1]) + ) + expect_true( + all(res_tidy$removed) + ) + expect_named( + res_tidy, + c("terms", "removed", "score", "id") + ) +}) + +test_that("keep everything", { + set.seed(1) + rec <- recipe(mpg ~ ., data = mtcars) |> + step_predictor_best( + all_predictors(), + score = "cor_pearson", + prop_terms = 1 + ) + + prepped <- prep(rec) + + res_bake <- bake(prepped, mtcars) + res_tidy <- tidy(prepped, 1) + + expect_identical( + sort(names(res_bake)), + sort(names(mtcars)) + ) + + expect_identical( + sort(res_tidy$terms[res_tidy$removed]), + character(0) + ) + expect_named( + res_tidy, + c("terms", "removed", "score", "id") + ) +}) + +test_that("case weights work", { + unweighted <- recipe(mpg ~ ., data = mtcars) |> + step_predictor_best( + all_predictors(), + score = "cor_pearson", + prop_terms = 1 / 2 + ) |> + prep() + + weighted <- 
recipe(mpg ~ ., data = mtcars_wts) |> + step_predictor_best( + all_predictors(), + score = "cor_pearson", + prop_terms = 1 / 2 + ) |> + prep() + + expect_snapshot(print(weighted)) + + unweighted_res <- tidy(unweighted, number = 1) |> + dplyr::select(terms, unweighted = score) + weighted_res <- tidy(weighted, number = 1) |> + dplyr::select(terms, weighted = score) + both_res <- dplyr::full_join(unweighted_res, weighted_res, by = "terms") + expect_false(isTRUE(all.equal(both_res$weighted, both_res$unweighted))) +}) + +test_that("missing score arg", { + skip_if(getRversion() <= "4.3.0") + expect_snapshot( + error = TRUE, + recipe(mpg ~ ., data = mtcars) |> + step_predictor_best( + all_predictors(), + prop_terms = 1 / 2 + ) + ) +}) + +# Infrastructure --------------------------------------------------------------- +test_that("bake method errors when needed non-standard role columns are missing", { + # Here for completeness + # step_predictor_best() removes variables and thus does not care if they are not there. 
+ expect_true(TRUE) +}) + +test_that("empty printing", { + rec <- recipe(mpg ~ ., mtcars) + rec <- step_predictor_best(rec, score = "cor_pearson") + + expect_snapshot(rec) + + rec <- prep(rec, mtcars) + + expect_snapshot(rec) +}) + +test_that("empty selection prep/bake is a no-op", { + rec1 <- recipe(mpg ~ ., mtcars) + rec2 <- step_predictor_best(rec1, score = "cor_pearson") + + rec1 <- prep(rec1, mtcars) + rec2 <- prep(rec2, mtcars) + + baked1 <- bake(rec1, mtcars) + baked2 <- bake(rec2, mtcars) + + expect_identical(baked1, baked2) +}) + +test_that("empty selection tidy method works", { + rec <- recipe(mpg ~ ., mtcars) + rec <- step_predictor_best(rec, score = "cor_pearson") + + expect <- tibble(terms = character(), id = character()) + + expect_identical(tidy(rec, number = 1), expect) + + rec <- prep(rec, mtcars) + + expect_identical(tidy(rec, number = 1), expect) +}) + +test_that("printing", { + set.seed(1) + rec <- recipe(mpg ~ ., data = mtcars) |> + step_predictor_best(all_predictors(), score = "cor_pearson") + + expect_snapshot(print(rec)) + expect_snapshot(prep(rec)) +}) + +test_that("tunable is setup to work with extract_parameter_set_dials", { + skip_if_not_installed("dials", minimum_version = "1.4.1.9000") + rec <- recipe(~., data = mtcars) |> + step_predictor_best( + all_predictors(), + score = "cor_pearson", + prop_terms = hardhat::tune() + ) + + params <- extract_parameter_set_dials(rec) + + expect_s3_class(params, "parameters") + expect_identical(nrow(params), 1L) +}) + +test_that("bad args", { + expect_snapshot( + recipe(mpg ~ ., mtcars) |> + step_predictor_best( + all_predictors(), + prop_terms = 2, + score = "cor_pearson" + ) |> + prep(), + error = TRUE + ) +}) + +test_that("0 and 1 rows data work in bake method", { + data <- mtcars + rec <- recipe(mpg ~ ., data) |> + step_predictor_best(all_numeric_predictors(), score = "cor_pearson") |> + prep() + + expect_identical( + nrow(bake(rec, slice(data, 1))), + 1L + ) + expect_identical( + nrow(bake(rec, 
slice(data, 0))), + 0L + ) +}) diff --git a/tests/testthat/test-step_predictor_desirability.R b/tests/testthat/test-step_predictor_desirability.R new file mode 100644 index 0000000..0a5fd91 --- /dev/null +++ b/tests/testthat/test-step_predictor_desirability.R @@ -0,0 +1,268 @@ +test_that("step works", { + set.seed(1) + rec <- recipe(mpg ~ ., data = mtcars) |> + step_predictor_desirability( + all_predictors(), + score = goals, + prop_terms = 0.2 + ) + + prepped <- prep(rec) + + res_bake <- bake(prepped, mtcars) + res_tidy <- tidy(prepped, 1) + + score_res <- + list( + filtro::score_cor_pearson |> filtro::fit(mpg ~ ., data = mtcars), + filtro::score_cor_spearman |> filtro::fit(mpg ~ ., data = mtcars) + ) |> + filtro::fill_safe_values(transform = TRUE) |> + dplyr::mutate( + d_pearson = desirability2::d_max(cor_pearson, use_data = TRUE), + d_spearman = desirability2::d_box(cor_spearman, low = 0.7, high = 1.0), + d_overall = desirability2::d_overall(dplyr::across(dplyr::starts_with( + "d_" + ))) + ) + + retained <- score_res |> + dplyr::slice_max(d_overall, prop = 0.2, with_ties = TRUE) + + expect_identical( + sort(setdiff(names(mtcars), names(res_bake))), + sort(setdiff(names(mtcars)[-1], retained$predictor)) + ) + + expect_identical( + sort(res_tidy$terms[res_tidy$removed]), + sort(setdiff(names(mtcars)[-1], retained$predictor)) + ) + expect_named( + res_tidy, + c( + "terms", + "removed", + "cor_pearson", + "cor_spearman", + ".d_max_cor_pearson", + ".d_box_cor_spearman", + ".d_overall", + "id" + ) + ) +}) + +test_that("EVERYTHING MUST GO", { + bad_goals <- + desirability2::desirability( + constrain(cor_spearman, low = 2, high = 3) + ) + + set.seed(1) + rec <- recipe(mpg ~ ., data = mtcars) |> + step_predictor_desirability( + all_predictors(), + score = bad_goals, + prop_terms = 0.2 + ) + + prepped <- prep(rec) + + res_bake <- bake(prepped, mtcars) + res_tidy <- tidy(prepped, 1) + + expect_identical( + names(res_bake), + "mpg" + ) + + expect_identical( + 
sort(res_tidy$terms[res_tidy$removed]), + sort(names(mtcars)[-1]) + ) + expect_named( + res_tidy, + c( + "terms", + "removed", + "cor_spearman", + ".d_box_cor_spearman", + ".d_overall", + "id" + ) + ) +}) + +test_that("wrong score type", { + wrong_goals <- + desirability2::desirability( + maximize(xtab_pval_fisher) + ) + + set.seed(1) + rec <- recipe(mpg ~ ., data = mtcars) |> + step_predictor_desirability( + all_predictors(), + score = wrong_goals, + prop_terms = 0.2 + ) + + expect_snapshot_warning(prepped <- prep(rec)) + + res_bake <- bake(prepped, mtcars) + res_tidy <- tidy(prepped, 1) + + expect_identical( + sort(names(res_bake)), + sort(names(mtcars)) + ) + + expect_identical( + sort(res_tidy$terms[res_tidy$removed]), + character(0) + ) + expect_named( + res_tidy, + c("terms", "removed", "xtab_pval_fisher", "id") + ) +}) + +test_that("keep everything", { + easy_goals <- + desirability2::desirability( + constrain(cor_spearman, low = -2, high = 3) + ) + + set.seed(1) + rec <- recipe(mpg ~ ., data = mtcars) |> + step_predictor_desirability( + all_predictors(), + score = easy_goals, + prop_terms = 0.2 + ) + + prepped <- prep(rec) + + res_bake <- bake(prepped, mtcars) + res_tidy <- tidy(prepped, 1) + + expect_identical( + sort(names(res_bake)), + sort(names(mtcars)) + ) + + expect_identical( + sort(res_tidy$terms[res_tidy$removed]), + character(0) + ) + expect_named( + res_tidy, + c( + "terms", + "removed", + "cor_spearman", + ".d_box_cor_spearman", + ".d_overall", + "id" + ) + ) +}) + +test_that("case weights work", { + unweighted <- recipe(mpg ~ ., data = mtcars) |> + step_predictor_desirability( + all_predictors(), + score = goals, + prop_terms = 0.2 + ) |> + prep() + + weighted <- recipe(mpg ~ ., data = mtcars_wts) |> + step_predictor_desirability( + all_predictors(), + score = goals, + prop_terms = 0.2 + ) |> + prep() + + expect_snapshot(print(weighted)) + + unweighted_res <- tidy(unweighted, number = 1) |> + dplyr::select(terms, unweighted = cor_spearman) + 
weighted_res <- tidy(weighted, number = 1) |> + dplyr::select(terms, weighted = cor_spearman) + both_res <- dplyr::full_join(unweighted_res, weighted_res, by = "terms") + expect_false(isTRUE(all.equal(both_res$weighted, both_res$unweighted))) +}) + +# Infrastructure --------------------------------------------------------------- + +test_that("bake method errors when needed non-standard role columns are missing", { + # Here for completeness + # step_predictor_desirability() removes variables and thus does not care if they are not there. + expect_true(TRUE) +}) + +test_that("empty printing", { + skip_if(getRversion() <= "4.3.0") + rec <- recipe(mpg ~ ., mtcars) + expect_snapshot(step_predictor_desirability(rec), error = TRUE) +}) + +test_that("printing", { + set.seed(1) + rec <- recipe(mpg ~ ., data = mtcars) |> + step_predictor_desirability( + all_predictors(), + score = goals + ) + + expect_snapshot(print(rec)) + expect_snapshot(prep(rec)) +}) + +test_that("tunable is setup to work with extract_parameter_set_dials", { + skip_if_not_installed("dials", minimum_version = "1.4.1.9000") + + rec <- recipe(~., data = mtcars) |> + step_predictor_desirability( + all_predictors(), + score = goals, + prop_terms = hardhat::tune() + ) + + params <- extract_parameter_set_dials(rec) + + expect_s3_class(params, "parameters") + expect_identical(nrow(params), 1L) +}) + +test_that("bad args", { + expect_snapshot( + recipe(mpg ~ ., mtcars) |> + step_predictor_desirability( + all_predictors(), + score = goals, + prop_terms = 2 + ) |> + prep(), + error = TRUE + ) +}) + +test_that("0 and 1 rows data work in bake method", { + data <- mtcars + rec <- recipe(mpg ~ ., data) |> + step_predictor_desirability(all_numeric_predictors(), score = goals, ) |> + prep() + + expect_identical( + nrow(bake(rec, slice(data, 1))), + 1L + ) + expect_identical( + nrow(bake(rec, slice(data, 0))), + 0L + ) +}) diff --git a/tests/testthat/test-step_predictor_retain.R 
b/tests/testthat/test-step_predictor_retain.R new file mode 100644 index 0000000..c3ff9bf --- /dev/null +++ b/tests/testthat/test-step_predictor_retain.R @@ -0,0 +1,230 @@ +test_that("step works", { + rec <- recipe(mpg ~ ., data = mtcars) |> + step_predictor_retain( + all_predictors(), + score = abs(cor_pearson) >= 0.75 & abs(cor_spearman) >= 0.6 + ) + + prepped <- prep(rec) + res_bake <- bake(prepped, mtcars) + res_tidy <- tidy(prepped, 1) + + cor_pearson_res <- filtro::score_cor_pearson |> + filtro::fit(mpg ~ ., data = mtcars) + cor_spearman_res <- filtro::score_cor_spearman |> + filtro::fit(mpg ~ ., data = mtcars) + + exp <- filtro::fill_safe_values(list( + cor_pearson_res, + cor_spearman_res + )) |> + dplyr::filter(abs(cor_pearson) < 0.75 | abs(cor_spearman) < 0.6) |> + dplyr::pull(predictor) + + expect_identical( + sort(setdiff(names(mtcars), names(res_bake))), + sort(exp) + ) + expect_identical( + sort(res_tidy$terms[res_tidy$removed]), + sort(exp) + ) + expect_named( + res_tidy, + c("terms", "removed", "cor_pearson", "cor_spearman", "id") + ) +}) + +test_that("EVERYTHING MUST GO", { + rec <- recipe(mpg ~ ., data = mtcars) |> + step_predictor_retain( + all_predictors(), + score = abs(cor_pearson) >= Inf & abs(cor_spearman) >= Inf + ) + + prepped <- prep(rec) + res_bake <- bake(prepped, mtcars) + res_tidy <- tidy(prepped, 1) + + expect_identical( + sort(setdiff(names(mtcars), names(res_bake))), + sort(names(mtcars)[-1]) + ) + expect_true( + all(res_tidy$removed) + ) + expect_named( + res_tidy, + c("terms", "removed", "cor_pearson", "cor_spearman", "id") + ) +}) + +test_that("keep everything", { + set.seed(1) + rec <- recipe(mpg ~ ., data = mtcars) |> + step_predictor_retain( + all_predictors(), + score = abs(cor_pearson) >= -1 & abs(cor_spearman) >= -1 + ) + + prepped <- prep(rec) + + res_bake <- bake(prepped, mtcars) + res_tidy <- tidy(prepped, 1) + + expect_identical( + sort(names(res_bake)), + sort(names(mtcars)) + ) + + expect_identical( + 
sort(res_tidy$terms[res_tidy$removed]), + character(0) + ) + expect_named( + res_tidy, + c("terms", "removed", "cor_pearson", "cor_spearman", "id") + ) +}) + +test_that("allows for one score", { + rec <- recipe(mpg ~ ., data = mtcars) |> + step_predictor_retain( + all_predictors(), + score = abs(cor_pearson) >= 0.7 + ) + + prepped <- prep(rec) + res_bake <- bake(prepped, mtcars) + res_tidy <- tidy(prepped, 1) + + cor_pearson_res <- filtro::score_cor_pearson |> + filtro::fit(mpg ~ ., data = mtcars) + + exp <- cor_pearson_res@results |> + dplyr::filter(abs(score) < 0.7) |> + dplyr::pull(predictor) + + expect_identical( + sort(setdiff(names(mtcars), names(res_bake))), + sort(exp) + ) + expect_named( + res_tidy, + c("terms", "removed", "cor_pearson", "id") + ) +}) + +test_that("case weights work", { + unweighted <- recipe(mpg ~ ., data = mtcars) |> + step_predictor_retain( + all_predictors(), + score = abs(cor_pearson) >= 0.75 & abs(cor_spearman) >= 0.6 + ) |> + prep() + + weighted <- recipe(mpg ~ ., data = mtcars_wts) |> + step_predictor_retain( + all_predictors(), + score = abs(cor_pearson) >= 0.75 & abs(cor_spearman) >= 0.6 + ) |> + prep() + + expect_snapshot(print(weighted)) + + unweighted_res <- tidy(unweighted, number = 1) |> + dplyr::select(terms, unweighted = cor_spearman) + weighted_res <- tidy(weighted, number = 1) |> + dplyr::select(terms, weighted = cor_spearman) + both_res <- dplyr::full_join(unweighted_res, weighted_res, by = "terms") + expect_false(isTRUE(all.equal(both_res$weighted, both_res$unweighted))) +}) + +# Infrastructure --------------------------------------------------------------- + +test_that("bake method errors when needed non-standard role columns are missing", { + # Here for completeness + # step_predictor_retain() removes variables and thus does not care if they are not there. 
+ expect_true(TRUE) +}) + +test_that("empty printing", { + rec <- recipe(mpg ~ ., mtcars) + rec <- step_predictor_retain(rec) + + expect_snapshot(rec) + + rec <- prep(rec, mtcars) + + expect_snapshot(rec) +}) + +test_that("empty selection prep/bake is a no-op", { + rec1 <- recipe(mpg ~ ., mtcars) + rec2 <- step_predictor_retain(rec1) + + rec1 <- prep(rec1, mtcars) + rec2 <- prep(rec2, mtcars) + + baked1 <- bake(rec1, mtcars) + baked2 <- bake(rec2, mtcars) + + expect_identical(baked1, baked2) +}) + +test_that("empty selection tidy method works", { + rec <- recipe(mpg ~ ., mtcars) + rec <- step_predictor_retain(rec) + + expect <- tibble(terms = character(), id = character()) + + expect_identical(tidy(rec, number = 1), expect) + + rec <- prep(rec, mtcars) + + expect_identical( + tidy(rec, number = 1), + tibble::tibble( + terms = character(0), + removed = logical(0), + id = character(0) + ) + ) +}) + +test_that("printing", { + set.seed(1) + rec <- recipe(mpg ~ ., data = mtcars) |> + step_predictor_retain(all_predictors(), score = abs(cor_pearson) >= 0.75) + + expect_snapshot(print(rec)) + expect_snapshot(prep(rec)) +}) + +test_that("bad args", { + expect_snapshot( + recipe(mpg ~ ., mtcars) |> + step_predictor_retain(all_predictors(), threshold = 2) |> + prep(), + error = TRUE + ) +}) + +test_that("0 and 1 rows data work in bake method", { + data <- mtcars + rec <- recipe(mpg ~ ., data) |> + step_predictor_retain( + all_numeric_predictors(), + score = abs(cor_pearson) >= 0.75 + ) |> + prep() + + expect_identical( + nrow(bake(rec, slice(data, 1))), + 1L + ) + expect_identical( + nrow(bake(rec, slice(data, 0))), + 0L + ) +})