From 02d20b8fea781322226a3d8ad624a2cc7cff11da Mon Sep 17 00:00:00 2001 From: Emil Hvitfeldt Date: Tue, 19 Aug 2025 11:46:36 -0700 Subject: [PATCH 01/65] add rlang standalone checkers --- DESCRIPTION | 2 +- NAMESPACE | 1 + R/import-standalone-obj-type.R | 365 ++++++++++++++++++ R/import-standalone-types-check.R | 596 ++++++++++++++++++++++++++++++ R/important-package.R | 1 + 5 files changed, 964 insertions(+), 1 deletion(-) create mode 100644 R/import-standalone-obj-type.R create mode 100644 R/import-standalone-types-check.R diff --git a/DESCRIPTION b/DESCRIPTION index c25c5aa..fecde13 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -23,7 +23,7 @@ Imports: ggplot2, hardhat (>= 1.4.0.9002), purrr, - rlang, + rlang (>= 1.1.0), tibble, tidyr, tune, diff --git a/NAMESPACE b/NAMESPACE index 63caeb2..b30e810 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -5,6 +5,7 @@ export(augment) export(autoplot) export(importance_perm) export(required_pkgs) +import(rlang) importFrom(generics,augment) importFrom(generics,required_pkgs) importFrom(ggplot2,autoplot) diff --git a/R/import-standalone-obj-type.R b/R/import-standalone-obj-type.R new file mode 100644 index 0000000..c582ba0 --- /dev/null +++ b/R/import-standalone-obj-type.R @@ -0,0 +1,365 @@ +# Standalone file: do not edit by hand +# Source: https://github.com/r-lib/rlang/blob/HEAD/R/standalone-obj-type.R +# Generated by: usethis::use_standalone("r-lib/rlang", "obj-type") +# ---------------------------------------------------------------------- +# +# --- +# repo: r-lib/rlang +# file: standalone-obj-type.R +# last-updated: 2024-02-14 +# license: https://unlicense.org +# imports: rlang (>= 1.1.0) +# --- +# +# ## Changelog +# +# 2024-02-14: +# - `obj_type_friendly()` now works for S7 objects. +# +# 2023-05-01: +# - `obj_type_friendly()` now only displays the first class of S3 objects. +# +# 2023-03-30: +# - `stop_input_type()` now handles `I()` input literally in `arg`. +# +# 2022-10-04: +# - `obj_type_friendly(value = TRUE)` now shows numeric scalars +# literally. +# - `stop_friendly_type()` now takes `show_value`, passed to +# `obj_type_friendly()` as the `value` argument. +# +# 2022-10-03: +# - Added `allow_na` and `allow_null` arguments. +# - `NULL` is now backticked. +# - Better friendly type for infinities and `NaN`. +# +# 2022-09-16: +# - Unprefixed usage of rlang functions with `rlang::` to +# avoid onLoad issues when called from rlang (#1482). +# +# 2022-08-11: +# - Prefixed usage of rlang functions with `rlang::`. +# +# 2022-06-22: +# - `friendly_type_of()` is now `obj_type_friendly()`. +# - Added `obj_type_oo()`. +# +# 2021-12-20: +# - Added support for scalar values and empty vectors. +# - Added `stop_input_type()` +# +# 2021-06-30: +# - Added support for missing arguments. +# +# 2021-04-19: +# - Added support for matrices and arrays (#141). +# - Added documentation. +# - Added changelog. +# +# nocov start + +#' Return English-friendly type +#' @param x Any R object. +#' @param value Whether to describe the value of `x`. Special values +#' like `NA` or `""` are always described. +#' @param length Whether to mention the length of vectors and lists. +#' @return A string describing the type. Starts with an indefinite +#' article, e.g. "an integer vector". +#' @noRd +obj_type_friendly <- function(x, value = TRUE) { + if (is_missing(x)) { + return("absent") + } + + if (is.object(x)) { + if (inherits(x, "quosure")) { + type <- "quosure" + } else { + type <- class(x)[[1L]] + } + return(sprintf("a <%s> object", type)) + } + + if (!is_vector(x)) { + return(.rlang_as_friendly_type(typeof(x))) + } + + n_dim <- length(dim(x)) + + if (!n_dim) { + if (!is_list(x) && length(x) == 1) { + if (is_na(x)) { + return(switch( + typeof(x), + logical = "`NA`", + integer = "an integer `NA`", + double = if (is.nan(x)) { + "`NaN`" + } else { + "a numeric `NA`" + }, + complex = "a complex `NA`", + character = "a character `NA`", + .rlang_stop_unexpected_typeof(x) + )) + } + + show_infinites <- function(x) { + if (x > 0) { + "`Inf`" + } else { + "`-Inf`" + } + } + str_encode <- function(x, width = 30, ...) { + if (nchar(x) > width) { + x <- substr(x, 1, width - 3) + x <- paste0(x, "...") + } + encodeString(x, ...) + } + + if (value) { + if (is.numeric(x) && is.infinite(x)) { + return(show_infinites(x)) + } + + if (is.numeric(x) || is.complex(x)) { + number <- as.character(round(x, 2)) + what <- if (is.complex(x)) "the complex number" else "the number" + return(paste(what, number)) + } + + return(switch( + typeof(x), + logical = if (x) "`TRUE`" else "`FALSE`", + character = { + what <- if (nzchar(x)) "the string" else "the empty string" + paste(what, str_encode(x, quote = "\"")) + }, + raw = paste("the raw value", as.character(x)), + .rlang_stop_unexpected_typeof(x) + )) + } + + return(switch( + typeof(x), + logical = "a logical value", + integer = "an integer", + double = if (is.infinite(x)) show_infinites(x) else "a number", + complex = "a complex number", + character = if (nzchar(x)) "a string" else "\"\"", + raw = "a raw value", + .rlang_stop_unexpected_typeof(x) + )) + } + + if (length(x) == 0) { + return(switch( + typeof(x), + logical = "an empty logical vector", + integer = "an empty integer vector", + double = "an empty numeric vector", + complex = "an empty complex vector", + character = "an empty character vector", + raw = "an empty raw vector", + list = "an empty list", + .rlang_stop_unexpected_typeof(x) + )) + } + } + + vec_type_friendly(x) +} + +vec_type_friendly <- function(x, length = FALSE) { + if (!is_vector(x)) { + abort("`x` must be a vector.") + } + type <- typeof(x) + n_dim <- length(dim(x)) + + add_length <- function(type) { + if (length && !n_dim) { + paste0(type, sprintf(" of length %s", length(x))) + } else { + type + } + } + + if (type == "list") { + if (n_dim < 2) { + return(add_length("a list")) + } else if (is.data.frame(x)) { + return("a data frame") + } else if (n_dim == 2) { + return("a list matrix") + } else { + return("a list array") + } + } + + type <- switch( + type, + logical = "a logical %s", + integer = "an integer %s", + numeric = , + double = "a double %s", + complex = "a complex %s", + character = "a character %s", + raw = "a raw %s", + type = paste0("a ", type, " %s") + ) + + if (n_dim < 2) { + kind <- "vector" + } else if (n_dim == 2) { + kind <- "matrix" + } else { + kind <- "array" + } + out <- sprintf(type, kind) + + if (n_dim >= 2) { + out + } else { + add_length(out) + } +} + +.rlang_as_friendly_type <- function(type) { + switch( + type, + + list = "a list", + + NULL = "`NULL`", + environment = "an environment", + externalptr = "a pointer", + weakref = "a weak reference", + S4 = "an S4 object", + + name = , + symbol = "a symbol", + language = "a call", + pairlist = "a pairlist node", + expression = "an expression vector", + + char = "an internal string", + promise = "an internal promise", + ... = "an internal dots object", + any = "an internal `any` object", + bytecode = "an internal bytecode object", + + primitive = , + builtin = , + special = "a primitive function", + closure = "a function", + + type + ) +} + +.rlang_stop_unexpected_typeof <- function(x, call = caller_env()) { + abort( + sprintf("Unexpected type <%s>.", typeof(x)), + call = call + ) +} + +#' Return OO type +#' @param x Any R object. +#' @return One of `"bare"` (for non-OO objects), `"S3"`, `"S4"`, +#' `"R6"`, or `"S7"`. +#' @noRd +obj_type_oo <- function(x) { + if (!is.object(x)) { + return("bare") + } + + class <- inherits(x, c("R6", "S7_object"), which = TRUE) + + if (class[[1]]) { + "R6" + } else if (class[[2]]) { + "S7" + } else if (isS4(x)) { + "S4" + } else { + "S3" + } +} + +#' @param x The object type which does not conform to `what`. Its +#' `obj_type_friendly()` is taken and mentioned in the error message. +#' @param what The friendly expected type as a string. Can be a +#' character vector of expected types, in which case the error +#' message mentions all of them in an "or" enumeration. +#' @param show_value Passed to `value` argument of `obj_type_friendly()`. +#' @param ... Arguments passed to [abort()]. +#' @inheritParams args_error_context +#' @noRd +stop_input_type <- function( + x, + what, + ..., + allow_na = FALSE, + allow_null = FALSE, + show_value = TRUE, + arg = caller_arg(x), + call = caller_env() +) { + # From standalone-cli.R + cli <- env_get_list( + nms = c("format_arg", "format_code"), + last = topenv(), + default = function(x) sprintf("`%s`", x), + inherit = TRUE + ) + + if (allow_na) { + what <- c(what, cli$format_code("NA")) + } + if (allow_null) { + what <- c(what, cli$format_code("NULL")) + } + if (length(what)) { + what <- oxford_comma(what) + } + if (inherits(arg, "AsIs")) { + format_arg <- identity + } else { + format_arg <- cli$format_arg + } + + message <- sprintf( + "%s must be %s, not %s.", + format_arg(arg), + what, + obj_type_friendly(x, value = show_value) + ) + + abort(message, ..., call = call, arg = arg) +} + +oxford_comma <- function(chr, sep = ", ", final = "or") { + n <- length(chr) + + if (n < 2) { + return(chr) + } + + head <- chr[seq_len(n - 1)] + last <- chr[n] + + head <- paste(head, collapse = sep) + + # Write a or b. But a, b, or c. + if (n > 2) { + paste0(head, sep, final, " ", last) + } else { + paste0(head, " ", final, " ", last) + } +} + +# nocov end diff --git a/R/import-standalone-types-check.R b/R/import-standalone-types-check.R new file mode 100644 index 0000000..5214a00 --- /dev/null +++ b/R/import-standalone-types-check.R @@ -0,0 +1,596 @@ +# Standalone file: do not edit by hand +# Source: https://github.com/r-lib/rlang/blob/HEAD/R/standalone-types-check.R +# Generated by: usethis::use_standalone("r-lib/rlang", "types-check") +# ---------------------------------------------------------------------- +# +# --- +# repo: r-lib/rlang +# file: standalone-types-check.R +# last-updated: 2023-03-13 +# license: https://unlicense.org +# dependencies: standalone-obj-type.R +# imports: rlang (>= 1.1.0) +# --- +# +# ## Changelog +# +# 2024-08-15: +# - `check_character()` gains an `allow_na` argument (@martaalcalde, #1724) +# +# 2023-03-13: +# - Improved error messages of number checkers (@teunbrand) +# - Added `allow_infinite` argument to `check_number_whole()` (@mgirlich). +# - Added `check_data_frame()` (@mgirlich). +# +# 2023-03-07: +# - Added dependency on rlang (>= 1.1.0). +# +# 2023-02-15: +# - Added `check_logical()`. +# +# - `check_bool()`, `check_number_whole()`, and +# `check_number_decimal()` are now implemented in C. +# +# - For efficiency, `check_number_whole()` and +# `check_number_decimal()` now take a `NULL` default for `min` and +# `max`. This makes it possible to bypass unnecessary type-checking +# and comparisons in the default case of no bounds checks. +# +# 2022-10-07: +# - `check_number_whole()` and `_decimal()` no longer treat +# non-numeric types such as factors or dates as numbers. Numeric +# types are detected with `is.numeric()`. +# +# 2022-10-04: +# - Added `check_name()` that forbids the empty string. +# `check_string()` allows the empty string by default. +# +# 2022-09-28: +# - Removed `what` arguments. +# - Added `allow_na` and `allow_null` arguments. +# - Added `allow_decimal` and `allow_infinite` arguments. +# - Improved errors with absent arguments. +# +# +# 2022-09-16: +# - Unprefixed usage of rlang functions with `rlang::` to +# avoid onLoad issues when called from rlang (#1482). +# +# 2022-08-11: +# - Added changelog. +# +# nocov start + +# Scalars ----------------------------------------------------------------- + +.standalone_types_check_dot_call <- .Call + +check_bool <- function( + x, + ..., + allow_na = FALSE, + allow_null = FALSE, + arg = caller_arg(x), + call = caller_env() +) { + if ( + !missing(x) && + .standalone_types_check_dot_call( + ffi_standalone_is_bool_1.0.7, + x, + allow_na, + allow_null + ) + ) { + return(invisible(NULL)) + } + + stop_input_type( + x, + c("`TRUE`", "`FALSE`"), + ..., + allow_na = allow_na, + allow_null = allow_null, + arg = arg, + call = call + ) +} + +check_string <- function( + x, + ..., + allow_empty = TRUE, + allow_na = FALSE, + allow_null = FALSE, + arg = caller_arg(x), + call = caller_env() +) { + if (!missing(x)) { + is_string <- .rlang_check_is_string( + x, + allow_empty = allow_empty, + allow_na = allow_na, + allow_null = allow_null + ) + if (is_string) { + return(invisible(NULL)) + } + } + + stop_input_type( + x, + "a single string", + ..., + allow_na = allow_na, + allow_null = allow_null, + arg = arg, + call = call + ) +} + +.rlang_check_is_string <- function(x, allow_empty, allow_na, allow_null) { + if (is_string(x)) { + if (allow_empty || !is_string(x, "")) { + return(TRUE) + } + } + + if (allow_null && is_null(x)) { + return(TRUE) + } + + if (allow_na && (identical(x, NA) || identical(x, na_chr))) { + return(TRUE) + } + + FALSE +} + +check_name <- function( + x, + ..., + allow_null = FALSE, + arg = caller_arg(x), + call = caller_env() +) { + if (!missing(x)) { + is_string <- .rlang_check_is_string( + x, + allow_empty = FALSE, + allow_na = FALSE, + allow_null = allow_null + ) + if (is_string) { + return(invisible(NULL)) + } + } + + stop_input_type( + x, + "a valid name", + ..., + allow_na = FALSE, + allow_null = allow_null, + arg = arg, + call = call + ) +} + +IS_NUMBER_true <- 0 +IS_NUMBER_false <- 1 +IS_NUMBER_oob <- 2 + +check_number_decimal <- function( + x, + ..., + min = NULL, + max = NULL, + allow_infinite = TRUE, + allow_na = FALSE, + allow_null = FALSE, + arg = caller_arg(x), + call = caller_env() +) { + if (missing(x)) { + exit_code <- IS_NUMBER_false + } else if ( + 0 == + (exit_code <- .standalone_types_check_dot_call( + ffi_standalone_check_number_1.0.7, + x, + allow_decimal = TRUE, + min, + max, + allow_infinite, + allow_na, + allow_null + )) + ) { + return(invisible(NULL)) + } + + .stop_not_number( + x, + ..., + exit_code = exit_code, + allow_decimal = TRUE, + min = min, + max = max, + allow_na = allow_na, + allow_null = allow_null, + arg = arg, + call = call + ) +} + +check_number_whole <- function( + x, + ..., + min = NULL, + max = NULL, + allow_infinite = FALSE, + allow_na = FALSE, + allow_null = FALSE, + arg = caller_arg(x), + call = caller_env() +) { + if (missing(x)) { + exit_code <- IS_NUMBER_false + } else if ( + 0 == + (exit_code <- .standalone_types_check_dot_call( + ffi_standalone_check_number_1.0.7, + x, + allow_decimal = FALSE, + min, + max, + allow_infinite, + allow_na, + allow_null + )) + ) { + return(invisible(NULL)) + } + + .stop_not_number( + x, + ..., + exit_code = exit_code, + allow_decimal = FALSE, + min = min, + max = max, + allow_na = allow_na, + allow_null = allow_null, + arg = arg, + call = call + ) +} + +.stop_not_number <- function( + x, + ..., + exit_code, + allow_decimal, + min, + max, + allow_na, + allow_null, + arg, + call +) { + if (allow_decimal) { + what <- "a number" + } else { + what <- "a whole number" + } + + if (exit_code == IS_NUMBER_oob) { + min <- min %||% -Inf + max <- max %||% Inf + + if (min > -Inf && max < Inf) { + what <- sprintf("%s between %s and %s", what, min, max) + } else if (x < min) { + what <- sprintf("%s larger than or equal to %s", what, min) + } else if (x > max) { + what <- sprintf("%s smaller than or equal to %s", what, max) + } else { + abort("Unexpected state in OOB check", .internal = TRUE) + } + } + + stop_input_type( + x, + what, + ..., + allow_na = allow_na, + allow_null = allow_null, + arg = arg, + call = call + ) +} + +check_symbol <- function( + x, + ..., + allow_null = FALSE, + arg = caller_arg(x), + call = caller_env() +) { + if (!missing(x)) { + if (is_symbol(x)) { + return(invisible(NULL)) + } + if (allow_null && is_null(x)) { + return(invisible(NULL)) + } + } + + stop_input_type( + x, + "a symbol", + ..., + allow_na = FALSE, + allow_null = allow_null, + arg = arg, + call = call + ) +} + +check_arg <- function( + x, + ..., + allow_null = FALSE, + arg = caller_arg(x), + call = caller_env() +) { + if (!missing(x)) { + if (is_symbol(x)) { + return(invisible(NULL)) + } + if (allow_null && is_null(x)) { + return(invisible(NULL)) + } + } + + stop_input_type( + x, + "an argument name", + ..., + allow_na = FALSE, + allow_null = allow_null, + arg = arg, + call = call + ) +} + +check_call <- function( + x, + ..., + allow_null = FALSE, + arg = caller_arg(x), + call = caller_env() +) { + if (!missing(x)) { + if (is_call(x)) { + return(invisible(NULL)) + } + if (allow_null && is_null(x)) { + return(invisible(NULL)) + } + } + + stop_input_type( + x, + "a defused call", + ..., + allow_na = FALSE, + allow_null = allow_null, + arg = arg, + call = call + ) +} + +check_environment <- function( + x, + ..., + allow_null = FALSE, + arg = caller_arg(x), + call = caller_env() +) { + if (!missing(x)) { + if (is_environment(x)) { + return(invisible(NULL)) + } + if (allow_null && is_null(x)) { + return(invisible(NULL)) + } + } + + stop_input_type( + x, + "an environment", + ..., + allow_na = FALSE, + allow_null = allow_null, + arg = arg, + call = call + ) +} + +check_function <- function( + x, + ..., + allow_null = FALSE, + arg = caller_arg(x), + call = caller_env() +) { + if (!missing(x)) { + if (is_function(x)) { + return(invisible(NULL)) + } + if (allow_null && is_null(x)) { + return(invisible(NULL)) + } + } + + stop_input_type( + x, + "a function", + ..., + allow_na = FALSE, + allow_null = allow_null, + arg = arg, + call = call + ) +} + +check_closure <- function( + x, + ..., + allow_null = FALSE, + arg = caller_arg(x), + call = caller_env() +) { + if (!missing(x)) { + if (is_closure(x)) { + return(invisible(NULL)) + } + if (allow_null && is_null(x)) { + return(invisible(NULL)) + } + } + + stop_input_type( + x, + "an R function", + ..., + allow_na = FALSE, + allow_null = allow_null, + arg = arg, + call = call + ) +} + +check_formula <- function( + x, + ..., + allow_null = FALSE, + arg = caller_arg(x), + call = caller_env() +) { + if (!missing(x)) { + if (is_formula(x)) { + return(invisible(NULL)) + } + if (allow_null && is_null(x)) { + return(invisible(NULL)) + } + } + + stop_input_type( + x, + "a formula", + ..., + allow_na = FALSE, + allow_null = allow_null, + arg = arg, + call = call + ) +} + + +# Vectors ----------------------------------------------------------------- + +# TODO: Figure out what to do with logical `NA` and `allow_na = TRUE` + +check_character <- function( + x, + ..., + allow_na = TRUE, + allow_null = FALSE, + arg = caller_arg(x), + call = caller_env() +) { + if (!missing(x)) { + if (is_character(x)) { + if (!allow_na && any(is.na(x))) { + abort( + sprintf("`%s` can't contain NA values.", arg), + arg = arg, + call = call + ) + } + + return(invisible(NULL)) + } + + if (allow_null && is_null(x)) { + return(invisible(NULL)) + } + } + + stop_input_type( + x, + "a character vector", + ..., + allow_null = allow_null, + arg = arg, + call = call + ) +} + +check_logical <- function( + x, + ..., + allow_null = FALSE, + arg = caller_arg(x), + call = caller_env() +) { + if (!missing(x)) { + if (is_logical(x)) { + return(invisible(NULL)) + } + if (allow_null && is_null(x)) { + return(invisible(NULL)) + } + } + + stop_input_type( + x, + "a logical vector", + ..., + allow_na = FALSE, + allow_null = allow_null, + arg = arg, + call = call + ) +} + +check_data_frame <- function( + x, + ..., + allow_null = FALSE, + arg = caller_arg(x), + call = caller_env() +) { + if (!missing(x)) { + if (is.data.frame(x)) { + return(invisible(NULL)) + } + if (allow_null && is_null(x)) { + return(invisible(NULL)) + } + } + + stop_input_type( + x, + "a data frame", + ..., + allow_null = allow_null, + arg = arg, + call = call + ) +} + +# nocov end diff --git a/R/important-package.R b/R/important-package.R index a3a200e..85cf918 100644 --- a/R/important-package.R +++ b/R/important-package.R @@ -2,6 +2,7 @@ "_PACKAGE" ## usethis namespace: start +#' @import rlang #' @importFrom stats sd predict #' @importFrom hardhat extract_fit_parsnip extract_postprocessor From 5af6bc093b08ff2d1801f40045c6e65330b59d82 Mon Sep 17 00:00:00 2001 From: Emil Hvitfeldt Date: Tue, 19 Aug 2025 11:47:21 -0700 Subject: [PATCH 02/65] add recipe step --- DESCRIPTION | 4 +- NAMESPACE | 7 ++ R/important-package.R | 1 + R/step-select-1.R | 177 +++++++++++++++++++++++++++++++++++++++ man/rmd/tunable-args.Rmd | 40 +++++++++ man/step_select_1.Rd | 91 ++++++++++++++++++++ 6 files changed, 318 insertions(+), 2 deletions(-) create mode 100644 R/step-select-1.R create mode 100644 man/rmd/tunable-args.Rmd create mode 100644 man/step_select_1.Rd diff --git a/DESCRIPTION b/DESCRIPTION index fecde13..8bee702 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -15,7 +15,8 @@ License: MIT + file LICENSE URL: https://github.com/tidymodels/important BugReports: https://github.com/tidymodels/important/issues Depends: - R (>= 4.1.0) + R (>= 4.1.0), + recipes (>= 1.1.0) Imports: cli, dplyr, @@ -37,7 +38,6 @@ Suggests: mirai, modeldata, parsnip, - recipes, spelling, survival, testthat (>= 3.0.0), diff --git a/NAMESPACE b/NAMESPACE index b30e810..f2707bf 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,10 +1,17 @@ # Generated by roxygen2: do not edit by hand S3method(autoplot,importance_perm) +S3method(bake,step_select_1) +S3method(prep,step_select_1) +S3method(print,step_select_1) +S3method(tidy,step_select_1) +S3method(tunable,step_select_1) export(augment) export(autoplot) export(importance_perm) export(required_pkgs) +export(step_select_1) +import(recipes) import(rlang) importFrom(generics,augment) importFrom(generics,required_pkgs) diff --git a/R/important-package.R b/R/important-package.R index 85cf918..cfd3d3f 100644 --- a/R/important-package.R +++ b/R/important-package.R @@ -2,6 +2,7 @@ "_PACKAGE" ## usethis namespace: start +#' @import recipes #' @import rlang #' @importFrom stats sd predict #' @importFrom hardhat extract_fit_parsnip extract_postprocessor diff --git a/R/step-select-1.R b/R/step-select-1.R new file mode 100644 index 0000000..7ad148f --- /dev/null +++ b/R/step-select-1.R @@ -0,0 +1,177 @@ +#' Feature Selection +#' +#' `step_select_1()` creates a *specification* of a recipe step that will +#' perform feature selection by ... +#' +#' @inheritParams recipes::step_center +#' @param threshold ... +#' @param removals A character string that contains the names of columns that +#' should be removed. These values are not determined until [recipes::prep()] +#' is called. +#' @return An updated version of `recipe` with the new step added to the +#' sequence of any existing operations. +#' @export +#' +#' @details +#' +#' This step ... +#' +#' This step can potentially remove columns from the data set. This may +#' cause issues for subsequent steps in your recipe if the missing columns are +#' specifically referenced by name. To avoid this, see the advice in the +#' _Tips for saving recipes and filtering columns_ section of +#' [recipes::selections]. +#' +#' # Tidying +#' +#' When you [`tidy()`][recipes::tidy.recipe] this step, a tibble::tibble is +#' returned with columns `terms` and `id`: +#' +#' \describe{ +#' \item{terms}{character, the selectors or variables selected to be removed} +#' \item{id}{character, id of this step} +#' } +#' +#' ```{r, echo = FALSE, results="asis"} +#' step <- "step_select_1" +#' result <- knitr::knit_child("man/rmd/tunable-args.Rmd") +#' cat(result) +#' ``` +#' +#' The underlying operation does not allow for case weights. +#' +#' @examples +#' library(recipes) +#' +#' rec <- recipe(mpg ~ ., data = mtcars) |> +#' step_select_1(all_predictors()) +#' +#' prepped <- prep(rec) +#' +#' bake(prepped, mtcars) +#' +#' tidy(prepped, 1) +step_select_1 <- function( + recipe, + ..., + role = NA, + trained = FALSE, + threshold = 0.9, + removals = NULL, + skip = FALSE, + id = rand_id("select_1") +) { + add_step( + recipe, + step_select_1_new( + terms = enquos(...), + role = role, + trained = trained, + threshold = threshold, + removals = removals, + skip = skip, + id = id, + case_weights = NULL + ) + ) +} + +step_select_1_new <- + function( + terms, + role, + trained, + threshold, + removals, + skip, + id, + case_weights + ) { + step( + subclass = "select_1", + terms = terms, + role = role, + trained = trained, + threshold = threshold, + removals = removals, + skip = skip, + id = id, + case_weights = case_weights + ) + } + +#' @export +prep.step_select_1 <- function(x, training, info = NULL, ...) { + col_names <- recipes_eval_select(x$terms, training, info) + check_type(training[, col_names], types = c("double", "integer")) + check_number_decimal(x$threshold, min = 0, max = 1, arg = "threshold") + + wts <- get_case_weights(info, training) + were_weights_used <- are_weights_used(wts, unsupervised = TRUE) + if (isFALSE(were_weights_used)) { + wts <- NULL + } + + if (length(col_names) > 1) { + filter <- character(0) + } else { + filter <- character(0) + } + + step_select_1_new( + terms = x$terms, + role = x$role, + trained = TRUE, + threshold = x$threshold, + removals = filter, + skip = x$skip, + id = x$id, + case_weights = were_weights_used + ) +} + +#' @export +bake.step_select_1 <- function(object, new_data, ...) { + new_data <- recipes_remove_cols(new_data, object) + new_data +} + +#' @export +print.step_select_1 <- function(x, width = max(20, options()$width - 36), ...) { + title <- "Feature selection on " + print_step( + x$removals, + x$terms, + x$trained, + title, + width, + case_weights = x$case_weights + ) + invisible(x) +} + +#' @usage NULL +#' @export +tidy.step_select_1 <- function(x, ...) { + if (is_trained(x)) { + res <- tibble::tibble(terms = unname(x$removals)) + } else { + term_names <- sel2char(x$terms) + res <- tibble::tibble(terms = term_names) + } + res$id <- x$id + res +} + +#' @export +tunable.step_select_1 <- function(x, ...) { + tibble::tibble( + name = "threshold", + call_info = list( + list(pkg = "dials", fun = "threshold") + ), + source = "recipe", + component = "step_select_1", + component_id = x$id + ) +} diff --git a/man/rmd/tunable-args.Rmd b/man/rmd/tunable-args.Rmd new file mode 100644 index 0000000..d19d0ad --- /dev/null +++ b/man/rmd/tunable-args.Rmd @@ -0,0 +1,40 @@ +```{r} +#| include: false +get_dials <- function(x) { + if (any(names(x) == "range")) { + cl <- rlang::call2(x$fun, .ns = x$pkg, range = x$range) + } else { + cl <- rlang::call2(x$fun, .ns = x$pkg) + } + rlang::eval_tidy(cl) +} + +get_param_list <- function(x) { + args <- formals(x) + params <- getS3method("tunable", x)(list()) |> + dplyr::mutate( + default = args[name], + dials = purrr::map(call_info, get_dials), + label = purrr::map_chr(dials, \(.x) .x$label), + type = purrr::map_chr(dials, \(.x) .x$type), + item = glue::glue("- `{name}`: {label} (type: {type}, default: {default})\n\n") + ) + + params$item +} +``` + +# Tuning Parameters + +```{r} +#| echo: false +param <- get_param_list(step) +``` + +This step has `r length(param)` tuning parameters: + +```{r} +#| echo: false +#| results: asis +param +``` diff --git a/man/step_select_1.Rd b/man/step_select_1.Rd new file mode 100644 index 0000000..0d11117 --- /dev/null +++ b/man/step_select_1.Rd @@ -0,0 +1,91 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/step-select-1.R +\name{step_select_1} +\alias{step_select_1} +\title{Feature Selection} +\usage{ +step_select_1( + recipe, + ..., + role = NA, + trained = FALSE, + threshold = 0.9, + removals = NULL, + skip = FALSE, + id = rand_id("select_1") +) +} +\arguments{ +\item{recipe}{A recipe object. The step will be added to the sequence of +operations for this recipe.} + +\item{...}{One or more selector functions to choose variables for this step. +See \code{\link[recipes:selections]{selections()}} for more details.} + +\item{role}{Not used by this step since no new variables are created.} + +\item{trained}{A logical to indicate if the quantities for preprocessing have +been estimated.} + +\item{threshold}{...} + +\item{removals}{A character string that contains the names of columns that +should be removed. These values are not determined until \code{\link[recipes:prep]{recipes::prep()}} +is called.} + +\item{skip}{A logical. Should the step be skipped when the recipe is baked by +\code{\link[recipes:bake]{bake()}}? While all operations are baked when \code{\link[recipes:prep]{prep()}} is run, some +operations may not be able to be conducted on new data (e.g. processing the +outcome variable(s)). Care should be taken when using \code{skip = TRUE} as it +may affect the computations for subsequent operations.} + +\item{id}{A character string that is unique to this step to identify it.} +} +\value{ +An updated version of \code{recipe} with the new step added to the +sequence of any existing operations. +} +\description{ +\code{step_select_1()} creates a \emph{specification} of a recipe step that will +perform feature selection by ... +} +\details{ +This step ... + +This step can potentially remove columns from the data set. This may +cause issues for subsequent steps in your recipe if the missing columns are +specifically referenced by name. To avoid this, see the advice in the +\emph{Tips for saving recipes and filtering columns} section of +\link[recipes:selections]{recipes::selections}. +} +\section{Tidying}{ +When you \code{\link[recipes:tidy.recipe]{tidy()}} this step, a tibble::tibble is +returned with columns \code{terms} and \code{id}: + +\describe{ +\item{terms}{character, the selectors or variables selected to be removed} +\item{id}{character, id of this step} +} +} + +\section{Tuning Parameters}{ +This step has 1 tuning parameters: +\itemize{ +\item \code{threshold}: Threshold (type: double, default: 0.9) +} + +The underlying operation does not allow for case weights. +} + +\examples{ +library(recipes) + +rec <- recipe(mpg ~ ., data = mtcars) |> + step_select_1(all_predictors()) + +prepped <- prep(rec) + +bake(prepped, mtcars) + +tidy(prepped, 1) +} From 33141d0ab0c77ba9525f0f15ddf0b284475e9e90 Mon Sep 17 00:00:00 2001 From: Emil Hvitfeldt Date: Tue, 19 Aug 2025 11:50:12 -0700 Subject: [PATCH 03/65] add 2 more steps --- NAMESPACE | 12 +++ R/step-select-2.R | 177 +++++++++++++++++++++++++++++++++++++++++++ R/step-select-3.R | 177 +++++++++++++++++++++++++++++++++++++++++++ man/step_select_2.Rd | 91 ++++++++++++++++++++++ man/step_select_3.Rd | 91 ++++++++++++++++++++++ 5 files changed, 548 insertions(+) create mode 100644 R/step-select-2.R create mode 100644 R/step-select-3.R create mode 100644 man/step_select_2.Rd create mode 100644 man/step_select_3.Rd diff --git a/NAMESPACE b/NAMESPACE index f2707bf..19e2c23 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -2,15 +2,27 @@ S3method(autoplot,importance_perm) S3method(bake,step_select_1) +S3method(bake,step_select_2) +S3method(bake,step_select_3) S3method(prep,step_select_1) +S3method(prep,step_select_2) +S3method(prep,step_select_3) S3method(print,step_select_1) +S3method(print,step_select_2) +S3method(print,step_select_3) S3method(tidy,step_select_1) +S3method(tidy,step_select_2) +S3method(tidy,step_select_3) S3method(tunable,step_select_1) +S3method(tunable,step_select_2) +S3method(tunable,step_select_3) export(augment) export(autoplot) export(importance_perm) export(required_pkgs) export(step_select_1) +export(step_select_2) +export(step_select_3) import(recipes) import(rlang) importFrom(generics,augment) diff --git a/R/step-select-2.R b/R/step-select-2.R new file mode 100644 index 0000000..afaeb45 --- /dev/null +++ b/R/step-select-2.R @@ -0,0 +1,177 @@ +#' Feature Selection +#' +#' `step_select_2()` creates a *specification* of a recipe step that will +#' perform feature selection by ... +#' +#' @inheritParams recipes::step_center +#' @param threshold ... +#' @param removals A character string that contains the names of columns that +#' should be removed. These values are not determined until [recipes::prep()] +#' is called. +#' @return An updated version of `recipe` with the new step added to the +#' sequence of any existing operations. +#' @export +#' +#' @details +#' +#' This step ... +#' +#' This step can potentially remove columns from the data set. This may +#' cause issues for subsequent steps in your recipe if the missing columns are +#' specifically referenced by name. To avoid this, see the advice in the +#' _Tips for saving recipes and filtering columns_ section of +#' [recipes::selections]. +#' +#' # Tidying +#' +#' When you [`tidy()`][recipes::tidy.recipe] this step, a tibble::tibble is +#' returned with columns `terms` and `id`: +#' +#' \describe{ +#' \item{terms}{character, the selectors or variables selected to be removed} +#' \item{id}{character, id of this step} +#' } +#' +#' ```{r, echo = FALSE, results="asis"} +#' step <- "step_select_2" +#' result <- knitr::knit_child("man/rmd/tunable-args.Rmd") +#' cat(result) +#' ``` +#' +#' The underlying operation does not allow for case weights. +#' +#' @examples +#' library(recipes) +#' +#' rec <- recipe(mpg ~ ., data = mtcars) |> +#' step_select_2(all_predictors()) +#' +#' prepped <- prep(rec) +#' +#' bake(prepped, mtcars) +#' +#' tidy(prepped, 1) +step_select_2 <- function( + recipe, + ..., + role = NA, + trained = FALSE, + threshold = 0.9, + removals = NULL, + skip = FALSE, + id = rand_id("select_2") +) { + add_step( + recipe, + step_select_2_new( + terms = enquos(...), + role = role, + trained = trained, + threshold = threshold, + removals = removals, + skip = skip, + id = id, + case_weights = NULL + ) + ) +} + +step_select_2_new <- + function( + terms, + role, + trained, + threshold, + removals, + skip, + id, + case_weights + ) { + step( + subclass = "select_2", + terms = terms, + role = role, + trained = trained, + threshold = threshold, + removals = removals, + skip = skip, + id = id, + case_weights = case_weights + ) + } + +#' @export +prep.step_select_2 <- function(x, training, info = NULL, ...) { + col_names <- recipes_eval_select(x$terms, training, info) + check_type(training[, col_names], types = c("double", "integer")) + check_number_decimal(x$threshold, min = 0, max = 1, arg = "threshold") + + wts <- get_case_weights(info, training) + were_weights_used <- are_weights_used(wts, unsupervised = TRUE) + if (isFALSE(were_weights_used)) { + wts <- NULL + } + + if (length(col_names) > 1) { + filter <- character(0) + } else { + filter <- character(0) + } + + step_select_2_new( + terms = x$terms, + role = x$role, + trained = TRUE, + threshold = x$threshold, + removals = filter, + skip = x$skip, + id = x$id, + case_weights = were_weights_used + ) +} + +#' @export +bake.step_select_2 <- function(object, new_data, ...) { + new_data <- recipes_remove_cols(new_data, object) + new_data +} + +#' @export +print.step_select_2 <- function(x, width = max(20, options()$width - 36), ...) { + title <- "Feature selection on " + print_step( + x$removals, + x$terms, + x$trained, + title, + width, + case_weights = x$case_weights + ) + invisible(x) +} + +#' @usage NULL +#' @export +tidy.step_select_2 <- function(x, ...) { + if (is_trained(x)) { + res <- tibble::tibble(terms = unname(x$removals)) + } else { + term_names <- sel2char(x$terms) + res <- tibble::tibble(terms = term_names) + } + res$id <- x$id + res +} + +#' @export +tunable.step_select_2 <- function(x, ...) { + tibble::tibble( + name = "threshold", + call_info = list( + list(pkg = "dials", fun = "threshold") + ), + source = "recipe", + component = "step_select_2", + component_id = x$id + ) +} diff --git a/R/step-select-3.R b/R/step-select-3.R new file mode 100644 index 0000000..de6e1d4 --- /dev/null +++ b/R/step-select-3.R @@ -0,0 +1,177 @@ +#' Feature Selection +#' +#' `step_select_3()` creates a *specification* of a recipe step that will +#' perform feature selection by ... +#' +#' @inheritParams recipes::step_center +#' @param threshold ... +#' @param removals A character string that contains the names of columns that +#' should be removed. These values are not determined until [recipes::prep()] +#' is called. +#' @return An updated version of `recipe` with the new step added to the +#' sequence of any existing operations. +#' @export +#' +#' @details +#' +#' This step ... +#' +#' This step can potentially remove columns from the data set. This may +#' cause issues for subsequent steps in your recipe if the missing columns are +#' specifically referenced by name. To avoid this, see the advice in the +#' _Tips for saving recipes and filtering columns_ section of +#' [recipes::selections]. +#' +#' # Tidying +#' +#' When you [`tidy()`][recipes::tidy.recipe] this step, a tibble::tibble is +#' returned with columns `terms` and `id`: +#' +#' \describe{ +#' \item{terms}{character, the selectors or variables selected to be removed} +#' \item{id}{character, id of this step} +#' } +#' +#' ```{r, echo = FALSE, results="asis"} +#' step <- "step_select_3" +#' result <- knitr::knit_child("man/rmd/tunable-args.Rmd") +#' cat(result) +#' ``` +#' +#' The underlying operation does not allow for case weights. +#' +#' @examples +#' library(recipes) +#' +#' rec <- recipe(mpg ~ ., data = mtcars) |> +#' step_select_3(all_predictors()) +#' +#' prepped <- prep(rec) +#' +#' bake(prepped, mtcars) +#' +#' tidy(prepped, 1) +step_select_3 <- function( + recipe, + ..., + role = NA, + trained = FALSE, + threshold = 0.9, + removals = NULL, + skip = FALSE, + id = rand_id("select_3") +) { + add_step( + recipe, + step_select_3_new( + terms = enquos(...), + role = role, + trained = trained, + threshold = threshold, + removals = removals, + skip = skip, + id = id, + case_weights = NULL + ) + ) +} + +step_select_3_new <- + function( + terms, + role, + trained, + threshold, + removals, + skip, + id, + case_weights + ) { + step( + subclass = "select_3", + terms = terms, + role = role, + trained = trained, + threshold = threshold, + removals = removals, + skip = skip, + id = id, + case_weights = case_weights + ) + } + +#' @export +prep.step_select_3 <- function(x, training, info = NULL, ...) { + col_names <- recipes_eval_select(x$terms, training, info) + check_type(training[, col_names], types = c("double", "integer")) + check_number_decimal(x$threshold, min = 0, max = 1, arg = "threshold") + + wts <- get_case_weights(info, training) + were_weights_used <- are_weights_used(wts, unsupervised = TRUE) + if (isFALSE(were_weights_used)) { + wts <- NULL + } + + if (length(col_names) > 1) { + filter <- character(0) + } else { + filter <- character(0) + } + + step_select_3_new( + terms = x$terms, + role = x$role, + trained = TRUE, + threshold = x$threshold, + removals = filter, + skip = x$skip, + id = x$id, + case_weights = were_weights_used + ) +} + +#' @export +bake.step_select_3 <- function(object, new_data, ...) { + new_data <- recipes_remove_cols(new_data, object) + new_data +} + +#' @export +print.step_select_3 <- function(x, width = max(20, options()$width - 36), ...) { + title <- "Feature selection on " + print_step( + x$removals, + x$terms, + x$trained, + title, + width, + case_weights = x$case_weights + ) + invisible(x) +} + +#' @usage NULL +#' @export +tidy.step_select_3 <- function(x, ...) { + if (is_trained(x)) { + res <- tibble::tibble(terms = unname(x$removals)) + } else { + term_names <- sel2char(x$terms) + res <- tibble::tibble(terms = term_names) + } + res$id <- x$id + res +} + +#' @export +tunable.step_select_3 <- function(x, ...) { + tibble::tibble( + name = "threshold", + call_info = list( + list(pkg = "dials", fun = "threshold") + ), + source = "recipe", + component = "step_select_3", + component_id = x$id + ) +} diff --git a/man/step_select_2.Rd b/man/step_select_2.Rd new file mode 100644 index 0000000..de37c94 --- /dev/null +++ b/man/step_select_2.Rd @@ -0,0 +1,91 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/step-select-2.R +\name{step_select_2} +\alias{step_select_2} +\title{Feature Selection} +\usage{ +step_select_2( + recipe, + ..., + role = NA, + trained = FALSE, + threshold = 0.9, + removals = NULL, + skip = FALSE, + id = rand_id("select_2") +) +} +\arguments{ +\item{recipe}{A recipe object. The step will be added to the sequence of +operations for this recipe.} + +\item{...}{One or more selector functions to choose variables for this step. +See \code{\link[recipes:selections]{selections()}} for more details.} + +\item{role}{Not used by this step since no new variables are created.} + +\item{trained}{A logical to indicate if the quantities for preprocessing have +been estimated.} + +\item{threshold}{...} + +\item{removals}{A character string that contains the names of columns that +should be removed. These values are not determined until \code{\link[recipes:prep]{recipes::prep()}} +is called.} + +\item{skip}{A logical. Should the step be skipped when the recipe is baked by +\code{\link[recipes:bake]{bake()}}? While all operations are baked when \code{\link[recipes:prep]{prep()}} is run, some +operations may not be able to be conducted on new data (e.g. processing the +outcome variable(s)). Care should be taken when using \code{skip = TRUE} as it +may affect the computations for subsequent operations.} + +\item{id}{A character string that is unique to this step to identify it.} +} +\value{ +An updated version of \code{recipe} with the new step added to the +sequence of any existing operations. +} +\description{ +\code{step_select_2()} creates a \emph{specification} of a recipe step that will +perform feature selection by ... +} +\details{ +This step ... + +This step can potentially remove columns from the data set. This may +cause issues for subsequent steps in your recipe if the missing columns are +specifically referenced by name. To avoid this, see the advice in the +\emph{Tips for saving recipes and filtering columns} section of +\link[recipes:selections]{recipes::selections}. +} +\section{Tidying}{ +When you \code{\link[recipes:tidy.recipe]{tidy()}} this step, a tibble::tibble is +returned with columns \code{terms} and \code{id}: + +\describe{ +\item{terms}{character, the selectors or variables selected to be removed} +\item{id}{character, id of this step} +} +} + +\section{Tuning Parameters}{ +This step has 1 tuning parameters: +\itemize{ +\item \code{threshold}: Threshold (type: double, default: 0.9) +} + +The underlying operation does not allow for case weights. +} + +\examples{ +library(recipes) + +rec <- recipe(mpg ~ ., data = mtcars) |> + step_select_2(all_predictors()) + +prepped <- prep(rec) + +bake(prepped, mtcars) + +tidy(prepped, 1) +} diff --git a/man/step_select_3.Rd b/man/step_select_3.Rd new file mode 100644 index 0000000..160bd54 --- /dev/null +++ b/man/step_select_3.Rd @@ -0,0 +1,91 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/step-select-3.R +\name{step_select_3} +\alias{step_select_3} +\title{Feature Selection} +\usage{ +step_select_3( + recipe, + ..., + role = NA, + trained = FALSE, + threshold = 0.9, + removals = NULL, + skip = FALSE, + id = rand_id("select_3") +) +} +\arguments{ +\item{recipe}{A recipe object. The step will be added to the sequence of +operations for this recipe.} + +\item{...}{One or more selector functions to choose variables for this step. +See \code{\link[recipes:selections]{selections()}} for more details.} + +\item{role}{Not used by this step since no new variables are created.} + +\item{trained}{A logical to indicate if the quantities for preprocessing have +been estimated.} + +\item{threshold}{...} + +\item{removals}{A character string that contains the names of columns that +should be removed. These values are not determined until \code{\link[recipes:prep]{recipes::prep()}} +is called.} + +\item{skip}{A logical. Should the step be skipped when the recipe is baked by +\code{\link[recipes:bake]{bake()}}? While all operations are baked when \code{\link[recipes:prep]{prep()}} is run, some +operations may not be able to be conducted on new data (e.g. processing the +outcome variable(s)). Care should be taken when using \code{skip = TRUE} as it +may affect the computations for subsequent operations.} + +\item{id}{A character string that is unique to this step to identify it.} +} +\value{ +An updated version of \code{recipe} with the new step added to the +sequence of any existing operations. +} +\description{ +\code{step_select_3()} creates a \emph{specification} of a recipe step that will +perform feature selection by ... +} +\details{ +This step ... + +This step can potentially remove columns from the data set. This may +cause issues for subsequent steps in your recipe if the missing columns are +specifically referenced by name. To avoid this, see the advice in the +\emph{Tips for saving recipes and filtering columns} section of +\link[recipes:selections]{recipes::selections}. +} +\section{Tidying}{ +When you \code{\link[recipes:tidy.recipe]{tidy()}} this step, a tibble::tibble is +returned with columns \code{terms} and \code{id}: + +\describe{ +\item{terms}{character, the selectors or variables selected to be removed} +\item{id}{character, id of this step} +} +} + +\section{Tuning Parameters}{ +This step has 1 tuning parameters: +\itemize{ +\item \code{threshold}: Threshold (type: double, default: 0.9) +} + +The underlying operation does not allow for case weights. +} + +\examples{ +library(recipes) + +rec <- recipe(mpg ~ ., data = mtcars) |> + step_select_3(all_predictors()) + +prepped <- prep(rec) + +bake(prepped, mtcars) + +tidy(prepped, 1) +} From 0e60102e6e5eaf432be9ca918f2e5a01aafe7aed Mon Sep 17 00:00:00 2001 From: Emil Hvitfeldt Date: Tue, 19 Aug 2025 11:56:15 -0700 Subject: [PATCH 04/65] add infrastruture tests --- tests/testthat/_snaps/step-select-1.md | 77 ++++++++++++++++++++++ tests/testthat/_snaps/step-select-2.md | 77 ++++++++++++++++++++++ tests/testthat/_snaps/step-select-3.md | 77 ++++++++++++++++++++++ tests/testthat/test-step-select-1.R | 89 ++++++++++++++++++++++++++ tests/testthat/test-step-select-2.R | 89 ++++++++++++++++++++++++++ tests/testthat/test-step-select-3.R | 89 ++++++++++++++++++++++++++ 6 files changed, 498 insertions(+) create mode 100644 tests/testthat/_snaps/step-select-1.md create mode 100644 tests/testthat/_snaps/step-select-2.md create mode 100644 tests/testthat/_snaps/step-select-3.md create mode 100644 tests/testthat/test-step-select-1.R create mode 100644 tests/testthat/test-step-select-2.R create mode 100644 tests/testthat/test-step-select-3.R diff --git a/tests/testthat/_snaps/step-select-1.md b/tests/testthat/_snaps/step-select-1.md new file mode 100644 index 0000000..a02cb7f --- /dev/null +++ b/tests/testthat/_snaps/step-select-1.md @@ -0,0 +1,77 @@ +# empty printing + + Code + rec + Message + + -- Recipe ---------------------------------------------------------------------- + + -- Inputs + Number of variables by role + outcome: 1 + predictor: 10 + + -- Operations + * Feature selection on: + +--- + + Code + rec + Message + + -- Recipe ---------------------------------------------------------------------- + + -- Inputs + Number of variables by role + outcome: 1 + predictor: 10 + + -- Training information + Training data contained 32 data points and no incomplete rows. + + -- Operations + * Feature selection on: | Trained + +# printing + + Code + print(rec) + Message + + -- Recipe ---------------------------------------------------------------------- + + -- Inputs + Number of variables by role + predictor: 11 + + -- Operations + * Feature selection on: all_predictors() + +--- + + Code + prep(rec) + Message + + -- Recipe ---------------------------------------------------------------------- + + -- Inputs + Number of variables by role + predictor: 11 + + -- Training information + Training data contained 32 data points and no incomplete rows. + + -- Operations + * Feature selection on: | Trained + +# bad args + + Code + prep(step_select_1(recipe(mpg ~ ., mtcars), all_predictors(), threshold = 2)) + Condition + Error in `step_select_1()`: + Caused by error in `prep()`: + ! `threshold` must be a number between 0 and 1, not the number 2. + diff --git a/tests/testthat/_snaps/step-select-2.md b/tests/testthat/_snaps/step-select-2.md new file mode 100644 index 0000000..346878e --- /dev/null +++ b/tests/testthat/_snaps/step-select-2.md @@ -0,0 +1,77 @@ +# empty printing + + Code + rec + Message + + -- Recipe ---------------------------------------------------------------------- + + -- Inputs + Number of variables by role + outcome: 1 + predictor: 10 + + -- Operations + * Feature selection on: + +--- + + Code + rec + Message + + -- Recipe ---------------------------------------------------------------------- + + -- Inputs + Number of variables by role + outcome: 1 + predictor: 10 + + -- Training information + Training data contained 32 data points and no incomplete rows. + + -- Operations + * Feature selection on: | Trained + +# printing + + Code + print(rec) + Message + + -- Recipe ---------------------------------------------------------------------- + + -- Inputs + Number of variables by role + predictor: 11 + + -- Operations + * Feature selection on: all_predictors() + +--- + + Code + prep(rec) + Message + + -- Recipe ---------------------------------------------------------------------- + + -- Inputs + Number of variables by role + predictor: 11 + + -- Training information + Training data contained 32 data points and no incomplete rows. + + -- Operations + * Feature selection on: | Trained + +# bad args + + Code + prep(step_select_2(recipe(mpg ~ ., mtcars), all_predictors(), threshold = 2)) + Condition + Error in `step_select_2()`: + Caused by error in `prep()`: + ! `threshold` must be a number between 0 and 1, not the number 2. + diff --git a/tests/testthat/_snaps/step-select-3.md b/tests/testthat/_snaps/step-select-3.md new file mode 100644 index 0000000..4ff1096 --- /dev/null +++ b/tests/testthat/_snaps/step-select-3.md @@ -0,0 +1,77 @@ +# empty printing + + Code + rec + Message + + -- Recipe ---------------------------------------------------------------------- + + -- Inputs + Number of variables by role + outcome: 1 + predictor: 10 + + -- Operations + * Feature selection on: + +--- + + Code + rec + Message + + -- Recipe ---------------------------------------------------------------------- + + -- Inputs + Number of variables by role + outcome: 1 + predictor: 10 + + -- Training information + Training data contained 32 data points and no incomplete rows. + + -- Operations + * Feature selection on: | Trained + +# printing + + Code + print(rec) + Message + + -- Recipe ---------------------------------------------------------------------- + + -- Inputs + Number of variables by role + predictor: 11 + + -- Operations + * Feature selection on: all_predictors() + +--- + + Code + prep(rec) + Message + + -- Recipe ---------------------------------------------------------------------- + + -- Inputs + Number of variables by role + predictor: 11 + + -- Training information + Training data contained 32 data points and no incomplete rows. + + -- Operations + * Feature selection on: | Trained + +# bad args + + Code + prep(step_select_3(recipe(mpg ~ ., mtcars), all_predictors(), threshold = 2)) + Condition + Error in `step_select_3()`: + Caused by error in `prep()`: + ! `threshold` must be a number between 0 and 1, not the number 2. + diff --git a/tests/testthat/test-step-select-1.R b/tests/testthat/test-step-select-1.R new file mode 100644 index 0000000..6113873 --- /dev/null +++ b/tests/testthat/test-step-select-1.R @@ -0,0 +1,89 @@ +# Infrastructure --------------------------------------------------------------- + +test_that("bake method errors when needed non-standard role columns are missing", { + # Here for completeness + # step_select_1() removes variables and thus does not care if they are not there. + expect_true(TRUE) +}) + +test_that("empty printing", { + rec <- recipe(mpg ~ ., mtcars) + rec <- step_select_1(rec) + + expect_snapshot(rec) + + rec <- prep(rec, mtcars) + + expect_snapshot(rec) +}) + +test_that("empty selection prep/bake is a no-op", { + rec1 <- recipe(mpg ~ ., mtcars) + rec2 <- step_select_1(rec1) + + rec1 <- prep(rec1, mtcars) + rec2 <- prep(rec2, mtcars) + + baked1 <- bake(rec1, mtcars) + baked2 <- bake(rec2, mtcars) + + expect_identical(baked1, baked2) +}) + +test_that("empty selection tidy method works", { + rec <- recipe(mpg ~ ., mtcars) + rec <- step_select_1(rec) + + expect <- tibble(terms = character(), id = character()) + + expect_identical(tidy(rec, number = 1), expect) + + rec <- prep(rec, mtcars) + + expect_identical(tidy(rec, number = 1), expect) +}) + +test_that("printing", { + set.seed(1) + rec <- recipe(~., data = mtcars) |> + step_select_1(all_predictors()) + + expect_snapshot(print(rec)) + expect_snapshot(prep(rec)) +}) + +test_that("tunable is setup to work with extract_parameter_set_dials", { + skip_if_not_installed("dials") + rec <- recipe(~., data = mtcars) |> + step_select_1(all_predictors(), threshold = hardhat::tune()) + + params <- extract_parameter_set_dials(rec) + + expect_s3_class(params, "parameters") + expect_identical(nrow(params), 1L) +}) + +test_that("bad args", { + expect_snapshot( + recipe(mpg ~ ., mtcars) |> + step_select_1(all_predictors(), threshold = 2) |> + prep(), + error = TRUE + ) +}) + +test_that("0 and 1 rows data work in bake method", { + data <- mtcars + rec <- recipe(~., data) |> + step_select_1(all_numeric_predictors()) |> + prep() + + expect_identical( + nrow(bake(rec, slice(data, 1))), + 1L + ) + expect_identical( + nrow(bake(rec, slice(data, 0))), + 0L + ) +}) diff --git a/tests/testthat/test-step-select-2.R b/tests/testthat/test-step-select-2.R new file mode 100644 index 0000000..f07d291 --- /dev/null +++ b/tests/testthat/test-step-select-2.R @@ -0,0 +1,89 @@ +# Infrastructure --------------------------------------------------------------- + +test_that("bake method errors when needed non-standard role columns are missing", { + # Here for completeness + # step_select_2() removes variables and thus does not care if they are not there. + expect_true(TRUE) +}) + +test_that("empty printing", { + rec <- recipe(mpg ~ ., mtcars) + rec <- step_select_2(rec) + + expect_snapshot(rec) + + rec <- prep(rec, mtcars) + + expect_snapshot(rec) +}) + +test_that("empty selection prep/bake is a no-op", { + rec1 <- recipe(mpg ~ ., mtcars) + rec2 <- step_select_2(rec1) + + rec1 <- prep(rec1, mtcars) + rec2 <- prep(rec2, mtcars) + + baked1 <- bake(rec1, mtcars) + baked2 <- bake(rec2, mtcars) + + expect_identical(baked1, baked2) +}) + +test_that("empty selection tidy method works", { + rec <- recipe(mpg ~ ., mtcars) + rec <- step_select_2(rec) + + expect <- tibble(terms = character(), id = character()) + + expect_identical(tidy(rec, number = 1), expect) + + rec <- prep(rec, mtcars) + + expect_identical(tidy(rec, number = 1), expect) +}) + +test_that("printing", { + set.seed(1) + rec <- recipe(~., data = mtcars) |> + step_select_2(all_predictors()) + + expect_snapshot(print(rec)) + expect_snapshot(prep(rec)) +}) + +test_that("tunable is setup to work with extract_parameter_set_dials", { + skip_if_not_installed("dials") + rec <- recipe(~., data = mtcars) |> + step_select_2(all_predictors(), threshold = hardhat::tune()) + + params <- extract_parameter_set_dials(rec) + + expect_s3_class(params, "parameters") + expect_identical(nrow(params), 1L) +}) + +test_that("bad args", { + expect_snapshot( + recipe(mpg ~ ., mtcars) |> + step_select_2(all_predictors(), threshold = 2) |> + prep(), + error = TRUE + ) +}) + +test_that("0 and 1 rows data work in bake method", { + data <- mtcars + rec <- recipe(~., data) |> + step_select_2(all_numeric_predictors()) |> + prep() + + expect_identical( + nrow(bake(rec, slice(data, 1))), + 1L + ) + expect_identical( + nrow(bake(rec, slice(data, 0))), + 0L + ) +}) diff --git a/tests/testthat/test-step-select-3.R b/tests/testthat/test-step-select-3.R new file mode 100644 index 0000000..b314af6 --- /dev/null +++ b/tests/testthat/test-step-select-3.R @@ -0,0 +1,89 @@ +# Infrastructure --------------------------------------------------------------- + +test_that("bake method errors when needed non-standard role columns are missing", { + # Here for completeness + # step_select_3() removes variables and thus does not care if they are not there. + expect_true(TRUE) +}) + +test_that("empty printing", { + rec <- recipe(mpg ~ ., mtcars) + rec <- step_select_3(rec) + + expect_snapshot(rec) + + rec <- prep(rec, mtcars) + + expect_snapshot(rec) +}) + +test_that("empty selection prep/bake is a no-op", { + rec1 <- recipe(mpg ~ ., mtcars) + rec2 <- step_select_3(rec1) + + rec1 <- prep(rec1, mtcars) + rec2 <- prep(rec2, mtcars) + + baked1 <- bake(rec1, mtcars) + baked2 <- bake(rec2, mtcars) + + expect_identical(baked1, baked2) +}) + +test_that("empty selection tidy method works", { + rec <- recipe(mpg ~ ., mtcars) + rec <- step_select_3(rec) + + expect <- tibble(terms = character(), id = character()) + + expect_identical(tidy(rec, number = 1), expect) + + rec <- prep(rec, mtcars) + + expect_identical(tidy(rec, number = 1), expect) +}) + +test_that("printing", { + set.seed(1) + rec <- recipe(~., data = mtcars) |> + step_select_3(all_predictors()) + + expect_snapshot(print(rec)) + expect_snapshot(prep(rec)) +}) + +test_that("tunable is setup to work with extract_parameter_set_dials", { + skip_if_not_installed("dials") + rec <- recipe(~., data = mtcars) |> + step_select_3(all_predictors(), threshold = hardhat::tune()) + + params <- extract_parameter_set_dials(rec) + + expect_s3_class(params, "parameters") + expect_identical(nrow(params), 1L) +}) + +test_that("bad args", { + expect_snapshot( + recipe(mpg ~ ., mtcars) |> + step_select_3(all_predictors(), threshold = 2) |> + prep(), + error = TRUE + ) +}) + +test_that("0 and 1 rows data work in bake method", { + data <- mtcars + rec <- recipe(~., data) |> + step_select_3(all_numeric_predictors()) |> + prep() + + expect_identical( + nrow(bake(rec, slice(data, 1))), + 1L + ) + expect_identical( + nrow(bake(rec, slice(data, 0))), + 0L + ) +}) From 1b54b75fb7de4731aad7cf664271c9370e843054 Mon Sep 17 00:00:00 2001 From: topepo Date: Tue, 19 Aug 2025 17:29:57 -0400 Subject: [PATCH 05/65] some utility functions for recipes --- R/important-package.R | 3 +- R/recipe_utils.R | 22 ++++++++++++ tests/testthat/_snaps/recipes_utils.md | 16 +++++++++ tests/testthat/helper-objects.R | 6 ++++ tests/testthat/test-recipes_utils.R | 49 ++++++++++++++++++++++++++ 5 files changed, 95 insertions(+), 1 deletion(-) create mode 100644 R/recipe_utils.R create mode 100644 tests/testthat/_snaps/recipes_utils.md create mode 100644 tests/testthat/test-recipes_utils.R diff --git a/R/important-package.R b/R/important-package.R index cfd3d3f..f085509 100644 --- a/R/important-package.R +++ b/R/important-package.R @@ -29,7 +29,8 @@ utils::globalVariables( "permuted", "predictor", "ranking", - "std_err" + "std_err", + "score" ) ) ## usethis namespace: end diff --git a/R/recipe_utils.R b/R/recipe_utils.R new file mode 100644 index 0000000..8ff0ee0 --- /dev/null +++ b/R/recipe_utils.R @@ -0,0 +1,22 @@ +# Check input for step_predictor_desirability() +check_desirability_arg <- function(x) { + # check for `c()` + # check not empty +} + +# For an expression or quosure, list the user-specificied score functions +extract_score_names <- function(x, call = rlang::caller_env()) { + if (rlang::is_quosure(x)) { + if (rlang::quo_is_missing(x)) { + cli::cli_abort("{.arg score} cannot be empty.", call = call) + } + x <- rlang::quo_get_expr(x) + } + res <- unique(all.vars(x)) + res <- grep("^score_", res, value = TRUE) + if (length(res) == 0) { + cli::cli_abort("No score objects were found in {.arg score}.", call = call) + } + res +} + diff --git a/tests/testthat/_snaps/recipes_utils.md b/tests/testthat/_snaps/recipes_utils.md new file mode 100644 index 0000000..4dbcbfa --- /dev/null +++ b/tests/testthat/_snaps/recipes_utils.md @@ -0,0 +1,16 @@ +# Find score objects in an expression or quosure + + Code + important:::extract_score_names(obj_4) + Condition + Error: + ! No score objects were found in `score`. + +--- + + Code + important:::extract_score_names(obj_5) + Condition + Error: + ! `score` cannot be empty. + diff --git a/tests/testthat/helper-objects.R b/tests/testthat/helper-objects.R index 20e4b0a..7ea19d9 100644 --- a/tests/testthat/helper-objects.R +++ b/tests/testthat/helper-objects.R @@ -91,4 +91,10 @@ ex_seed <- id = 548676L ) +# ------------------------------------------------------------------------------ +# for recipes + +POTATO <- function(x) { + rlang::enquo(x) +} diff --git a/tests/testthat/test-recipes_utils.R b/tests/testthat/test-recipes_utils.R new file mode 100644 index 0000000..2dfc914 --- /dev/null +++ b/tests/testthat/test-recipes_utils.R @@ -0,0 +1,49 @@ +test_that("Find score objects in an expression or quosure", { + + obj_1 <- POTATO(c(minimize(score_a), constrain(score_b))) + res_1 <- important:::extract_score_names(obj_1) + expect_equal(res_1, c("score_a", "score_b")) + expect_equal( + obj_1 |> rlang::quo_get_expr() |> important:::extract_score_names(), + c("score_a", "score_b") + ) + + obj_2 <- POTATO(c(minimize(score_a), constrain(score_a))) + res_2 <- important:::extract_score_names(obj_2) + expect_equal(res_2, c("score_a")) + expect_equal( + obj_2 |> rlang::quo_get_expr() |> important:::extract_score_names(), + c("score_a") + ) + + obj_3 <- POTATO(score_a < 2 & score_b > 3) + res_3 <- important:::extract_score_names(obj_3) + expect_equal(res_3, c("score_a", "score_b")) + expect_equal( + obj_3 |> rlang::quo_get_expr() |> important:::extract_score_names(), + c("score_a", "score_b") + ) + + obj_4 <- POTATO(a < 2 & b > 3) + expect_snapshot(important:::extract_score_names(obj_4), error = TRUE) + + obj_5 <- POTATO() + expect_snapshot(important:::extract_score_names(obj_5), error = TRUE) + + obj_6 <- POTATO(I(score_a < 2) & predictor == "another potato") + res_6 <- important:::extract_score_names(obj_6) + expect_equal(res_6, c("score_a")) + expect_equal( + obj_6 |> rlang::quo_get_expr() |> important:::extract_score_names(), + c("score_a") + ) + + obj_7 <- POTATO(score_a / c < 2) + res_7 <- important:::extract_score_names(obj_7) + expect_equal(res_7, c("score_a")) + expect_equal( + obj_7 |> rlang::quo_get_expr() |> important:::extract_score_names(), + c("score_a") + ) + +}) From 1c98b6d112702978afd84bbb1e65e04acc61d342 Mon Sep 17 00:00:00 2001 From: topepo Date: Tue, 19 Aug 2025 17:30:09 -0400 Subject: [PATCH 06/65] rename files, add argument placeholders --- NAMESPACE | 12 +-- ...lect-2.R => step_predictor_desirability.R} | 62 +++++++-------- ...ct_2.Rd => step_predictor_desirability.Rd} | 29 ++++--- tests/testthat/_snaps/step-select-2.md | 77 ------------------- ...2.R => test-step-predictor-desirability.R} | 17 ++-- 5 files changed, 59 insertions(+), 138 deletions(-) rename R/{step-select-2.R => step_predictor_desirability.R} (70%) rename man/{step_select_2.Rd => step_predictor_desirability.Rd} (82%) delete mode 100644 tests/testthat/_snaps/step-select-2.md rename tests/testthat/{test-step-select-2.R => test-step-predictor-desirability.R} (76%) diff --git a/NAMESPACE b/NAMESPACE index 19e2c23..efe84d6 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,27 +1,27 @@ # Generated by roxygen2: do not edit by hand S3method(autoplot,importance_perm) +S3method(bake,step_predictor_desirability) S3method(bake,step_select_1) -S3method(bake,step_select_2) S3method(bake,step_select_3) +S3method(prep,step_predictor_desirability) S3method(prep,step_select_1) -S3method(prep,step_select_2) S3method(prep,step_select_3) +S3method(print,step_predictor_desirability) S3method(print,step_select_1) -S3method(print,step_select_2) S3method(print,step_select_3) +S3method(tidy,step_predictor_desirability) S3method(tidy,step_select_1) -S3method(tidy,step_select_2) S3method(tidy,step_select_3) +S3method(tunable,step_predictor_desirability) S3method(tunable,step_select_1) -S3method(tunable,step_select_2) S3method(tunable,step_select_3) export(augment) export(autoplot) export(importance_perm) export(required_pkgs) +export(step_predictor_desirability) export(step_select_1) -export(step_select_2) export(step_select_3) import(recipes) import(rlang) diff --git a/R/step-select-2.R b/R/step_predictor_desirability.R similarity index 70% rename from R/step-select-2.R rename to R/step_predictor_desirability.R index afaeb45..8504dfc 100644 --- a/R/step-select-2.R +++ b/R/step_predictor_desirability.R @@ -1,10 +1,11 @@ #' Feature Selection #' -#' `step_select_2()` creates a *specification* of a recipe step that will +#' `step_predictor_desirability()` creates a *specification* of a recipe step that will #' perform feature selection by ... #' #' @inheritParams recipes::step_center -#' @param threshold ... +#' @param score A thing +#' @param prop_terms ... #' @param removals A character string that contains the names of columns that #' should be removed. These values are not determined until [recipes::prep()] #' is called. @@ -33,7 +34,7 @@ #' } #' #' ```{r, echo = FALSE, results="asis"} -#' step <- "step_select_2" +#' step <- "step_predictor_desirability" #' result <- knitr::knit_child("man/rmd/tunable-args.Rmd") #' cat(result) #' ``` @@ -44,30 +45,26 @@ #' library(recipes) #' #' rec <- recipe(mpg ~ ., data = mtcars) |> -#' step_select_2(all_predictors()) -#' -#' prepped <- prep(rec) -#' -#' bake(prepped, mtcars) -#' -#' tidy(prepped, 1) -step_select_2 <- function( +#' step_predictor_desirability(all_predictors()) +step_predictor_desirability <- function( recipe, ..., + score, role = NA, trained = FALSE, - threshold = 0.9, + prop_terms = 0.9, removals = NULL, skip = FALSE, - id = rand_id("select_2") + id = rand_id("predictor_desirability") ) { add_step( recipe, - step_select_2_new( + step_predictor_desirability_new( terms = enquos(...), + score = enquos(score), role = role, trained = trained, - threshold = threshold, + prop_terms = prop_terms, removals = removals, skip = skip, id = id, @@ -76,23 +73,25 @@ step_select_2 <- function( ) } -step_select_2_new <- +step_predictor_desirability_new <- function( terms, + score, role, trained, - threshold, + prop_terms, removals, skip, id, case_weights ) { step( - subclass = "select_2", + subclass = "predictor_desirability", terms = terms, + score = score, role = role, trained = trained, - threshold = threshold, + prop_terms = prop_terms, removals = removals, skip = skip, id = id, @@ -101,10 +100,10 @@ step_select_2_new <- } #' @export -prep.step_select_2 <- function(x, training, info = NULL, ...) { +prep.step_predictor_desirability <- function(x, training, info = NULL, ...) { col_names <- recipes_eval_select(x$terms, training, info) - check_type(training[, col_names], types = c("double", "integer")) - check_number_decimal(x$threshold, min = 0, max = 1, arg = "threshold") + check_type(training[, col_names], types = c("double", "integer", "factor")) + check_number_decimal(x$prop_terms, min = .Machine$double.eps, max = 1, arg = "prop_terms") wts <- get_case_weights(info, training) were_weights_used <- are_weights_used(wts, unsupervised = TRUE) @@ -118,11 +117,12 @@ prep.step_select_2 <- function(x, training, info = NULL, ...) { filter <- character(0) } - step_select_2_new( + step_predictor_desirability_new( terms = x$terms, + score = score, role = x$role, trained = TRUE, - threshold = x$threshold, + prop_terms = x$prop_terms, removals = filter, skip = x$skip, id = x$id, @@ -131,14 +131,14 @@ prep.step_select_2 <- function(x, training, info = NULL, ...) { } #' @export -bake.step_select_2 <- function(object, new_data, ...) { +bake.step_predictor_desirability <- function(object, new_data, ...) { new_data <- recipes_remove_cols(new_data, object) new_data } #' @export -print.step_select_2 <- function(x, width = max(20, options()$width - 36), ...) { - title <- "Feature selection on " +print.step_predictor_desirability <- function(x, width = max(20, options()$width - 36), ...) { + title <- "Feature selection via desirability functions on" print_step( x$removals, x$terms, @@ -152,7 +152,7 @@ print.step_select_2 <- function(x, width = max(20, options()$width - 36), ...) { #' @usage NULL #' @export -tidy.step_select_2 <- function(x, ...) { +tidy.step_predictor_desirability <- function(x, ...) { if (is_trained(x)) { res <- tibble::tibble(terms = unname(x$removals)) } else { @@ -164,14 +164,14 @@ tidy.step_select_2 <- function(x, ...) { } #' @export -tunable.step_select_2 <- function(x, ...) { +tunable.step_predictor_desirability <- function(x, ...) { tibble::tibble( - name = "threshold", + name = "prop_terms", call_info = list( list(pkg = "dials", fun = "threshold") ), source = "recipe", - component = "step_select_2", + component = "step_predictor_desirability", component_id = x$id ) } diff --git a/man/step_select_2.Rd b/man/step_predictor_desirability.Rd similarity index 82% rename from man/step_select_2.Rd rename to man/step_predictor_desirability.Rd index de37c94..ad28c72 100644 --- a/man/step_select_2.Rd +++ b/man/step_predictor_desirability.Rd @@ -1,18 +1,19 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/step-select-2.R -\name{step_select_2} -\alias{step_select_2} +% Please edit documentation in R/step_predictor_desirability.R +\name{step_predictor_desirability} +\alias{step_predictor_desirability} \title{Feature Selection} \usage{ -step_select_2( +step_predictor_desirability( recipe, ..., + score, role = NA, trained = FALSE, - threshold = 0.9, + prop_terms = 0.9, removals = NULL, skip = FALSE, - id = rand_id("select_2") + id = rand_id("predictor_desirability") ) } \arguments{ @@ -22,12 +23,14 @@ operations for this recipe.} \item{...}{One or more selector functions to choose variables for this step. See \code{\link[recipes:selections]{selections()}} for more details.} +\item{score}{A thing} + \item{role}{Not used by this step since no new variables are created.} \item{trained}{A logical to indicate if the quantities for preprocessing have been estimated.} -\item{threshold}{...} +\item{prop_terms}{...} \item{removals}{A character string that contains the names of columns that should be removed. These values are not determined until \code{\link[recipes:prep]{recipes::prep()}} @@ -46,7 +49,7 @@ An updated version of \code{recipe} with the new step added to the sequence of any existing operations. } \description{ -\code{step_select_2()} creates a \emph{specification} of a recipe step that will +\code{step_predictor_desirability()} creates a \emph{specification} of a recipe step that will perform feature selection by ... } \details{ @@ -71,7 +74,7 @@ returned with columns \code{terms} and \code{id}: \section{Tuning Parameters}{ This step has 1 tuning parameters: \itemize{ -\item \code{threshold}: Threshold (type: double, default: 0.9) +\item \code{prop_terms}: Threshold (type: double, default: 0.9) } The underlying operation does not allow for case weights. @@ -81,11 +84,5 @@ The underlying operation does not allow for case weights. library(recipes) rec <- recipe(mpg ~ ., data = mtcars) |> - step_select_2(all_predictors()) - -prepped <- prep(rec) - -bake(prepped, mtcars) - -tidy(prepped, 1) + step_predictor_desirability(all_predictors()) } diff --git a/tests/testthat/_snaps/step-select-2.md b/tests/testthat/_snaps/step-select-2.md deleted file mode 100644 index 346878e..0000000 --- a/tests/testthat/_snaps/step-select-2.md +++ /dev/null @@ -1,77 +0,0 @@ -# empty printing - - Code - rec - Message - - -- Recipe ---------------------------------------------------------------------- - - -- Inputs - Number of variables by role - outcome: 1 - predictor: 10 - - -- Operations - * Feature selection on: - ---- - - Code - rec - Message - - -- Recipe ---------------------------------------------------------------------- - - -- Inputs - Number of variables by role - outcome: 1 - predictor: 10 - - -- Training information - Training data contained 32 data points and no incomplete rows. - - -- Operations - * Feature selection on: | Trained - -# printing - - Code - print(rec) - Message - - -- Recipe ---------------------------------------------------------------------- - - -- Inputs - Number of variables by role - predictor: 11 - - -- Operations - * Feature selection on: all_predictors() - ---- - - Code - prep(rec) - Message - - -- Recipe ---------------------------------------------------------------------- - - -- Inputs - Number of variables by role - predictor: 11 - - -- Training information - Training data contained 32 data points and no incomplete rows. - - -- Operations - * Feature selection on: | Trained - -# bad args - - Code - prep(step_select_2(recipe(mpg ~ ., mtcars), all_predictors(), threshold = 2)) - Condition - Error in `step_select_2()`: - Caused by error in `prep()`: - ! `threshold` must be a number between 0 and 1, not the number 2. - diff --git a/tests/testthat/test-step-select-2.R b/tests/testthat/test-step-predictor-desirability.R similarity index 76% rename from tests/testthat/test-step-select-2.R rename to tests/testthat/test-step-predictor-desirability.R index f07d291..55826bb 100644 --- a/tests/testthat/test-step-select-2.R +++ b/tests/testthat/test-step-predictor-desirability.R @@ -1,14 +1,15 @@ # Infrastructure --------------------------------------------------------------- +skip("not yet!") test_that("bake method errors when needed non-standard role columns are missing", { # Here for completeness - # step_select_2() removes variables and thus does not care if they are not there. + # step_predictor_desirability() removes variables and thus does not care if they are not there. expect_true(TRUE) }) test_that("empty printing", { rec <- recipe(mpg ~ ., mtcars) - rec <- step_select_2(rec) + rec <- step_predictor_desirability(rec) expect_snapshot(rec) @@ -19,7 +20,7 @@ test_that("empty printing", { test_that("empty selection prep/bake is a no-op", { rec1 <- recipe(mpg ~ ., mtcars) - rec2 <- step_select_2(rec1) + rec2 <- step_predictor_desirability(rec1) rec1 <- prep(rec1, mtcars) rec2 <- prep(rec2, mtcars) @@ -32,7 +33,7 @@ test_that("empty selection prep/bake is a no-op", { test_that("empty selection tidy method works", { rec <- recipe(mpg ~ ., mtcars) - rec <- step_select_2(rec) + rec <- step_predictor_desirability(rec) expect <- tibble(terms = character(), id = character()) @@ -46,7 +47,7 @@ test_that("empty selection tidy method works", { test_that("printing", { set.seed(1) rec <- recipe(~., data = mtcars) |> - step_select_2(all_predictors()) + step_predictor_desirability(all_predictors()) expect_snapshot(print(rec)) expect_snapshot(prep(rec)) @@ -55,7 +56,7 @@ test_that("printing", { test_that("tunable is setup to work with extract_parameter_set_dials", { skip_if_not_installed("dials") rec <- recipe(~., data = mtcars) |> - step_select_2(all_predictors(), threshold = hardhat::tune()) + step_predictor_desirability(all_predictors(), threshold = hardhat::tune()) params <- extract_parameter_set_dials(rec) @@ -66,7 +67,7 @@ test_that("tunable is setup to work with extract_parameter_set_dials", { test_that("bad args", { expect_snapshot( recipe(mpg ~ ., mtcars) |> - step_select_2(all_predictors(), threshold = 2) |> + step_predictor_desirability(all_predictors(), threshold = 2) |> prep(), error = TRUE ) @@ -75,7 +76,7 @@ test_that("bad args", { test_that("0 and 1 rows data work in bake method", { data <- mtcars rec <- recipe(~., data) |> - step_select_2(all_numeric_predictors()) |> + step_predictor_desirability(all_numeric_predictors()) |> prep() expect_identical( From d210f7e573340b7b76112077fd8638bf6a326f1d Mon Sep 17 00:00:00 2001 From: Frances Lin <37535633+franceslinyc@users.noreply.github.com> Date: Tue, 19 Aug 2025 16:45:27 -0700 Subject: [PATCH 07/65] Replace all step_select_1 with step_predictor_best --- NAMESPACE | 12 ++++++------ R/step-select-1.R | 24 ++++++++++++++---------- man/step_select_1.Rd | 10 +++++----- tests/testthat/_snaps/step-select-1.md | 4 ++-- tests/testthat/test-step-select-1.R | 16 ++++++++-------- 5 files changed, 35 insertions(+), 31 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index efe84d6..0638efa 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -2,26 +2,26 @@ S3method(autoplot,importance_perm) S3method(bake,step_predictor_desirability) -S3method(bake,step_select_1) +S3method(bake,step_predictor_best) S3method(bake,step_select_3) S3method(prep,step_predictor_desirability) -S3method(prep,step_select_1) +S3method(prep,step_predictor_best) S3method(prep,step_select_3) S3method(print,step_predictor_desirability) -S3method(print,step_select_1) +S3method(print,step_predictor_best) S3method(print,step_select_3) S3method(tidy,step_predictor_desirability) -S3method(tidy,step_select_1) +S3method(tidy,step_predictor_best) S3method(tidy,step_select_3) S3method(tunable,step_predictor_desirability) -S3method(tunable,step_select_1) +S3method(tunable,step_predictor_best) S3method(tunable,step_select_3) export(augment) export(autoplot) export(importance_perm) export(required_pkgs) export(step_predictor_desirability) -export(step_select_1) +export(step_predictor_best) export(step_select_3) import(recipes) import(rlang) diff --git a/R/step-select-1.R b/R/step-select-1.R index 7ad148f..efe4e63 100644 --- a/R/step-select-1.R +++ b/R/step-select-1.R @@ -1,6 +1,6 @@ #' Feature Selection #' -#' `step_select_1()` creates a *specification* of a recipe step that will +#' `step_predictor_best()` creates a *specification* of a recipe step that will #' perform feature selection by ... #' #' @inheritParams recipes::step_center @@ -33,7 +33,7 @@ #' } #' #' ```{r, echo = FALSE, results="asis"} -#' step <- "step_select_1" +#' step <- "step_predictor_best" #' result <- knitr::knit_child("man/rmd/tunable-args.Rmd") #' cat(result) #' ``` @@ -44,14 +44,14 @@ #' library(recipes) #' #' rec <- recipe(mpg ~ ., data = mtcars) |> -#' step_select_1(all_predictors()) +#' step_predictor_best(all_predictors()) #' #' prepped <- prep(rec) #' #' bake(prepped, mtcars) #' #' tidy(prepped, 1) -step_select_1 <- function( +step_predictor_best <- function( recipe, ..., role = NA, @@ -101,7 +101,7 @@ step_select_1_new <- } #' @export -prep.step_select_1 <- function(x, training, info = NULL, ...) { +prep.step_predictor_best <- function(x, training, info = NULL, ...) { col_names <- recipes_eval_select(x$terms, training, info) check_type(training[, col_names], types = c("double", "integer")) check_number_decimal(x$threshold, min = 0, max = 1, arg = "threshold") @@ -131,13 +131,17 @@ prep.step_select_1 <- function(x, training, info = NULL, ...) { } #' @export -bake.step_select_1 <- function(object, new_data, ...) { +bake.step_predictor_best <- function(object, new_data, ...) { new_data <- recipes_remove_cols(new_data, object) new_data } #' @export -print.step_select_1 <- function(x, width = max(20, options()$width - 36), ...) { +print.step_predictor_best <- function( + x, + width = max(20, options()$width - 36), + ... +) { title <- "Feature selection on " print_step( x$removals, @@ -152,7 +156,7 @@ print.step_select_1 <- function(x, width = max(20, options()$width - 36), ...) { #' @usage NULL #' @export -tidy.step_select_1 <- function(x, ...) { +tidy.step_predictor_best <- function(x, ...) { if (is_trained(x)) { res <- tibble::tibble(terms = unname(x$removals)) } else { @@ -164,14 +168,14 @@ tidy.step_select_1 <- function(x, ...) { } #' @export -tunable.step_select_1 <- function(x, ...) { +tunable.step_predictor_best <- function(x, ...) { tibble::tibble( name = "threshold", call_info = list( list(pkg = "dials", fun = "threshold") ), source = "recipe", - component = "step_select_1", + component = "step_predictor_best", component_id = x$id ) } diff --git a/man/step_select_1.Rd b/man/step_select_1.Rd index 0d11117..5e65eb3 100644 --- a/man/step_select_1.Rd +++ b/man/step_select_1.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/step-select-1.R -\name{step_select_1} -\alias{step_select_1} +\name{step_predictor_best} +\alias{step_predictor_best} \title{Feature Selection} \usage{ -step_select_1( +step_predictor_best( recipe, ..., role = NA, @@ -46,7 +46,7 @@ An updated version of \code{recipe} with the new step added to the sequence of any existing operations. } \description{ -\code{step_select_1()} creates a \emph{specification} of a recipe step that will +\code{step_predictor_best()} creates a \emph{specification} of a recipe step that will perform feature selection by ... } \details{ @@ -81,7 +81,7 @@ The underlying operation does not allow for case weights. library(recipes) rec <- recipe(mpg ~ ., data = mtcars) |> - step_select_1(all_predictors()) + step_predictor_best(all_predictors()) prepped <- prep(rec) diff --git a/tests/testthat/_snaps/step-select-1.md b/tests/testthat/_snaps/step-select-1.md index a02cb7f..f34cfaf 100644 --- a/tests/testthat/_snaps/step-select-1.md +++ b/tests/testthat/_snaps/step-select-1.md @@ -69,9 +69,9 @@ # bad args Code - prep(step_select_1(recipe(mpg ~ ., mtcars), all_predictors(), threshold = 2)) + prep(step_predictor_best(recipe(mpg ~ ., mtcars), all_predictors(), threshold = 2)) Condition - Error in `step_select_1()`: + Error in `step_predictor_best()`: Caused by error in `prep()`: ! `threshold` must be a number between 0 and 1, not the number 2. diff --git a/tests/testthat/test-step-select-1.R b/tests/testthat/test-step-select-1.R index 6113873..e3fd0c0 100644 --- a/tests/testthat/test-step-select-1.R +++ b/tests/testthat/test-step-select-1.R @@ -2,13 +2,13 @@ test_that("bake method errors when needed non-standard role columns are missing", { # Here for completeness - # step_select_1() removes variables and thus does not care if they are not there. + # step_predictor_best() removes variables and thus does not care if they are not there. expect_true(TRUE) }) test_that("empty printing", { rec <- recipe(mpg ~ ., mtcars) - rec <- step_select_1(rec) + rec <- step_predictor_best(rec) expect_snapshot(rec) @@ -19,7 +19,7 @@ test_that("empty printing", { test_that("empty selection prep/bake is a no-op", { rec1 <- recipe(mpg ~ ., mtcars) - rec2 <- step_select_1(rec1) + rec2 <- step_predictor_best(rec1) rec1 <- prep(rec1, mtcars) rec2 <- prep(rec2, mtcars) @@ -32,7 +32,7 @@ test_that("empty selection prep/bake is a no-op", { test_that("empty selection tidy method works", { rec <- recipe(mpg ~ ., mtcars) - rec <- step_select_1(rec) + rec <- step_predictor_best(rec) expect <- tibble(terms = character(), id = character()) @@ -46,7 +46,7 @@ test_that("empty selection tidy method works", { test_that("printing", { set.seed(1) rec <- recipe(~., data = mtcars) |> - step_select_1(all_predictors()) + step_predictor_best(all_predictors()) expect_snapshot(print(rec)) expect_snapshot(prep(rec)) @@ -55,7 +55,7 @@ test_that("printing", { test_that("tunable is setup to work with extract_parameter_set_dials", { skip_if_not_installed("dials") rec <- recipe(~., data = mtcars) |> - step_select_1(all_predictors(), threshold = hardhat::tune()) + step_predictor_best(all_predictors(), threshold = hardhat::tune()) params <- extract_parameter_set_dials(rec) @@ -66,7 +66,7 @@ test_that("tunable is setup to work with extract_parameter_set_dials", { test_that("bad args", { expect_snapshot( recipe(mpg ~ ., mtcars) |> - step_select_1(all_predictors(), threshold = 2) |> + step_predictor_best(all_predictors(), threshold = 2) |> prep(), error = TRUE ) @@ -75,7 +75,7 @@ test_that("bad args", { test_that("0 and 1 rows data work in bake method", { data <- mtcars rec <- recipe(~., data) |> - step_select_1(all_numeric_predictors()) |> + step_predictor_best(all_numeric_predictors()) |> prep() expect_identical( From a9584bb5df692735fec8594033b852e7eb51e44f Mon Sep 17 00:00:00 2001 From: Frances Lin <37535633+franceslinyc@users.noreply.github.com> Date: Tue, 19 Aug 2025 16:46:59 -0700 Subject: [PATCH 08/65] Rename files --- R/{step-select-1.R => step-predictor-best.R} | 0 .../testthat/{test-step-select-1.R => test-step-predictor-best.R} | 0 2 files changed, 0 insertions(+), 0 deletions(-) rename R/{step-select-1.R => step-predictor-best.R} (100%) rename tests/testthat/{test-step-select-1.R => test-step-predictor-best.R} (100%) diff --git a/R/step-select-1.R b/R/step-predictor-best.R similarity index 100% rename from R/step-select-1.R rename to R/step-predictor-best.R diff --git a/tests/testthat/test-step-select-1.R b/tests/testthat/test-step-predictor-best.R similarity index 100% rename from tests/testthat/test-step-select-1.R rename to tests/testthat/test-step-predictor-best.R From f3ff011c274aa8841f600e41e4de47c5912dd6e2 Mon Sep 17 00:00:00 2001 From: Frances Lin <37535633+franceslinyc@users.noreply.github.com> Date: Tue, 19 Aug 2025 16:52:35 -0700 Subject: [PATCH 09/65] Add skip() --- tests/testthat/_snaps/step-select-1.md | 77 ----------------------- tests/testthat/test-step-predictor-best.R | 1 + 2 files changed, 1 insertion(+), 77 deletions(-) delete mode 100644 tests/testthat/_snaps/step-select-1.md diff --git a/tests/testthat/_snaps/step-select-1.md b/tests/testthat/_snaps/step-select-1.md deleted file mode 100644 index f34cfaf..0000000 --- a/tests/testthat/_snaps/step-select-1.md +++ /dev/null @@ -1,77 +0,0 @@ -# empty printing - - Code - rec - Message - - -- Recipe ---------------------------------------------------------------------- - - -- Inputs - Number of variables by role - outcome: 1 - predictor: 10 - - -- Operations - * Feature selection on: - ---- - - Code - rec - Message - - -- Recipe ---------------------------------------------------------------------- - - -- Inputs - Number of variables by role - outcome: 1 - predictor: 10 - - -- Training information - Training data contained 32 data points and no incomplete rows. - - -- Operations - * Feature selection on: | Trained - -# printing - - Code - print(rec) - Message - - -- Recipe ---------------------------------------------------------------------- - - -- Inputs - Number of variables by role - predictor: 11 - - -- Operations - * Feature selection on: all_predictors() - ---- - - Code - prep(rec) - Message - - -- Recipe ---------------------------------------------------------------------- - - -- Inputs - Number of variables by role - predictor: 11 - - -- Training information - Training data contained 32 data points and no incomplete rows. - - -- Operations - * Feature selection on: | Trained - -# bad args - - Code - prep(step_predictor_best(recipe(mpg ~ ., mtcars), all_predictors(), threshold = 2)) - Condition - Error in `step_predictor_best()`: - Caused by error in `prep()`: - ! `threshold` must be a number between 0 and 1, not the number 2. - diff --git a/tests/testthat/test-step-predictor-best.R b/tests/testthat/test-step-predictor-best.R index e3fd0c0..a09d47b 100644 --- a/tests/testthat/test-step-predictor-best.R +++ b/tests/testthat/test-step-predictor-best.R @@ -1,4 +1,5 @@ # Infrastructure --------------------------------------------------------------- +skip() test_that("bake method errors when needed non-standard role columns are missing", { # Here for completeness From 26357418c4ce41b83063bd50f38c57aa666c77c4 Mon Sep 17 00:00:00 2001 From: Frances Lin <37535633+franceslinyc@users.noreply.github.com> Date: Tue, 19 Aug 2025 17:21:30 -0700 Subject: [PATCH 10/65] Fix R CMD check error --- NAMESPACE | 12 ++++++------ R/step-predictor-best.R | 12 ++++++------ man/{step_select_1.Rd => step_predictor_best.Rd} | 4 ++-- 3 files changed, 14 insertions(+), 14 deletions(-) rename man/{step_select_1.Rd => step_predictor_best.Rd} (96%) diff --git a/NAMESPACE b/NAMESPACE index 0638efa..16cbad8 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,27 +1,27 @@ # Generated by roxygen2: do not edit by hand S3method(autoplot,importance_perm) -S3method(bake,step_predictor_desirability) S3method(bake,step_predictor_best) +S3method(bake,step_predictor_desirability) S3method(bake,step_select_3) -S3method(prep,step_predictor_desirability) S3method(prep,step_predictor_best) +S3method(prep,step_predictor_desirability) S3method(prep,step_select_3) -S3method(print,step_predictor_desirability) S3method(print,step_predictor_best) +S3method(print,step_predictor_desirability) S3method(print,step_select_3) -S3method(tidy,step_predictor_desirability) S3method(tidy,step_predictor_best) +S3method(tidy,step_predictor_desirability) S3method(tidy,step_select_3) -S3method(tunable,step_predictor_desirability) S3method(tunable,step_predictor_best) +S3method(tunable,step_predictor_desirability) S3method(tunable,step_select_3) export(augment) export(autoplot) export(importance_perm) export(required_pkgs) -export(step_predictor_desirability) export(step_predictor_best) +export(step_predictor_desirability) export(step_select_3) import(recipes) import(rlang) diff --git a/R/step-predictor-best.R b/R/step-predictor-best.R index efe4e63..05f60a3 100644 --- a/R/step-predictor-best.R +++ b/R/step-predictor-best.R @@ -59,11 +59,11 @@ step_predictor_best <- function( threshold = 0.9, removals = NULL, skip = FALSE, - id = rand_id("select_1") + id = rand_id("predictor_best") ) { add_step( recipe, - step_select_1_new( + step_predictor_best_new( terms = enquos(...), role = role, trained = trained, @@ -76,7 +76,7 @@ step_predictor_best <- function( ) } -step_select_1_new <- +step_predictor_best_new <- function( terms, role, @@ -88,7 +88,7 @@ step_select_1_new <- case_weights ) { step( - subclass = "select_1", + subclass = "predictor_best", terms = terms, role = role, trained = trained, @@ -118,7 +118,7 @@ prep.step_predictor_best <- function(x, training, info = NULL, ...) { filter <- character(0) } - step_select_1_new( + step_predictor_best_new( terms = x$terms, role = x$role, trained = TRUE, @@ -142,7 +142,7 @@ print.step_predictor_best <- function( width = max(20, options()$width - 36), ... ) { - title <- "Feature selection on " + title <- "Feature selection on" print_step( x$removals, x$terms, diff --git a/man/step_select_1.Rd b/man/step_predictor_best.Rd similarity index 96% rename from man/step_select_1.Rd rename to man/step_predictor_best.Rd index 5e65eb3..4e526da 100644 --- a/man/step_select_1.Rd +++ b/man/step_predictor_best.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/step-select-1.R +% Please edit documentation in R/step-predictor-best.R \name{step_predictor_best} \alias{step_predictor_best} \title{Feature Selection} @@ -12,7 +12,7 @@ step_predictor_best( threshold = 0.9, removals = NULL, skip = FALSE, - id = rand_id("select_1") + id = rand_id("predictor_best") ) } \arguments{ From eaa76a41e4f1e8cc7d45fbf40d614ba7439dd272 Mon Sep 17 00:00:00 2001 From: topepo Date: Tue, 19 Aug 2025 22:00:07 -0400 Subject: [PATCH 11/65] added a few more utilities --- R/recipe_utils.R | 47 +++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 45 insertions(+), 2 deletions(-) diff --git a/R/recipe_utils.R b/R/recipe_utils.R index 8ff0ee0..4516543 100644 --- a/R/recipe_utils.R +++ b/R/recipe_utils.R @@ -12,11 +12,54 @@ extract_score_names <- function(x, call = rlang::caller_env()) { } x <- rlang::quo_get_expr(x) } - res <- unique(all.vars(x)) - res <- grep("^score_", res, value = TRUE) + res <- all.vars(x) + check_score_names(res, call) +} + +check_score_names <- function(x, call) { + x <- unique(x) + res <- grep("^score_", x, value = TRUE) if (length(res) == 0) { cli::cli_abort("No score objects were found in {.arg score}.", call = call) } res } +pull_outcome_column_name <- function(x) { + outcome_name <- x$variable[x$role == "outcome"] + num_outcomes <- length(outcome_name) + if (num_outcomes != 1) { + cli::cli_abort("One column should have a role of {.val outcome}.") + } + outcome_name +} + +compute_score <- function(score, args, form, data) { + fn <- find_score_object(score) + cl <- rlang::call2( + "fit", + .ns = "generics", + object = quote(fn), + formula = quote(form), + data = quote(data) + ) + cl <- rlang::call_modify(cl, !!!args) + + res <- try(rlang::eval_tidy(cl), silent = TRUE) + + # if error return all NA + res +} + +# Temporary solution +find_score_object <- function(x) { + utils::getFromNamespace(x, "filtro") +} + +update_prop <- function(num_cols, prop) { + min_prop <- 1/num_cols + if (prop < min_prop) { + prop <- min_prop + 2 * .Machine$double.eps + } + prop +} From 4213eae2dfccd3435a76d710e90f3c413b4220cb Mon Sep 17 00:00:00 2001 From: topepo Date: Tue, 19 Aug 2025 22:00:28 -0400 Subject: [PATCH 12/65] imports and global variable false positives --- NAMESPACE | 1 + R/important-package.R | 6 ++++-- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index 16cbad8..bb0d7a9 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -30,5 +30,6 @@ importFrom(generics,required_pkgs) importFrom(ggplot2,autoplot) importFrom(hardhat,extract_fit_parsnip) importFrom(hardhat,extract_postprocessor) +importFrom(stats,as.formula) importFrom(stats,predict) importFrom(stats,sd) diff --git a/R/important-package.R b/R/important-package.R index f085509..09fb713 100644 --- a/R/important-package.R +++ b/R/important-package.R @@ -4,7 +4,7 @@ ## usethis namespace: start #' @import recipes #' @import rlang -#' @importFrom stats sd predict +#' @importFrom stats sd predict as.formula #' @importFrom hardhat extract_fit_parsnip extract_postprocessor #' @importFrom ggplot2 autoplot @@ -30,7 +30,9 @@ utils::globalVariables( "predictor", "ranking", "std_err", - "score" + "score", + ".d_overall", + "outcome" ) ) ## usethis namespace: end From c9108e7bd471fcd876ed8eaddc6de575046a5312 Mon Sep 17 00:00:00 2001 From: topepo Date: Tue, 19 Aug 2025 22:01:01 -0400 Subject: [PATCH 13/65] kind of working version of desirability step --- DESCRIPTION | 4 + R/step_predictor_desirability.R | 149 +++++++++++++++++++++++++---- inst/WORDLIST | 1 + man/step_predictor_desirability.Rd | 74 ++++++++++++-- 4 files changed, 201 insertions(+), 27 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 8bee702..08286b4 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -19,7 +19,9 @@ Depends: recipes (>= 1.1.0) Imports: cli, + desirability2 (>= 0.1.0.9000), dplyr, + filtro, generics, ggplot2, hardhat (>= 1.4.0.9002), @@ -42,6 +44,8 @@ Suggests: survival, testthat (>= 3.0.0), yardstick +Remote: + tidymodels/desirability2 Config/Needs/website: tidyverse/tidytemplate, tidymodels Config/testthat/edition: 3 Config/usethis/last-upkeep: 2025-06-09 diff --git a/R/step_predictor_desirability.R b/R/step_predictor_desirability.R index 8504dfc..65d64e6 100644 --- a/R/step_predictor_desirability.R +++ b/R/step_predictor_desirability.R @@ -1,16 +1,27 @@ #' Feature Selection #' -#' `step_predictor_desirability()` creates a *specification* of a recipe step that will -#' perform feature selection by ... +#' `step_predictor_desirability()` creates a *specification* of a recipe step +#' that uses one or more "score" functions to measure how how much each +#' predictor is related to the outcome value. These scores are combined into +#' a composite value using user-specified _desirability_ functions and a +#' proportion of the most desirable predictors are retained. #' #' @inheritParams recipes::step_center -#' @param score A thing -#' @param prop_terms ... -#' @param removals A character string that contains the names of columns that +#' @param score An object produced by [desirability2::desirability()] that uses +#' one or more score functions from the \pkg{filtro} package. See the Details +#' and Examples sections below. This argument *should be named* when used. +#' @param prop_terms The proportion of predictors that should be retained when +#' ordered by overall desirability. +#' @param update_prop A logical: should `prop_terms` be updated so that at least +#' one predictor will be retained? +#' @param removals A character string that contains the names of predictors that #' should be removed. These values are not determined until [recipes::prep()] #' is called. +#' @param results A data frame of score and desirability values for each +#' predictor evaluated. These values are not determined until +#' [recipes::prep()] is called. #' @return An updated version of `recipe` with the new step added to the -#' sequence of any existing operations. +#' sequence of any existing operations. #' @export #' #' @details @@ -41,30 +52,77 @@ #' #' The underlying operation does not allow for case weights. #' +#' @seealso [desirability2::desirability()] +#' @references Derringer, G. and Suich, R. (1980), Simultaneous Optimization of +#' Several Response Variables. _Journal of Quality Technology_, 12, 214-219. #' @examples #' library(recipes) +#' library(desirability2) #' -#' rec <- recipe(mpg ~ ., data = mtcars) |> -#' step_predictor_desirability(all_predictors()) +#' if (rlang::is_installed("modeldata")) { +#' # The `ad_data` has a binary outcome column ("Class") and mostly numeric +#' # predictors. For these, we score the predictors using an analysis of +#' # variance model where the predicor is the outcome and the outcome class +#' # defines the groups. +#' # There is also a single factor predictor ("Genotype") and we'll use +#' # Fisher's Exact test to score it. NOTE that for scores using hypothesis +#' # tests, the -log10(pvalue) is returned so that larger values are more +#' # important. +#' +#' # The score_* objects here are from the filtro package. See Details above. +#' goals <- +#' desirability( +#' maximize(score_xtab_pval_fisher), +#' maximize(score_aov_pval) +#' ) +#' +#' rec <- +#' recipe(Class ~ ., data = modeldata::ad_data) |> +#' step_predictor_desirability( +#' all_predictors(), +#' score = goals, +#' prop_terms = 1/2 +#' ) +#' rec +#' +#' # Now evaluate the predictors and rank them via desirability: +#' rec_trained <- prep(rec) +#' rec_trained +#' +#' # Use the tidy() method to get the results: +#' predictor_scores <- tidy(rec, number = 1) +#' mean(predictor_scores$retained) +#' predictor_scores +#' } step_predictor_desirability <- function( recipe, ..., score, role = NA, trained = FALSE, - prop_terms = 0.9, + prop_terms = 0.5, + update_prop = TRUE, + results = NULL, removals = NULL, skip = FALSE, id = rand_id("predictor_desirability") ) { + if (!inherits(score, "desirability2::desirability_set")) { + cli::cli_abort( + "Please use the {.fn desirability} function in the {.pkg desirability2} + package to create an object to pass to {.arg score}." + ) + } add_step( recipe, step_predictor_desirability_new( terms = enquos(...), - score = enquos(score), + score = score, role = role, trained = trained, prop_terms = prop_terms, + update_prop = update_prop, + results = results, removals = removals, skip = skip, id = id, @@ -80,6 +138,8 @@ step_predictor_desirability_new <- role, trained, prop_terms, + update_prop = update_prop, + results, removals, skip, id, @@ -91,7 +151,9 @@ step_predictor_desirability_new <- score = score, role = role, trained = trained, + results = results, prop_terms = prop_terms, + update_prop = update_prop, removals = removals, skip = skip, id = id, @@ -105,25 +167,71 @@ prep.step_predictor_desirability <- function(x, training, info = NULL, ...) { check_type(training[, col_names], types = c("double", "integer", "factor")) check_number_decimal(x$prop_terms, min = .Machine$double.eps, max = 1, arg = "prop_terms") + if (length(col_names) < 2) { + res <- + step_predictor_desirability_new( + terms = x$terms, + score = score, + role = x$role, + trained = TRUE, + prop_terms = x$prop_terms, + update_prop = x$update_prop, + results = NULL, + removals = character(0), + skip = x$skip, + id = x$id, + case_weights = were_weights_used + ) + return(res) + } + + if (x$update_prop) { + x$prop_terms <- update_prop(length(col_names), x$prop_terms) + } + wts <- get_case_weights(info, training) were_weights_used <- are_weights_used(wts, unsupervised = TRUE) if (isFALSE(were_weights_used)) { wts <- NULL } - if (length(col_names) > 1) { - filter <- character(0) - } else { - filter <- character(0) - } + score_names <- check_score_names(unlist(x$score@variables)) + outcome_name <- pull_outcome_column_name(info) + fm <- paste(outcome_name, "~ .") + fm <- stats::as.formula(fm) + + # TODO ooof... how to handle case weights? + score_objs <- + purrr::map( + score_names, + ~ compute_score(.x, list(), fm, training[ c(outcome_name, col_names)]) + ) |> + filtro::fill_safe_values() # and then transform? + + # The score names include "score_" but the column names don't + rm_vec <- gsub("^score_", "", score_names) + names(rm_vec) <- score_names + score_objs <- dplyr::rename(score_objs, rm_vec) + + # make desirability expression/eval quosre + score_objs <- desirability2::make_desirability_cols(x$score, score_objs) + + keep_list <- + score_objs |> + dplyr::slice_max(.d_overall, prop = x$prop_terms, with_ties = TRUE) + rm_list <- + dplyr::anti_join(score_objs, keep_list[, "predictor"], by = "predictor") |> + purrr::pluck("predictor") step_predictor_desirability_new( terms = x$terms, - score = score, + score = x$score, role = x$role, trained = TRUE, + results = score_objs, prop_terms = x$prop_terms, - removals = filter, + update_prop = x$update_prop, + removals = rm_list, skip = x$skip, id = x$id, case_weights = were_weights_used @@ -154,7 +262,10 @@ print.step_predictor_desirability <- function(x, width = max(20, options()$width #' @export tidy.step_predictor_desirability <- function(x, ...) { if (is_trained(x)) { - res <- tibble::tibble(terms = unname(x$removals)) + res <- + x$results |> + dplyr::select(-outcome, terms = predictor) + res$retained <- !(res$terms %in% x$removals) } else { term_names <- sel2char(x$terms) res <- tibble::tibble(terms = term_names) @@ -163,6 +274,8 @@ tidy.step_predictor_desirability <- function(x, ...) { res } +# TODO make a new dials parameter for prop_terms + #' @export tunable.step_predictor_desirability <- function(x, ...) { tibble::tibble( diff --git a/inst/WORDLIST b/inst/WORDLIST index 8e746a7..61a8479 100644 --- a/inst/WORDLIST +++ b/inst/WORDLIST @@ -4,6 +4,7 @@ CMD Codecov ORCID PBC +Suich doi funder importances diff --git a/man/step_predictor_desirability.Rd b/man/step_predictor_desirability.Rd index ad28c72..dfab680 100644 --- a/man/step_predictor_desirability.Rd +++ b/man/step_predictor_desirability.Rd @@ -10,7 +10,9 @@ step_predictor_desirability( score, role = NA, trained = FALSE, - prop_terms = 0.9, + prop_terms = 0.5, + update_prop = TRUE, + results = NULL, removals = NULL, skip = FALSE, id = rand_id("predictor_desirability") @@ -23,16 +25,26 @@ operations for this recipe.} \item{...}{One or more selector functions to choose variables for this step. See \code{\link[recipes:selections]{selections()}} for more details.} -\item{score}{A thing} +\item{score}{An object produced by \code{\link[desirability2:desirability]{desirability2::desirability()}} that uses +one or more score functions from the \pkg{filtro} package. See the Details +and Examples sections below. This argument \emph{should be named} when used.} \item{role}{Not used by this step since no new variables are created.} \item{trained}{A logical to indicate if the quantities for preprocessing have been estimated.} -\item{prop_terms}{...} +\item{prop_terms}{The proportion of predictors that should be retained when +ordered by overall desirability.} -\item{removals}{A character string that contains the names of columns that +\item{update_prop}{A logical: should \code{prop_terms} be updated so that at least +one predictor will be retained?} + +\item{results}{A data frame of score and desirability values for each +predictor evaluated. These values are not determined until +\code{\link[recipes:prep]{recipes::prep()}} is called.} + +\item{removals}{A character string that contains the names of predictors that should be removed. These values are not determined until \code{\link[recipes:prep]{recipes::prep()}} is called.} @@ -49,8 +61,11 @@ An updated version of \code{recipe} with the new step added to the sequence of any existing operations. } \description{ -\code{step_predictor_desirability()} creates a \emph{specification} of a recipe step that will -perform feature selection by ... +\code{step_predictor_desirability()} creates a \emph{specification} of a recipe step +that uses one or more "score" functions to measure how how much each +predictor is related to the outcome value. These scores are combined into +a composite value using user-specified \emph{desirability} functions and a +proportion of the most desirable predictors are retained. } \details{ This step ... @@ -74,7 +89,7 @@ returned with columns \code{terms} and \code{id}: \section{Tuning Parameters}{ This step has 1 tuning parameters: \itemize{ -\item \code{prop_terms}: Threshold (type: double, default: 0.9) +\item \code{prop_terms}: Threshold (type: double, default: 0.5) } The underlying operation does not allow for case weights. @@ -82,7 +97,48 @@ The underlying operation does not allow for case weights. \examples{ library(recipes) +library(desirability2) + +if (rlang::is_installed("modeldata")) { + # The `ad_data` has a binary outcome column ("Class") and mostly numeric + # predictors. For these, we score the predictors using an analysis of + # variance model where the predicor is the outcome and the outcome class + # defines the groups. + # There is also a single factor predictor ("Genotype") and we'll use + # Fisher's Exact test to score it. NOTE that for scores using hypothesis + # tests, the -log10(pvalue) is returned so that larger values are more + # important. + + # The score_* objects here are from the filtro package. See Details above. + goals <- + desirability( + maximize(score_xtab_pval_fisher), + maximize(score_aov_pval) + ) + + rec <- + recipe(Class ~ ., data = modeldata::ad_data) |> + step_predictor_desirability( + all_predictors(), + score = goals, + prop_terms = 1/2 + ) + rec -rec <- recipe(mpg ~ ., data = mtcars) |> - step_predictor_desirability(all_predictors()) + # Now evaluate the predictors and rank them via desirability: + rec_trained <- prep(rec) + rec_trained + + # Use the tidy() method to get the results: + predictor_scores <- tidy(rec, number = 1) + mean(predictor_scores$retained) + predictor_scores +} +} +\references{ +Derringer, G. and Suich, R. (1980), Simultaneous Optimization of +Several Response Variables. \emph{Journal of Quality Technology}, 12, 214-219. +} +\seealso{ +\code{\link[desirability2:desirability]{desirability2::desirability()}} } From 49fbea6add21b71895e58ce2d9c9ac23debf2002 Mon Sep 17 00:00:00 2001 From: topepo Date: Wed, 20 Aug 2025 10:44:58 -0400 Subject: [PATCH 14/65] add case weight machinery and example --- R/recipe_utils.R | 22 ++++++++++-- R/step_predictor_desirability.R | 58 +++++++++++++++++++++++++++--- man/step_predictor_desirability.Rd | 52 +++++++++++++++++++++++++-- 3 files changed, 123 insertions(+), 9 deletions(-) diff --git a/R/recipe_utils.R b/R/recipe_utils.R index 4516543..f94ddba 100644 --- a/R/recipe_utils.R +++ b/R/recipe_utils.R @@ -34,12 +34,28 @@ pull_outcome_column_name <- function(x) { outcome_name } -compute_score <- function(score, args, form, data) { - fn <- find_score_object(score) +check_weights <- function(object, weights) { + if (object@case_weights & !is.null(weights)) { + res <- weights + } else { + res <- NULL + } + res +} + +compute_score <- function(score, args, form, data, weights = NULL) { + score_obj <- find_score_object(score) + + # Process case weights + weights <- check_weights(score_obj, weights) + if (!is.null(weights)) { + args$case_weights <- weights + } + cl <- rlang::call2( "fit", .ns = "generics", - object = quote(fn), + object = quote(score_obj), formula = quote(form), data = quote(data) ) diff --git a/R/step_predictor_desirability.R b/R/step_predictor_desirability.R index 65d64e6..fb51e51 100644 --- a/R/step_predictor_desirability.R +++ b/R/step_predictor_desirability.R @@ -76,8 +76,9 @@ #' maximize(score_aov_pval) #' ) #' +#' example_data <- modeldata::ad_data #' rec <- -#' recipe(Class ~ ., data = modeldata::ad_data) |> +#' recipe(Class ~ ., data = example_data) |> #' step_predictor_desirability( #' all_predictors(), #' score = goals, @@ -90,9 +91,56 @@ #' rec_trained #' #' # Use the tidy() method to get the results: -#' predictor_scores <- tidy(rec, number = 1) +#' predictor_scores <- tidy(rec_trained, number = 1) #' mean(predictor_scores$retained) #' predictor_scores +#' +#' # -------------------------------------------------------------------------- +#' +#' # Case-weight example: use the hardhat package to create the appropriate type +#' # of case weights. Here, we'll increase the weights for the minority class and +#' # add them to the data frame. +#' +#' library(hardhat) +#' +#' example_weights <- example_data +#' weights <- ifelse(example_data$Class == "Impaired", 5, 1) +#' example_weights$weights <- importance_weights(weights) +#' +#' # To see if the scores can use case weights, load the filtro package and +#' # check the `case_weights` property: +#' +#' library(filtro) +#' +#' score_xtab_pval_fisher@case_weights +#' score_aov_pval@case_weights +#' +#' # The recipe will automatically find the case weights and will +#' # not treat them as predictors. +#' rec_wts <- +#' recipe(Class ~ ., data = example_weights) |> +#' step_predictor_desirability( +#' all_predictors(), +#' score = goals, +#' prop_terms = 1/2 +#' ) |> +#' prep() +#' rec_wts +#' +#' predictor_scores_wts <- +#' tidy(rec_wts, number = 1) |> +#' select(terms, .d_overall_weighted = .d_overall) +#' +#' library(dplyr) +#' library(ggplot2) +#' +#' # The selection did not substantually change with these case weights +#' full_join(predictor_scores, predictor_scores_wts, by = "terms") |> +#' ggplot(aes(.d_overall, .d_overall_weighted)) + +#' geom_abline(col = "darkgreen", lty = 2) + +#' geom_point(alpha = 1 / 2) + +#' coord_fixed(ratio = 1) + +#' labs(x = "Unweighted", y = "Class Weighted") #' } step_predictor_desirability <- function( recipe, @@ -189,8 +237,10 @@ prep.step_predictor_desirability <- function(x, training, info = NULL, ...) { x$prop_terms <- update_prop(length(col_names), x$prop_terms) } + # First we check the _type_ of weight to see if it is used. Later, in + # `compute_score()`, we check to see if the score supports case weights. wts <- get_case_weights(info, training) - were_weights_used <- are_weights_used(wts, unsupervised = TRUE) + were_weights_used <- are_weights_used(wts, unsupervised = FALSE) if (isFALSE(were_weights_used)) { wts <- NULL } @@ -204,7 +254,7 @@ prep.step_predictor_desirability <- function(x, training, info = NULL, ...) { score_objs <- purrr::map( score_names, - ~ compute_score(.x, list(), fm, training[ c(outcome_name, col_names)]) + ~ compute_score(.x, list(), fm, training[ c(outcome_name, col_names)], wts) ) |> filtro::fill_safe_values() # and then transform? diff --git a/man/step_predictor_desirability.Rd b/man/step_predictor_desirability.Rd index dfab680..0ae61ac 100644 --- a/man/step_predictor_desirability.Rd +++ b/man/step_predictor_desirability.Rd @@ -116,8 +116,9 @@ if (rlang::is_installed("modeldata")) { maximize(score_aov_pval) ) + example_data <- modeldata::ad_data rec <- - recipe(Class ~ ., data = modeldata::ad_data) |> + recipe(Class ~ ., data = example_data) |> step_predictor_desirability( all_predictors(), score = goals, @@ -130,9 +131,56 @@ if (rlang::is_installed("modeldata")) { rec_trained # Use the tidy() method to get the results: - predictor_scores <- tidy(rec, number = 1) + predictor_scores <- tidy(rec_trained, number = 1) mean(predictor_scores$retained) predictor_scores + + # -------------------------------------------------------------------------- + +# Case-weight example: use the hardhat package to create the appropriate type +# of case weights. Here, we'll increase the weights for the minority class and +# add them to the data frame. + +library(hardhat) + +example_weights <- example_data +weights <- ifelse(example_data$Class == "Impaired", 5, 1) +example_weights$weights <- importance_weights(weights) + +# To see if the scores can use case weights, load the filtro package and +# check the `case_weights` property: + +library(filtro) + +score_xtab_pval_fisher@case_weights +score_aov_pval@case_weights + +# The recipe will automatically find the case weights and will +# not treat them as predictors. +rec_wts <- + recipe(Class ~ ., data = example_weights) |> + step_predictor_desirability( + all_predictors(), + score = goals, + prop_terms = 1/2 + ) |> + prep() +rec_wts + +predictor_scores_wts <- + tidy(rec_wts, number = 1) |> + select(terms, .d_overall_weighted = .d_overall) + +library(dplyr) +library(ggplot2) + +# The selection did not substantually change with these case weights +full_join(predictor_scores, predictor_scores_wts, by = "terms") |> + ggplot(aes(.d_overall, .d_overall_weighted)) + + geom_abline(col = "darkgreen", lty = 2) + + geom_point(alpha = 1 / 2) + + coord_fixed(ratio = 1) + + labs(x = "Unweighted", y = "Class Weighted") } } \references{ From 546062737d6d2614217459e10fd867351758cd0a Mon Sep 17 00:00:00 2001 From: topepo Date: Wed, 20 Aug 2025 10:54:43 -0400 Subject: [PATCH 15/65] update for CRAN hardhat --- DESCRIPTION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index 08286b4..fc1c26c 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -24,7 +24,7 @@ Imports: filtro, generics, ggplot2, - hardhat (>= 1.4.0.9002), + hardhat (>= 1.4.2), purrr, rlang (>= 1.1.0), tibble, From eea445497d8d9d473c9ea6a32d0527712fb47021 Mon Sep 17 00:00:00 2001 From: topepo Date: Wed, 20 Aug 2025 10:54:51 -0400 Subject: [PATCH 16/65] no longer needed --- R/recipe_utils.R | 6 ------ 1 file changed, 6 deletions(-) diff --git a/R/recipe_utils.R b/R/recipe_utils.R index f94ddba..5588f7b 100644 --- a/R/recipe_utils.R +++ b/R/recipe_utils.R @@ -1,9 +1,3 @@ -# Check input for step_predictor_desirability() -check_desirability_arg <- function(x) { - # check for `c()` - # check not empty -} - # For an expression or quosure, list the user-specificied score functions extract_score_names <- function(x, call = rlang::caller_env()) { if (rlang::is_quosure(x)) { From 0e843c01e0ece657e6da510c9e30ec9b13023159 Mon Sep 17 00:00:00 2001 From: topepo Date: Wed, 20 Aug 2025 15:32:59 -0400 Subject: [PATCH 17/65] try removing postproc to deal with dependency issue --- DESCRIPTION | 2 +- NAMESPACE | 1 - R/importance_perm.R | 18 +++++++++++++----- R/important-package.R | 7 +------ 4 files changed, 15 insertions(+), 13 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index e81017e..f59ce42 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -26,7 +26,7 @@ Imports: filtro, generics, ggplot2, - hardhat (>= 1.4.2), + hardhat (>= 1.4.1), purrr, rlang (>= 1.1.0), tibble, diff --git a/NAMESPACE b/NAMESPACE index bb0d7a9..66a1bc8 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -29,7 +29,6 @@ importFrom(generics,augment) importFrom(generics,required_pkgs) importFrom(ggplot2,autoplot) importFrom(hardhat,extract_fit_parsnip) -importFrom(hardhat,extract_postprocessor) importFrom(stats,as.formula) importFrom(stats,predict) importFrom(stats,sd) diff --git a/R/importance_perm.R b/R/importance_perm.R index 798afe7..df134f3 100644 --- a/R/importance_perm.R +++ b/R/importance_perm.R @@ -368,11 +368,19 @@ predictions <- function(wflow, new_data, type, eval_time) { wflow |> extract_fit_parsnip() |> augment(new_data = new_data, eval_time = eval_time) - use_post <- has_postprocessor(wflow) - if (use_post) { - post_proc <- extract_postprocessor(wflow) - preds <- predict(post_proc, preds) - } + check_post(wflow) + # For once tailor and tune 2.0 are rleeased + # if (FALSE) { + # post_proc <- extract_postprocessor(wflow) + # preds <- predict(post_proc, preds) + # } } preds } + +check_post <- function(x) { + if (!identical(names(x$post$actions), character(0))) { + cli::cli_abort("{.pkg important} does not currently support postprocessors.") + } + invisible(NULL) +} diff --git a/R/important-package.R b/R/important-package.R index 09fb713..8ad58ad 100644 --- a/R/important-package.R +++ b/R/important-package.R @@ -5,7 +5,7 @@ #' @import recipes #' @import rlang #' @importFrom stats sd predict as.formula -#' @importFrom hardhat extract_fit_parsnip extract_postprocessor +#' @importFrom hardhat extract_fit_parsnip #' @importFrom ggplot2 autoplot #' @export @@ -38,8 +38,3 @@ utils::globalVariables( ## usethis namespace: end NULL -## From workflows -# nocov start -has_postprocessor <- function(x) has_postprocessor_tailor(x) -has_postprocessor_tailor <- function(x) "tailor" %in% names(x$post$actions) -# nocov end From 2e766f71350fb56a9ba91344d82962d8a3692772 Mon Sep 17 00:00:00 2001 From: Emil Hvitfeldt Date: Wed, 20 Aug 2025 17:02:57 -0700 Subject: [PATCH 18/65] start step_predictor_retain() --- NAMESPACE | 11 +- ...tep-select-3.R => step-predictor-retain.R} | 134 +++++++++++++----- ...p_select_3.Rd => step_predictor_retain.Rd} | 30 ++-- ...p-select-3.md => step-predictor-retain.md} | 5 +- ...elect-3.R => test-step-predictor-retain.R} | 18 +-- 5 files changed, 126 insertions(+), 72 deletions(-) rename R/{step-select-3.R => step-predictor-retain.R} (51%) rename man/{step_select_3.Rd => step_predictor_retain.Rd} (83%) rename tests/testthat/_snaps/{step-select-3.md => step-predictor-retain.md} (91%) rename tests/testthat/{test-step-select-3.R => test-step-predictor-retain.R} (78%) diff --git a/NAMESPACE b/NAMESPACE index 66a1bc8..2a23f08 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -3,26 +3,25 @@ S3method(autoplot,importance_perm) S3method(bake,step_predictor_best) S3method(bake,step_predictor_desirability) -S3method(bake,step_select_3) +S3method(bake,step_predictor_retain) S3method(prep,step_predictor_best) S3method(prep,step_predictor_desirability) -S3method(prep,step_select_3) +S3method(prep,step_predictor_retain) S3method(print,step_predictor_best) S3method(print,step_predictor_desirability) -S3method(print,step_select_3) +S3method(print,step_predictor_retain) S3method(tidy,step_predictor_best) S3method(tidy,step_predictor_desirability) -S3method(tidy,step_select_3) +S3method(tidy,step_predictor_retain) S3method(tunable,step_predictor_best) S3method(tunable,step_predictor_desirability) -S3method(tunable,step_select_3) export(augment) export(autoplot) export(importance_perm) export(required_pkgs) export(step_predictor_best) export(step_predictor_desirability) -export(step_select_3) +export(step_predictor_retain) import(recipes) import(rlang) importFrom(generics,augment) diff --git a/R/step-select-3.R b/R/step-predictor-retain.R similarity index 51% rename from R/step-select-3.R rename to R/step-predictor-retain.R index de6e1d4..0774059 100644 --- a/R/step-select-3.R +++ b/R/step-predictor-retain.R @@ -1,10 +1,10 @@ #' Feature Selection #' -#' `step_select_3()` creates a *specification* of a recipe step that will +#' `step_predictor_retain()` creates a *specification* of a recipe step that will #' perform feature selection by ... #' #' @inheritParams recipes::step_center -#' @param threshold ... +#' @param score ... #' @param removals A character string that contains the names of columns that #' should be removed. These values are not determined until [recipes::prep()] #' is called. @@ -32,42 +32,39 @@ #' \item{id}{character, id of this step} #' } #' -#' ```{r, echo = FALSE, results="asis"} -#' step <- "step_select_3" -#' result <- knitr::knit_child("man/rmd/tunable-args.Rmd") -#' cat(result) -#' ``` -#' #' The underlying operation does not allow for case weights. #' #' @examples #' library(recipes) #' #' rec <- recipe(mpg ~ ., data = mtcars) |> -#' step_select_3(all_predictors()) +#' step_predictor_retain( +#' all_predictors(), +#' score = cor_pearson >= 0.75 & imp_rf_oblique >= 0 +#' ) #' #' prepped <- prep(rec) #' #' bake(prepped, mtcars) #' #' tidy(prepped, 1) -step_select_3 <- function( +step_predictor_retain <- function( recipe, ..., + score, role = NA, trained = FALSE, - threshold = 0.9, removals = NULL, skip = FALSE, - id = rand_id("select_3") + id = rand_id("predictor_retain") ) { add_step( recipe, - step_select_3_new( + step_predictor_retain_new( terms = enquos(...), role = role, trained = trained, - threshold = threshold, + score = rlang::enexpr(score), removals = removals, skip = skip, id = id, @@ -76,23 +73,23 @@ step_select_3 <- function( ) } -step_select_3_new <- +step_predictor_retain_new <- function( terms, role, trained, - threshold, + score, removals, skip, id, case_weights ) { step( - subclass = "select_3", + subclass = "predictor_retain", terms = terms, role = role, trained = trained, - threshold = threshold, + score = score, removals = removals, skip = skip, id = id, @@ -101,10 +98,9 @@ step_select_3_new <- } #' @export -prep.step_select_3 <- function(x, training, info = NULL, ...) { +prep.step_predictor_retain <- function(x, training, info = NULL, ...) { col_names <- recipes_eval_select(x$terms, training, info) check_type(training[, col_names], types = c("double", "integer")) - check_number_decimal(x$threshold, min = 0, max = 1, arg = "threshold") wts <- get_case_weights(info, training) were_weights_used <- are_weights_used(wts, unsupervised = TRUE) @@ -113,16 +109,16 @@ prep.step_select_3 <- function(x, training, info = NULL, ...) { } if (length(col_names) > 1) { - filter <- character(0) + filter <- calculate_predictor_retain(x$score, data = training[, col_names]) } else { filter <- character(0) } - step_select_3_new( + step_predictor_retain_new( terms = x$terms, role = x$role, trained = TRUE, - threshold = x$threshold, + score = x$score, removals = filter, skip = x$skip, id = x$id, @@ -130,14 +126,87 @@ prep.step_select_3 <- function(x, training, info = NULL, ...) { ) } +# TODO: how to pass opts and or tune objects? Internal score set? + +# Assumes `data` has all predictors and the outcome columns (only) +calculate_predictor_retain <- function( + xpr, + outcome = character(0), + data, + opts = list() +) { + all_scores <- unique(all.vars(xpr)) + all_score_functions <- paste0("score_", all_scores) + + # ------------------------------------------------------------------------------ + # Find all known class_{score} and check against list + + # ------------------------------------------------------------------------------ + # Process any options + + opts <- make_opt_list(opts, all_scores) + + # ------------------------------------------------------------------------------ + + fm <- as.formula(paste(outcome, "~ .")) + + # ------------------------------------------------------------------------------ + + # Get list of args for each scoring method and use map2() + score_res <- purrr::map2( + all_score_functions, + opts, + compute_score, + form = fm, + data = data + ) + names(score_res) <- all_scores + + # ------------------------------------------------------------------------------ + # Fill in missings + + score_df <- # save for tidy method + score_res |> + filtro::fill_safe_values() + + # ------------------------------------------------------------------------------ + # filter predictors + + final_res <- score_df |> dplyr::filter(!!xpr) |> purrr::pluck("predictor") + + if (length(final_res) == 0) { + # final_res <- fallback_pred() + } + final_res +} + +fallback_pred <- function(scores) { + # get individual ranks + # save best average rank +} + +make_opt_list <- function(opts, scores) { + res <- purrr::map(scores, ~ list()) + names(res) <- scores + score_opts <- intersect(scores, names(opts)) + for (i in score_opts) { + res[[i]] <- opts[[i]] + } + res +} + #' @export -bake.step_select_3 <- function(object, new_data, ...) { +bake.step_predictor_retain <- function(object, new_data, ...) { new_data <- recipes_remove_cols(new_data, object) new_data } #' @export -print.step_select_3 <- function(x, width = max(20, options()$width - 36), ...) { +print.step_predictor_retain <- function( + x, + width = max(20, options()$width - 36), + ... +) { title <- "Feature selection on " print_step( x$removals, @@ -152,7 +221,7 @@ print.step_select_3 <- function(x, width = max(20, options()$width - 36), ...) { #' @usage NULL #' @export -tidy.step_select_3 <- function(x, ...) { +tidy.step_predictor_retain <- function(x, ...) { if (is_trained(x)) { res <- tibble::tibble(terms = unname(x$removals)) } else { @@ -162,16 +231,3 @@ tidy.step_select_3 <- function(x, ...) { res$id <- x$id res } - -#' @export -tunable.step_select_3 <- function(x, ...) { - tibble::tibble( - name = "threshold", - call_info = list( - list(pkg = "dials", fun = "threshold") - ), - source = "recipe", - component = "step_select_3", - component_id = x$id - ) -} diff --git a/man/step_select_3.Rd b/man/step_predictor_retain.Rd similarity index 83% rename from man/step_select_3.Rd rename to man/step_predictor_retain.Rd index 160bd54..b549d23 100644 --- a/man/step_select_3.Rd +++ b/man/step_predictor_retain.Rd @@ -1,18 +1,18 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/step-select-3.R -\name{step_select_3} -\alias{step_select_3} +% Please edit documentation in R/step-predictor-retain.R +\name{step_predictor_retain} +\alias{step_predictor_retain} \title{Feature Selection} \usage{ -step_select_3( +step_predictor_retain( recipe, ..., + score, role = NA, trained = FALSE, - threshold = 0.9, removals = NULL, skip = FALSE, - id = rand_id("select_3") + id = rand_id("predictor_retain") ) } \arguments{ @@ -22,13 +22,13 @@ operations for this recipe.} \item{...}{One or more selector functions to choose variables for this step. See \code{\link[recipes:selections]{selections()}} for more details.} +\item{score}{...} + \item{role}{Not used by this step since no new variables are created.} \item{trained}{A logical to indicate if the quantities for preprocessing have been estimated.} -\item{threshold}{...} - \item{removals}{A character string that contains the names of columns that should be removed. These values are not determined until \code{\link[recipes:prep]{recipes::prep()}} is called.} @@ -46,7 +46,7 @@ An updated version of \code{recipe} with the new step added to the sequence of any existing operations. } \description{ -\code{step_select_3()} creates a \emph{specification} of a recipe step that will +\code{step_predictor_retain()} creates a \emph{specification} of a recipe step that will perform feature selection by ... } \details{ @@ -66,13 +66,6 @@ returned with columns \code{terms} and \code{id}: \item{terms}{character, the selectors or variables selected to be removed} \item{id}{character, id of this step} } -} - -\section{Tuning Parameters}{ -This step has 1 tuning parameters: -\itemize{ -\item \code{threshold}: Threshold (type: double, default: 0.9) -} The underlying operation does not allow for case weights. } @@ -81,7 +74,10 @@ The underlying operation does not allow for case weights. library(recipes) rec <- recipe(mpg ~ ., data = mtcars) |> - step_select_3(all_predictors()) + step_predictor_retain( + all_predictors(), + score = cor_pearson >= 0.75 & imp_rf_oblique >= 0 + ) prepped <- prep(rec) diff --git a/tests/testthat/_snaps/step-select-3.md b/tests/testthat/_snaps/step-predictor-retain.md similarity index 91% rename from tests/testthat/_snaps/step-select-3.md rename to tests/testthat/_snaps/step-predictor-retain.md index 4ff1096..5e8df25 100644 --- a/tests/testthat/_snaps/step-select-3.md +++ b/tests/testthat/_snaps/step-predictor-retain.md @@ -69,9 +69,10 @@ # bad args Code - prep(step_select_3(recipe(mpg ~ ., mtcars), all_predictors(), threshold = 2)) + prep(step_predictor_retain(recipe(mpg ~ ., mtcars), all_predictors(), + threshold = 2)) Condition - Error in `step_select_3()`: + Error in `step_predictor_retain()`: Caused by error in `prep()`: ! `threshold` must be a number between 0 and 1, not the number 2. diff --git a/tests/testthat/test-step-select-3.R b/tests/testthat/test-step-predictor-retain.R similarity index 78% rename from tests/testthat/test-step-select-3.R rename to tests/testthat/test-step-predictor-retain.R index b314af6..5dae920 100644 --- a/tests/testthat/test-step-select-3.R +++ b/tests/testthat/test-step-predictor-retain.R @@ -1,14 +1,16 @@ +skip() + # Infrastructure --------------------------------------------------------------- test_that("bake method errors when needed non-standard role columns are missing", { # Here for completeness - # step_select_3() removes variables and thus does not care if they are not there. + # step_predictor_retain() removes variables and thus does not care if they are not there. expect_true(TRUE) }) test_that("empty printing", { rec <- recipe(mpg ~ ., mtcars) - rec <- step_select_3(rec) + rec <- step_predictor_retain(rec) expect_snapshot(rec) @@ -19,7 +21,7 @@ test_that("empty printing", { test_that("empty selection prep/bake is a no-op", { rec1 <- recipe(mpg ~ ., mtcars) - rec2 <- step_select_3(rec1) + rec2 <- step_predictor_retain(rec1) rec1 <- prep(rec1, mtcars) rec2 <- prep(rec2, mtcars) @@ -32,7 +34,7 @@ test_that("empty selection prep/bake is a no-op", { test_that("empty selection tidy method works", { rec <- recipe(mpg ~ ., mtcars) - rec <- step_select_3(rec) + rec <- step_predictor_retain(rec) expect <- tibble(terms = character(), id = character()) @@ -46,7 +48,7 @@ test_that("empty selection tidy method works", { test_that("printing", { set.seed(1) rec <- recipe(~., data = mtcars) |> - step_select_3(all_predictors()) + step_predictor_retain(all_predictors()) expect_snapshot(print(rec)) expect_snapshot(prep(rec)) @@ -55,7 +57,7 @@ test_that("printing", { test_that("tunable is setup to work with extract_parameter_set_dials", { skip_if_not_installed("dials") rec <- recipe(~., data = mtcars) |> - step_select_3(all_predictors(), threshold = hardhat::tune()) + step_predictor_retain(all_predictors(), threshold = hardhat::tune()) params <- extract_parameter_set_dials(rec) @@ -66,7 +68,7 @@ test_that("tunable is setup to work with extract_parameter_set_dials", { test_that("bad args", { expect_snapshot( recipe(mpg ~ ., mtcars) |> - step_select_3(all_predictors(), threshold = 2) |> + step_predictor_retain(all_predictors(), threshold = 2) |> prep(), error = TRUE ) @@ -75,7 +77,7 @@ test_that("bad args", { test_that("0 and 1 rows data work in bake method", { data <- mtcars rec <- recipe(~., data) |> - step_select_3(all_numeric_predictors()) |> + step_predictor_retain(all_numeric_predictors()) |> prep() expect_identical( From b172557f2688d4e696c5fe609b91f0ec66339d68 Mon Sep 17 00:00:00 2001 From: Emil Hvitfeldt Date: Wed, 20 Aug 2025 17:30:48 -0700 Subject: [PATCH 19/65] use outcome_name in step_predictor_retain() --- R/step-predictor-retain.R | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/R/step-predictor-retain.R b/R/step-predictor-retain.R index 0774059..c170b92 100644 --- a/R/step-predictor-retain.R +++ b/R/step-predictor-retain.R @@ -108,8 +108,14 @@ prep.step_predictor_retain <- function(x, training, info = NULL, ...) { wts <- NULL } + outcome_name <- pull_outcome_column_name(info) + if (length(col_names) > 1) { - filter <- calculate_predictor_retain(x$score, data = training[, col_names]) + filter <- calculate_predictor_retain( + xpr = x$score, + outcome = outcome_name, + data = training[, c(outcome_name, col_names)] + ) } else { filter <- character(0) } From 51f05e89cfdea13d27624a7bcc868b8832b62e51 Mon Sep 17 00:00:00 2001 From: Emil Hvitfeldt Date: Wed, 20 Aug 2025 17:30:56 -0700 Subject: [PATCH 20/65] add some tests for step_predictor_retain --- .../testthat/_snaps/step-predictor-retain.md | 10 +- tests/testthat/test-step-predictor-retain.R | 116 +++++++++++++++--- 2 files changed, 106 insertions(+), 20 deletions(-) diff --git a/tests/testthat/_snaps/step-predictor-retain.md b/tests/testthat/_snaps/step-predictor-retain.md index 5e8df25..64e120a 100644 --- a/tests/testthat/_snaps/step-predictor-retain.md +++ b/tests/testthat/_snaps/step-predictor-retain.md @@ -43,7 +43,8 @@ -- Inputs Number of variables by role - predictor: 11 + outcome: 1 + predictor: 10 -- Operations * Feature selection on: all_predictors() @@ -58,13 +59,14 @@ -- Inputs Number of variables by role - predictor: 11 + outcome: 1 + predictor: 10 -- Training information Training data contained 32 data points and no incomplete rows. -- Operations - * Feature selection on: | Trained + * Feature selection on: cyl, disp, hp, wt | Trained # bad args @@ -74,5 +76,5 @@ Condition Error in `step_predictor_retain()`: Caused by error in `prep()`: - ! `threshold` must be a number between 0 and 1, not the number 2. + ! The following argument was specified but does not exist: `threshold`. diff --git a/tests/testthat/test-step-predictor-retain.R b/tests/testthat/test-step-predictor-retain.R index 5dae920..43b00a1 100644 --- a/tests/testthat/test-step-predictor-retain.R +++ b/tests/testthat/test-step-predictor-retain.R @@ -1,4 +1,96 @@ -skip() +test_that("step works", { + rec <- recipe(mpg ~ ., data = mtcars) |> + step_predictor_retain( + all_predictors(), + score = abs(cor_pearson) >= 0.75 & abs(cor_spearman) >= 0.6 + ) + + prepped <- prep(rec) + res_bake <- bake(prepped, mtcars) + res_tidy <- tidy(prepped, 1) + + cor_pearson_res <- filtro::score_cor_pearson |> + filtro::fit(mpg ~ ., data = mtcars) + cor_spearman_res <- filtro::score_cor_spearman |> + filtro::fit(mpg ~ ., data = mtcars) + + exp <- filtro::fill_safe_values(list( + cor_pearson_res, + cor_spearman_res + )) |> + dplyr::filter(abs(cor_pearson) >= 0.75 & abs(cor_spearman) >= 0.6) |> + dplyr::pull(predictor) + + expect_identical( + sort(setdiff(names(mtcars), names(res_bake))), + sort(exp) + ) + expect_identical( + sort(res_tidy$terms), + sort(exp) + ) +}) + +test_that("step allows for no removals", { + rec <- recipe(mpg ~ ., data = mtcars) |> + step_predictor_retain( + all_predictors(), + score = abs(cor_pearson) >= 0.99 & abs(cor_spearman) >= 0.99 + ) + + prepped <- prep(rec) + res_bake <- bake(prepped, mtcars) + res_tidy <- tidy(prepped, 1) + + cor_pearson_res <- filtro::score_cor_pearson |> + filtro::fit(mpg ~ ., data = mtcars) + cor_spearman_res <- filtro::score_cor_spearman |> + filtro::fit(mpg ~ ., data = mtcars) + + exp <- filtro::fill_safe_values(list( + cor_pearson_res, + cor_spearman_res + )) |> + dplyr::filter(abs(cor_pearson) >= 0.99 & abs(cor_spearman) >= 0.99) |> + dplyr::pull(predictor) + + expect_identical( + sort(setdiff(names(mtcars), names(res_bake))), + sort(exp) + ) + expect_identical( + sort(res_tidy$terms), + sort(exp) + ) +}) + +test_that("allows for one score", { + rec <- recipe(mpg ~ ., data = mtcars) |> + step_predictor_retain( + all_predictors(), + score = abs(cor_pearson) >= 0.7 + ) + + prepped <- prep(rec) + res_bake <- bake(prepped, mtcars) + res_tidy <- tidy(prepped, 1) + + cor_pearson_res <- filtro::score_cor_pearson |> + filtro::fit(mpg ~ ., data = mtcars) + + exp <- cor_pearson_res@results |> + dplyr::filter(abs(score) >= 0.7) |> + dplyr::pull(predictor) + + expect_identical( + sort(setdiff(names(mtcars), names(res_bake))), + sort(exp) + ) + expect_identical( + sort(res_tidy$terms), + sort(exp) + ) +}) # Infrastructure --------------------------------------------------------------- @@ -47,24 +139,13 @@ test_that("empty selection tidy method works", { test_that("printing", { set.seed(1) - rec <- recipe(~., data = mtcars) |> - step_predictor_retain(all_predictors()) + rec <- recipe(mpg ~ ., data = mtcars) |> + step_predictor_retain(all_predictors(), score = abs(cor_pearson) >= 0.75) expect_snapshot(print(rec)) expect_snapshot(prep(rec)) }) -test_that("tunable is setup to work with extract_parameter_set_dials", { - skip_if_not_installed("dials") - rec <- recipe(~., data = mtcars) |> - step_predictor_retain(all_predictors(), threshold = hardhat::tune()) - - params <- extract_parameter_set_dials(rec) - - expect_s3_class(params, "parameters") - expect_identical(nrow(params), 1L) -}) - test_that("bad args", { expect_snapshot( recipe(mpg ~ ., mtcars) |> @@ -76,8 +157,11 @@ test_that("bad args", { test_that("0 and 1 rows data work in bake method", { data <- mtcars - rec <- recipe(~., data) |> - step_predictor_retain(all_numeric_predictors()) |> + rec <- recipe(mpg ~ ., data) |> + step_predictor_retain( + all_numeric_predictors(), + score = abs(cor_pearson) >= 0.75 + ) |> prep() expect_identical( From ac0ff8519c6f1a5caaf4f482535d33bfb8276dd5 Mon Sep 17 00:00:00 2001 From: Emil Hvitfeldt Date: Wed, 20 Aug 2025 17:36:02 -0700 Subject: [PATCH 21/65] slight example update --- R/step-predictor-retain.R | 2 +- man/step_predictor_retain.Rd | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/R/step-predictor-retain.R b/R/step-predictor-retain.R index c170b92..7c35988 100644 --- a/R/step-predictor-retain.R +++ b/R/step-predictor-retain.R @@ -40,7 +40,7 @@ #' rec <- recipe(mpg ~ ., data = mtcars) |> #' step_predictor_retain( #' all_predictors(), -#' score = cor_pearson >= 0.75 & imp_rf_oblique >= 0 +#' score = cor_pearson >= 0.75 & cor_spearman >= 0.75 #' ) #' #' prepped <- prep(rec) diff --git a/man/step_predictor_retain.Rd b/man/step_predictor_retain.Rd index b549d23..9f74e90 100644 --- a/man/step_predictor_retain.Rd +++ b/man/step_predictor_retain.Rd @@ -76,7 +76,7 @@ library(recipes) rec <- recipe(mpg ~ ., data = mtcars) |> step_predictor_retain( all_predictors(), - score = cor_pearson >= 0.75 & imp_rf_oblique >= 0 + score = cor_pearson >= 0.75 & cor_spearman >= 0.75 ) prepped <- prep(rec) From f27f746b296996055e6e1b349a3b39831f52cbd2 Mon Sep 17 00:00:00 2001 From: topepo Date: Thu, 21 Aug 2025 13:32:22 -0400 Subject: [PATCH 22/65] try moving desirability2 to suggests --- DESCRIPTION | 2 +- R/recipe_utils.R | 1 + R/step_predictor_desirability.R | 21 ++++++++++++++++++--- man/step_predictor_desirability.Rd | 21 ++++++++++++++++++--- 4 files changed, 38 insertions(+), 7 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index f59ce42..ac167ea 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -21,7 +21,6 @@ Depends: recipes (>= 1.1.0) Imports: cli, - desirability2 (>= 0.1.0.9000), dplyr, filtro, generics, @@ -36,6 +35,7 @@ Imports: withr, workflows Suggests: + desirability2 (>= 0.1.0.9000), censored, future, future.apply, diff --git a/R/recipe_utils.R b/R/recipe_utils.R index 5588f7b..752a0bb 100644 --- a/R/recipe_utils.R +++ b/R/recipe_utils.R @@ -19,6 +19,7 @@ check_score_names <- function(x, call) { res } +# TODO this should go into recipes pull_outcome_column_name <- function(x) { outcome_name <- x$variable[x$role == "outcome"] num_outcomes <- length(outcome_name) diff --git a/R/step_predictor_desirability.R b/R/step_predictor_desirability.R index fb51e51..cee5731 100644 --- a/R/step_predictor_desirability.R +++ b/R/step_predictor_desirability.R @@ -50,16 +50,29 @@ #' cat(result) #' ``` #' -#' The underlying operation does not allow for case weights. +#' ## Ties +#' +#' Note that `dplyr::slice_max()` with the argument `with_ties = TRUE ` is used +#' to select predictors. If there are many ties in overall desirability, the +#' proportion selected can be larger than the value given to `prep_terms()`. +#' +#' ## Case Weights +#' +#' Case weights can be used by some scoring functions. To learn more, load the +#' \pkg{filtro} package and check the `case_weights` property of the score +#' object (see Examples below). For a recipe, use one of the tidymodels case +#' weight functions such as [hardhat::importance_weights()] or +#' [hardhat:: frequency_weights()], to assign the correct data type to the +#' vector of case weights. A recipe will then interpret that class to be a case +#' weight (and no other role). A full example is below. #' #' @seealso [desirability2::desirability()] #' @references Derringer, G. and Suich, R. (1980), Simultaneous Optimization of #' Several Response Variables. _Journal of Quality Technology_, 12, 214-219. #' @examples #' library(recipes) -#' library(desirability2) #' -#' if (rlang::is_installed("modeldata")) { +#' if (rlang::is_installed(c("modeldata", "desirability2"))) { #' # The `ad_data` has a binary outcome column ("Class") and mostly numeric #' # predictors. For these, we score the predictors using an analysis of #' # variance model where the predicor is the outcome and the outcome class @@ -69,6 +82,7 @@ #' # tests, the -log10(pvalue) is returned so that larger values are more #' # important. #' +#' library(desirability2) #' # The score_* objects here are from the filtro package. See Details above. #' goals <- #' desirability( @@ -211,6 +225,7 @@ step_predictor_desirability_new <- #' @export prep.step_predictor_desirability <- function(x, training, info = NULL, ...) { + rlang::check_installed("desirability2") col_names <- recipes_eval_select(x$terms, training, info) check_type(training[, col_names], types = c("double", "integer", "factor")) check_number_decimal(x$prop_terms, min = .Machine$double.eps, max = 1, arg = "prop_terms") diff --git a/man/step_predictor_desirability.Rd b/man/step_predictor_desirability.Rd index 0ae61ac..6a1d9c3 100644 --- a/man/step_predictor_desirability.Rd +++ b/man/step_predictor_desirability.Rd @@ -91,15 +91,29 @@ This step has 1 tuning parameters: \itemize{ \item \code{prop_terms}: Threshold (type: double, default: 0.5) } +\subsection{Ties}{ -The underlying operation does not allow for case weights. +Note that \code{dplyr::slice_max()} with the argument \code{with_ties = TRUE } is used +to select predictors. If there are many ties in overall desirability, the +proportion selected can be larger than the value given to \code{prep_terms()}. +} + +\subsection{Case Weights}{ + +Case weights can be used by some scoring functions. To learn more, load the +\pkg{filtro} package and check the \code{case_weights} property of the score +object (see Examples below). For a recipe, use one of the tidymodels case +weight functions such as \code{\link[hardhat:importance_weights]{hardhat::importance_weights()}} or +\code{\link[hardhat:frequency_weights]{hardhat:: frequency_weights()}}, to assign the correct data type to the +vector of case weights. A recipe will then interpret that class to be a case +weight (and no other role). A full example is below. +} } \examples{ library(recipes) -library(desirability2) -if (rlang::is_installed("modeldata")) { +if (rlang::is_installed(c("modeldata", "desirability2"))) { # The `ad_data` has a binary outcome column ("Class") and mostly numeric # predictors. For these, we score the predictors using an analysis of # variance model where the predicor is the outcome and the outcome class @@ -109,6 +123,7 @@ if (rlang::is_installed("modeldata")) { # tests, the -log10(pvalue) is returned so that larger values are more # important. + library(desirability2) # The score_* objects here are from the filtro package. See Details above. goals <- desirability( From 67bc87418ab3d6b5b0d9c15b734d996625920424 Mon Sep 17 00:00:00 2001 From: topepo Date: Thu, 21 Aug 2025 14:32:02 -0400 Subject: [PATCH 23/65] more documentation --- NAMESPACE | 1 + R/step_predictor_desirability.R | 203 ++++++++++++++++------------- man/step_predictor_desirability.Rd | 192 +++++++++++++++------------ 3 files changed, 225 insertions(+), 171 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index 2a23f08..2d8aae2 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -10,6 +10,7 @@ S3method(prep,step_predictor_retain) S3method(print,step_predictor_best) S3method(print,step_predictor_desirability) S3method(print,step_predictor_retain) +S3method(required_pkgs,step_predictor_desirability) S3method(tidy,step_predictor_best) S3method(tidy,step_predictor_desirability) S3method(tidy,step_predictor_retain) diff --git a/R/step_predictor_desirability.R b/R/step_predictor_desirability.R index cee5731..a74f38e 100644 --- a/R/step_predictor_desirability.R +++ b/R/step_predictor_desirability.R @@ -1,54 +1,62 @@ -#' Feature Selection +#'Feature Selection #' #' `step_predictor_desirability()` creates a *specification* of a recipe step -#' that uses one or more "score" functions to measure how how much each -#' predictor is related to the outcome value. These scores are combined into -#' a composite value using user-specified _desirability_ functions and a -#' proportion of the most desirable predictors are retained. +#' that uses one or more "score" functions to measure how how much each predictor +#' is related to the outcome value. These scores are combined into a composite +#' value using user-specified _desirability_ functions and a proportion of the +#' most desirable predictors are retained. #' #' @inheritParams recipes::step_center #' @param score An object produced by [desirability2::desirability()] that uses #' one or more score functions from the \pkg{filtro} package. See the Details #' and Examples sections below. This argument *should be named* when used. #' @param prop_terms The proportion of predictors that should be retained when -#' ordered by overall desirability. +#' ordered by overall desirability. A value of [hardhat::tune()] can also be +#' used. #' @param update_prop A logical: should `prop_terms` be updated so that at least #' one predictor will be retained? #' @param removals A character string that contains the names of predictors that #' should be removed. These values are not determined until [recipes::prep()] #' is called. #' @param results A data frame of score and desirability values for each -#' predictor evaluated. These values are not determined until -#' [recipes::prep()] is called. -#' @return An updated version of `recipe` with the new step added to the -#' sequence of any existing operations. +#' predictor evaluated. These values are not determined until [recipes::prep()] +#' is called. #' @export #' #' @details #' -#' This step ... +#' This recipe step can compute one or more scores and conduct a simultaneous +#' selection of the top predictors using _desirability functions_. These are +#' functions that, for some type of goal, translate the score's values to a +#' scale of `[0, 1]`, where 1.0 is the best result and 0.0 is unacceptable. +#' Once we have these for each score, the overall desirability is computed +#' using the geometric mean of the individual desirabilities. See the examples +#' in [desirability2::d_overall()] and [desirability2::d_max()]. #' -#' This step can potentially remove columns from the data set. This may -#' cause issues for subsequent steps in your recipe if the missing columns are -#' specifically referenced by name. To avoid this, see the advice in the -#' _Tips for saving recipes and filtering columns_ section of -#' [recipes::selections]. +#' To define desirabilities, use [desirability2::desirability()] function to +#' define _goals_ for each score and pass that to the recipe in the `score` +#' argument. #' -#' # Tidying +#' Some important notes: #' -#' When you [`tidy()`][recipes::tidy.recipe] this step, a tibble::tibble is -#' returned with columns `terms` and `id`: +#' - Scores that are p-values are automatically transformed by \pkg{filtro} to +#' be in the format `-log10(pvalue)` so that a p-value of 0.1 is converted to +#' 1.0. For these, use the `maximize()` goal. #' -#' \describe{ -#' \item{terms}{character, the selectors or variables selected to be removed} -#' \item{id}{character, id of this step} -#' } +#' - Other scores are also transformed in the data. For example, the correlation +#' scores given to the recipe step are in absolute value format. See the +#' \pkg{filtro} documentation for each score. +#' +#' - You can use some in-line functions using base R functions. For example, +#' `maximize(max(score_cor_spearman))`. +#' +#' - If a predictor cannot be computed for all scores, it is given a "fallback +#' value" that will prevent it from being excluded for this reason. #' -#' ```{r, echo = FALSE, results="asis"} -#' step <- "step_predictor_desirability" -#' result <- knitr::knit_child("man/rmd/tunable-args.Rmd") -#' cat(result) -#' ``` +#' This step can potentially remove columns from the data set. This may cause +#' issues for subsequent steps in your recipe if the missing columns are +#' specifically referenced by name. To avoid this, see the advice in the _Tips +#' for saving recipes and filtering columns_ section of [recipes::selections]. #' #' ## Ties #' @@ -59,16 +67,35 @@ #' ## Case Weights #' #' Case weights can be used by some scoring functions. To learn more, load the -#' \pkg{filtro} package and check the `case_weights` property of the score -#' object (see Examples below). For a recipe, use one of the tidymodels case -#' weight functions such as [hardhat::importance_weights()] or -#' [hardhat:: frequency_weights()], to assign the correct data type to the -#' vector of case weights. A recipe will then interpret that class to be a case -#' weight (and no other role). A full example is below. +#' \pkg{filtro} package and check the `case_weights` property of the score object +#' (see Examples below). For a recipe, use one of the tidymodels case weight +#' functions such as [hardhat::importance_weights()] or +#' [hardhat::frequency_weights], to assign the correct data type to the vector of case +#' weights. A recipe will then interpret that class to be a case weight (and no +#' other role). A full example is below. +#' +#' ## Tidy method +#' +#' For a trained recipe, the `tidy()` method will return a tibble with columns +#' `terms` (the predictor names), `id`, columns for the estimated scores, and +#' the desirability results. The desirability columns will have the same name +#' as the scores with an additional prefix of `.d_`. The overall desirability +#' column is called `.d_overall`. +#' +#' @return An updated version of `recipe` with the new step added to the +#' sequence of any existing operations. When you +#' [`tidy()`][recipes::tidy.recipe] this step, a tibble::tibble is returned +#' with columns `terms` and `id`: +#' +#' \describe{ +#' \item{terms}{character, the selectors or variables selected to be removed} +#' \item{id}{character, id of this step} +#' } +#' Once trained, additional columns are included (see Details section). #' #' @seealso [desirability2::desirability()] #' @references Derringer, G. and Suich, R. (1980), Simultaneous Optimization of -#' Several Response Variables. _Journal of Quality Technology_, 12, 214-219. +#' Several Response Variables. _Journal of Quality Technology_, 12, 214-219. #' @examples #' library(recipes) #' @@ -84,11 +111,11 @@ #' #' library(desirability2) #' # The score_* objects here are from the filtro package. See Details above. -#' goals <- -#' desirability( -#' maximize(score_xtab_pval_fisher), -#' maximize(score_aov_pval) -#' ) +#' goals <- +#' desirability( +#' maximize(score_xtab_pval_fisher), +#' maximize(score_aov_pval) +#' ) #' #' example_data <- modeldata::ad_data #' rec <- @@ -111,50 +138,50 @@ #' #' # -------------------------------------------------------------------------- #' -#' # Case-weight example: use the hardhat package to create the appropriate type -#' # of case weights. Here, we'll increase the weights for the minority class and -#' # add them to the data frame. -#' -#' library(hardhat) -#' -#' example_weights <- example_data -#' weights <- ifelse(example_data$Class == "Impaired", 5, 1) -#' example_weights$weights <- importance_weights(weights) -#' -#' # To see if the scores can use case weights, load the filtro package and -#' # check the `case_weights` property: -#' -#' library(filtro) -#' -#' score_xtab_pval_fisher@case_weights -#' score_aov_pval@case_weights -#' -#' # The recipe will automatically find the case weights and will -#' # not treat them as predictors. -#' rec_wts <- -#' recipe(Class ~ ., data = example_weights) |> -#' step_predictor_desirability( -#' all_predictors(), -#' score = goals, -#' prop_terms = 1/2 -#' ) |> -#' prep() -#' rec_wts -#' -#' predictor_scores_wts <- -#' tidy(rec_wts, number = 1) |> -#' select(terms, .d_overall_weighted = .d_overall) -#' -#' library(dplyr) -#' library(ggplot2) -#' -#' # The selection did not substantually change with these case weights -#' full_join(predictor_scores, predictor_scores_wts, by = "terms") |> -#' ggplot(aes(.d_overall, .d_overall_weighted)) + -#' geom_abline(col = "darkgreen", lty = 2) + -#' geom_point(alpha = 1 / 2) + -#' coord_fixed(ratio = 1) + -#' labs(x = "Unweighted", y = "Class Weighted") +#' # Case-weight example: use the hardhat package to create the appropriate type +#' # of case weights. Here, we'll increase the weights for the minority class and +#' # add them to the data frame. +#' +#' library(hardhat) +#' +#' example_weights <- example_data +#' weights <- ifelse(example_data$Class == "Impaired", 5, 1) +#' example_weights$weights <- importance_weights(weights) +#' +#' # To see if the scores can use case weights, load the filtro package and +#' # check the `case_weights` property: +#' +#' library(filtro) +#' +#' score_xtab_pval_fisher@case_weights +#' score_aov_pval@case_weights +#' +#' # The recipe will automatically find the case weights and will +#' # not treat them as predictors. +#' rec_wts <- +#' recipe(Class ~ ., data = example_weights) |> +#' step_predictor_desirability( +#' all_predictors(), +#' score = goals, +#' prop_terms = 1/2 +#' ) |> +#' prep() +#' rec_wts +#' +#' predictor_scores_wts <- +#' tidy(rec_wts, number = 1) |> +#' select(terms, .d_overall_weighted = .d_overall) +#' +#' library(dplyr) +#' library(ggplot2) +#' +#' # The selection did not substantually change with these case weights +#' full_join(predictor_scores, predictor_scores_wts, by = "terms") |> +#' ggplot(aes(.d_overall, .d_overall_weighted)) + +#' geom_abline(col = "darkgreen", lty = 2) + +#' geom_point(alpha = 1 / 2) + +#' coord_fixed(ratio = 1) + +#' labs(x = "Unweighted", y = "Class Weighted") #' } step_predictor_desirability <- function( recipe, @@ -265,7 +292,6 @@ prep.step_predictor_desirability <- function(x, training, info = NULL, ...) { fm <- paste(outcome_name, "~ .") fm <- stats::as.formula(fm) - # TODO ooof... how to handle case weights? score_objs <- purrr::map( score_names, @@ -339,8 +365,6 @@ tidy.step_predictor_desirability <- function(x, ...) { res } -# TODO make a new dials parameter for prop_terms - #' @export tunable.step_predictor_desirability <- function(x, ...) { tibble::tibble( @@ -353,3 +377,8 @@ tunable.step_predictor_desirability <- function(x, ...) { component_id = x$id ) } + +#' @export +required_pkgs.step_predictor_desirability <- function(x, ...) { + c("important", "filtro", "desirability2") +} diff --git a/man/step_predictor_desirability.Rd b/man/step_predictor_desirability.Rd index 6a1d9c3..9598d3d 100644 --- a/man/step_predictor_desirability.Rd +++ b/man/step_predictor_desirability.Rd @@ -35,14 +35,15 @@ and Examples sections below. This argument \emph{should be named} when used.} been estimated.} \item{prop_terms}{The proportion of predictors that should be retained when -ordered by overall desirability.} +ordered by overall desirability. A value of \code{\link[hardhat:tune]{hardhat::tune()}} can also be +used.} \item{update_prop}{A logical: should \code{prop_terms} be updated so that at least one predictor will be retained?} \item{results}{A data frame of score and desirability values for each -predictor evaluated. These values are not determined until -\code{\link[recipes:prep]{recipes::prep()}} is called.} +predictor evaluated. These values are not determined until \code{\link[recipes:prep]{recipes::prep()}} +is called.} \item{removals}{A character string that contains the names of predictors that should be removed. These values are not determined until \code{\link[recipes:prep]{recipes::prep()}} @@ -58,39 +59,54 @@ may affect the computations for subsequent operations.} } \value{ An updated version of \code{recipe} with the new step added to the -sequence of any existing operations. -} -\description{ -\code{step_predictor_desirability()} creates a \emph{specification} of a recipe step -that uses one or more "score" functions to measure how how much each -predictor is related to the outcome value. These scores are combined into -a composite value using user-specified \emph{desirability} functions and a -proportion of the most desirable predictors are retained. -} -\details{ -This step ... - -This step can potentially remove columns from the data set. This may -cause issues for subsequent steps in your recipe if the missing columns are -specifically referenced by name. To avoid this, see the advice in the -\emph{Tips for saving recipes and filtering columns} section of -\link[recipes:selections]{recipes::selections}. -} -\section{Tidying}{ -When you \code{\link[recipes:tidy.recipe]{tidy()}} this step, a tibble::tibble is -returned with columns \code{terms} and \code{id}: +sequence of any existing operations. When you +\code{\link[recipes:tidy.recipe]{tidy()}} this step, a tibble::tibble is returned +with columns \code{terms} and \code{id}: \describe{ \item{terms}{character, the selectors or variables selected to be removed} \item{id}{character, id of this step} } +Once trained, additional columns are included (see Details section). } - -\section{Tuning Parameters}{ -This step has 1 tuning parameters: +\description{ +\code{step_predictor_desirability()} creates a \emph{specification} of a recipe step +that uses one or more "score" functions to measure how how much each predictor +is related to the outcome value. These scores are combined into a composite +value using user-specified \emph{desirability} functions and a proportion of the +most desirable predictors are retained. +} +\details{ +This recipe step can compute one or more scores and conduct a simultaneous +selection of the top predictors using \emph{desirability functions}. These are +functions that, for some type of goal, translate the score's values to a +scale of \verb{[0, 1]}, where 1.0 is the best result and 0.0 is unacceptable. +Once we have these for each score, the overall desirability is computed +using the geometric mean of the individual desirabilities. See the examples +in \code{\link[desirability2:d_overall]{desirability2::d_overall()}} and \code{\link[desirability2:inline_desirability]{desirability2::d_max()}}. + +To define desirabilities, use \code{\link[desirability2:desirability]{desirability2::desirability()}} function to +define \emph{goals} for each score and pass that to the recipe in the \code{score} +argument. + +Some important notes: \itemize{ -\item \code{prop_terms}: Threshold (type: double, default: 0.5) +\item Scores that are p-values are automatically transformed by \pkg{filtro} to +be in the format \code{-log10(pvalue)} so that a p-value of 0.1 is converted to +1.0. For these, use the \code{maximize()} goal. +\item Other scores are also transformed in the data. For example, the correlation +scores given to the recipe step are in absolute value format. See the +\pkg{filtro} documentation for each score. +\item You can use some in-line functions using base R functions. For example, +\code{maximize(max(score_cor_spearman))}. +\item If a predictor cannot be computed for all scores, it is given a "fallback +value" that will prevent it from being excluded for this reason. } + +This step can potentially remove columns from the data set. This may cause +issues for subsequent steps in your recipe if the missing columns are +specifically referenced by name. To avoid this, see the advice in the \emph{Tips +for saving recipes and filtering columns} section of \link[recipes:selections]{recipes::selections}. \subsection{Ties}{ Note that \code{dplyr::slice_max()} with the argument \code{with_ties = TRUE } is used @@ -101,15 +117,23 @@ proportion selected can be larger than the value given to \code{prep_terms()}. \subsection{Case Weights}{ Case weights can be used by some scoring functions. To learn more, load the -\pkg{filtro} package and check the \code{case_weights} property of the score -object (see Examples below). For a recipe, use one of the tidymodels case -weight functions such as \code{\link[hardhat:importance_weights]{hardhat::importance_weights()}} or -\code{\link[hardhat:frequency_weights]{hardhat:: frequency_weights()}}, to assign the correct data type to the -vector of case weights. A recipe will then interpret that class to be a case -weight (and no other role). A full example is below. -} +\pkg{filtro} package and check the \code{case_weights} property of the score object +(see Examples below). For a recipe, use one of the tidymodels case weight +functions such as \code{\link[hardhat:importance_weights]{hardhat::importance_weights()}} or +\link[hardhat:frequency_weights]{hardhat::frequency_weights}, to assign the correct data type to the vector of case +weights. A recipe will then interpret that class to be a case weight (and no +other role). A full example is below. } +\subsection{Tidy method}{ + +For a trained recipe, the \code{tidy()} method will return a tibble with columns +\code{terms} (the predictor names), \code{id}, columns for the estimated scores, and +the desirability results. The desirability columns will have the same name +as the scores with an additional prefix of \code{.d_}. The overall desirability +column is called \code{.d_overall}. +} +} \examples{ library(recipes) @@ -125,11 +149,11 @@ if (rlang::is_installed(c("modeldata", "desirability2"))) { library(desirability2) # The score_* objects here are from the filtro package. See Details above. - goals <- - desirability( - maximize(score_xtab_pval_fisher), - maximize(score_aov_pval) - ) + goals <- + desirability( + maximize(score_xtab_pval_fisher), + maximize(score_aov_pval) + ) example_data <- modeldata::ad_data rec <- @@ -152,50 +176,50 @@ if (rlang::is_installed(c("modeldata", "desirability2"))) { # -------------------------------------------------------------------------- -# Case-weight example: use the hardhat package to create the appropriate type -# of case weights. Here, we'll increase the weights for the minority class and -# add them to the data frame. - -library(hardhat) - -example_weights <- example_data -weights <- ifelse(example_data$Class == "Impaired", 5, 1) -example_weights$weights <- importance_weights(weights) - -# To see if the scores can use case weights, load the filtro package and -# check the `case_weights` property: - -library(filtro) - -score_xtab_pval_fisher@case_weights -score_aov_pval@case_weights - -# The recipe will automatically find the case weights and will -# not treat them as predictors. -rec_wts <- - recipe(Class ~ ., data = example_weights) |> - step_predictor_desirability( - all_predictors(), - score = goals, - prop_terms = 1/2 - ) |> - prep() -rec_wts - -predictor_scores_wts <- - tidy(rec_wts, number = 1) |> - select(terms, .d_overall_weighted = .d_overall) - -library(dplyr) -library(ggplot2) - -# The selection did not substantually change with these case weights -full_join(predictor_scores, predictor_scores_wts, by = "terms") |> - ggplot(aes(.d_overall, .d_overall_weighted)) + - geom_abline(col = "darkgreen", lty = 2) + - geom_point(alpha = 1 / 2) + - coord_fixed(ratio = 1) + - labs(x = "Unweighted", y = "Class Weighted") + # Case-weight example: use the hardhat package to create the appropriate type + # of case weights. Here, we'll increase the weights for the minority class and + # add them to the data frame. + + library(hardhat) + + example_weights <- example_data + weights <- ifelse(example_data$Class == "Impaired", 5, 1) + example_weights$weights <- importance_weights(weights) + + # To see if the scores can use case weights, load the filtro package and + # check the `case_weights` property: + + library(filtro) + + score_xtab_pval_fisher@case_weights + score_aov_pval@case_weights + + # The recipe will automatically find the case weights and will + # not treat them as predictors. + rec_wts <- + recipe(Class ~ ., data = example_weights) |> + step_predictor_desirability( + all_predictors(), + score = goals, + prop_terms = 1/2 + ) |> + prep() + rec_wts + + predictor_scores_wts <- + tidy(rec_wts, number = 1) |> + select(terms, .d_overall_weighted = .d_overall) + + library(dplyr) + library(ggplot2) + + # The selection did not substantually change with these case weights + full_join(predictor_scores, predictor_scores_wts, by = "terms") |> + ggplot(aes(.d_overall, .d_overall_weighted)) + + geom_abline(col = "darkgreen", lty = 2) + + geom_point(alpha = 1 / 2) + + coord_fixed(ratio = 1) + + labs(x = "Unweighted", y = "Class Weighted") } } \references{ From 37494b89944819b6cac0659cfc76604d265d0908 Mon Sep 17 00:00:00 2001 From: topepo Date: Thu, 21 Aug 2025 16:02:56 -0400 Subject: [PATCH 24/65] remotes WITH AN S --- DESCRIPTION | 4 ++-- R/step_predictor_desirability.R | 4 ++-- man/step_predictor_desirability.Rd | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index ac167ea..eba3833 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -21,6 +21,7 @@ Depends: recipes (>= 1.1.0) Imports: cli, + desirability2 (>= 0.1.0.9000), dplyr, filtro, generics, @@ -35,7 +36,6 @@ Imports: withr, workflows Suggests: - desirability2 (>= 0.1.0.9000), censored, future, future.apply, @@ -46,7 +46,7 @@ Suggests: survival, testthat (>= 3.0.0), yardstick -Remote: +Remotes: tidymodels/desirability2 Config/Needs/website: tidyverse/tidytemplate, tidymodels Config/testthat/edition: 3 diff --git a/R/step_predictor_desirability.R b/R/step_predictor_desirability.R index a74f38e..b263ae7 100644 --- a/R/step_predictor_desirability.R +++ b/R/step_predictor_desirability.R @@ -98,8 +98,9 @@ #' Several Response Variables. _Journal of Quality Technology_, 12, 214-219. #' @examples #' library(recipes) +#' library(desirability2) #' -#' if (rlang::is_installed(c("modeldata", "desirability2"))) { +#' if (rlang::is_installed("modeldata")) { #' # The `ad_data` has a binary outcome column ("Class") and mostly numeric #' # predictors. For these, we score the predictors using an analysis of #' # variance model where the predicor is the outcome and the outcome class @@ -109,7 +110,6 @@ #' # tests, the -log10(pvalue) is returned so that larger values are more #' # important. #' -#' library(desirability2) #' # The score_* objects here are from the filtro package. See Details above. #' goals <- #' desirability( diff --git a/man/step_predictor_desirability.Rd b/man/step_predictor_desirability.Rd index 9598d3d..7845b9e 100644 --- a/man/step_predictor_desirability.Rd +++ b/man/step_predictor_desirability.Rd @@ -136,8 +136,9 @@ column is called \code{.d_overall}. } \examples{ library(recipes) +library(desirability2) -if (rlang::is_installed(c("modeldata", "desirability2"))) { +if (rlang::is_installed("modeldata")) { # The `ad_data` has a binary outcome column ("Class") and mostly numeric # predictors. For these, we score the predictors using an analysis of # variance model where the predicor is the outcome and the outcome class @@ -147,7 +148,6 @@ if (rlang::is_installed(c("modeldata", "desirability2"))) { # tests, the -log10(pvalue) is returned so that larger values are more # important. - library(desirability2) # The score_* objects here are from the filtro package. See Details above. goals <- desirability( From 6c25f14256659b7c62ff48dcaff4012488e80908 Mon Sep 17 00:00:00 2001 From: topepo Date: Thu, 21 Aug 2025 16:24:02 -0400 Subject: [PATCH 25/65] fitro in Remotes --- DESCRIPTION | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index eba3833..ed3b365 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -47,7 +47,8 @@ Suggests: testthat (>= 3.0.0), yardstick Remotes: - tidymodels/desirability2 + tidymodels/desirability2, + tidymodels/filtro Config/Needs/website: tidyverse/tidytemplate, tidymodels Config/testthat/edition: 3 Config/usethis/last-upkeep: 2025-06-09 From 9b67b6d8c5c0a478bc4fa343c9567a630426fed9 Mon Sep 17 00:00:00 2001 From: topepo Date: Thu, 21 Aug 2025 17:12:48 -0400 Subject: [PATCH 26/65] load two other packages in workers --- tests/testthat/helper-objects.R | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/testthat/helper-objects.R b/tests/testthat/helper-objects.R index 7ea19d9..1466704 100644 --- a/tests/testthat/helper-objects.R +++ b/tests/testthat/helper-objects.R @@ -4,6 +4,8 @@ suppressPackageStartupMessages(library(dplyr)) suppressPackageStartupMessages(library(purrr)) suppressPackageStartupMessages(library(parsnip)) # imported by tune suppressPackageStartupMessages(library(yardstick)) # imported by tune +suppressPackageStartupMessages(library(filtro)) +suppressPackageStartupMessages(library(desirability2)) # ------------------------------------------------------------------------------ # regression examples From fb38af56abb33964be7c25f8597386372668b1db Mon Sep 17 00:00:00 2001 From: Frances Lin <37535633+franceslinyc@users.noreply.github.com> Date: Fri, 22 Aug 2025 09:36:32 -0700 Subject: [PATCH 27/65] WIP --- R/step-predictor-best.R | 61 +++++++++++++---------- tests/testthat/test-step-predictor-best.R | 6 ++- 2 files changed, 41 insertions(+), 26 deletions(-) diff --git a/R/step-predictor-best.R b/R/step-predictor-best.R index 05f60a3..42f2c44 100644 --- a/R/step-predictor-best.R +++ b/R/step-predictor-best.R @@ -4,7 +4,12 @@ #' perform feature selection by ... #' #' @inheritParams recipes::step_center -#' @param threshold ... +#' @param score ... +#' @param prop_terms The proportion of predictors that should be retained when +#' ordered by overall desirability. A value of [hardhat::tune()] can also be +#' used. +#' @param update_prop A logical: should `prop_terms` be updated so that at least +#' one predictor will be retained? #' @param removals A character string that contains the names of columns that #' should be removed. These values are not determined until [recipes::prep()] #' is called. @@ -32,20 +37,9 @@ #' \item{id}{character, id of this step} #' } #' -#' ```{r, echo = FALSE, results="asis"} -#' step <- "step_predictor_best" -#' result <- knitr::knit_child("man/rmd/tunable-args.Rmd") -#' cat(result) -#' ``` -#' -#' The underlying operation does not allow for case weights. -#' #' @examples #' library(recipes) #' -#' rec <- recipe(mpg ~ ., data = mtcars) |> -#' step_predictor_best(all_predictors()) -#' #' prepped <- prep(rec) #' #' bake(prepped, mtcars) @@ -54,9 +48,11 @@ step_predictor_best <- function( recipe, ..., + score, role = NA, trained = FALSE, - threshold = 0.9, + prop_terms = 0.5, + update_prop = TRUE, removals = NULL, skip = FALSE, id = rand_id("predictor_best") @@ -65,9 +61,11 @@ step_predictor_best <- function( recipe, step_predictor_best_new( terms = enquos(...), + score = rlang::enexpr(score), role = role, trained = trained, - threshold = threshold, + prop_terms = prop_terms, + update_prop = update_prop, removals = removals, skip = skip, id = id, @@ -79,9 +77,12 @@ step_predictor_best <- function( step_predictor_best_new <- function( terms, + score, role, trained, - threshold, + prop_terms, + update_prop = update_prop, + results, removals, skip, id, @@ -90,9 +91,12 @@ step_predictor_best_new <- step( subclass = "predictor_best", terms = terms, + score = score, role = role, trained = trained, - threshold = threshold, + prop_terms = prop_terms, + update_prop = update_prop, + results = results, removals = removals, skip = skip, id = id, @@ -104,7 +108,6 @@ step_predictor_best_new <- prep.step_predictor_best <- function(x, training, info = NULL, ...) { col_names <- recipes_eval_select(x$terms, training, info) check_type(training[, col_names], types = c("double", "integer")) - check_number_decimal(x$threshold, min = 0, max = 1, arg = "threshold") wts <- get_case_weights(info, training) were_weights_used <- are_weights_used(wts, unsupervised = TRUE) @@ -112,17 +115,20 @@ prep.step_predictor_best <- function(x, training, info = NULL, ...) { wts <- NULL } - if (length(col_names) > 1) { - filter <- character(0) - } else { - filter <- character(0) - } + # if (length(col_names) > 1) { + # filter <- character(0) + # } else { + # filter <- character(0) + # } step_predictor_best_new( terms = x$terms, + score = x$score, role = x$role, trained = TRUE, - threshold = x$threshold, + results = score_objs, + prop_terms = x$prop_terms, + update_prop = x$update_prop, removals = filter, skip = x$skip, id = x$id, @@ -142,7 +148,7 @@ print.step_predictor_best <- function( width = max(20, options()$width - 36), ... ) { - title <- "Feature selection on" + title <- "Feature selection on " print_step( x$removals, x$terms, @@ -170,7 +176,7 @@ tidy.step_predictor_best <- function(x, ...) { #' @export tunable.step_predictor_best <- function(x, ...) { tibble::tibble( - name = "threshold", + name = "prop_terms", call_info = list( list(pkg = "dials", fun = "threshold") ), @@ -179,3 +185,8 @@ tunable.step_predictor_best <- function(x, ...) { component_id = x$id ) } + +#' @export +required_pkgs.step_predictor_desirability <- function(x, ...) { + c("important", "filtro") +} diff --git a/tests/testthat/test-step-predictor-best.R b/tests/testthat/test-step-predictor-best.R index a09d47b..eeb5889 100644 --- a/tests/testthat/test-step-predictor-best.R +++ b/tests/testthat/test-step-predictor-best.R @@ -1,6 +1,10 @@ -# Infrastructure --------------------------------------------------------------- skip() +rec <- recipe(mpg ~ ., data = mtcars) |> + step_predictor_best(all_predictors(), score = cor_pearson, prop_terms = 1 / 2) + + +# Infrastructure --------------------------------------------------------------- test_that("bake method errors when needed non-standard role columns are missing", { # Here for completeness # step_predictor_best() removes variables and thus does not care if they are not there. From d5cf3e2ac0a57a107b451b8ba36df601a8f74c0b Mon Sep 17 00:00:00 2001 From: Emil Hvitfeldt Date: Fri, 22 Aug 2025 15:09:22 -0700 Subject: [PATCH 28/65] make GHA run clean --- R/important-package.R | 4 ++-- R/step-predictor-best.R | 20 +++++++++++++++----- man/step_predictor_best.Rd | 28 ++++++++++++++++------------ 3 files changed, 33 insertions(+), 19 deletions(-) diff --git a/R/important-package.R b/R/important-package.R index 8ad58ad..f35b04b 100644 --- a/R/important-package.R +++ b/R/important-package.R @@ -32,9 +32,9 @@ utils::globalVariables( "std_err", "score", ".d_overall", - "outcome" + "outcome", + "score_objs" ) ) ## usethis namespace: end NULL - diff --git a/R/step-predictor-best.R b/R/step-predictor-best.R index 42f2c44..5538788 100644 --- a/R/step-predictor-best.R +++ b/R/step-predictor-best.R @@ -10,6 +10,8 @@ #' used. #' @param update_prop A logical: should `prop_terms` be updated so that at least #' one predictor will be retained? +#' @param results Fitted filtro objects. These values are not determined until +#' [recipes::prep()] is called. #' @param removals A character string that contains the names of columns that #' should be removed. These values are not determined until [recipes::prep()] #' is called. @@ -40,6 +42,9 @@ #' @examples #' library(recipes) #' +#' rec <- recipe(mpg ~ ., data = mtcars) |> +#' step_predictor_best() +#' #' prepped <- prep(rec) #' #' bake(prepped, mtcars) @@ -53,6 +58,7 @@ step_predictor_best <- function( trained = FALSE, prop_terms = 0.5, update_prop = TRUE, + results = NULL, removals = NULL, skip = FALSE, id = rand_id("predictor_best") @@ -66,6 +72,7 @@ step_predictor_best <- function( trained = trained, prop_terms = prop_terms, update_prop = update_prop, + results = results, removals = removals, skip = skip, id = id, @@ -115,11 +122,14 @@ prep.step_predictor_best <- function(x, training, info = NULL, ...) { wts <- NULL } - # if (length(col_names) > 1) { - # filter <- character(0) - # } else { - # filter <- character(0) - # } + if (length(col_names) > 1) { + filter <- character(0) + } else { + filter <- character(0) + } + + # TODO: generate these + score_objs <- NULL step_predictor_best_new( terms = x$terms, diff --git a/man/step_predictor_best.Rd b/man/step_predictor_best.Rd index 4e526da..b783569 100644 --- a/man/step_predictor_best.Rd +++ b/man/step_predictor_best.Rd @@ -7,9 +7,12 @@ step_predictor_best( recipe, ..., + score, role = NA, trained = FALSE, - threshold = 0.9, + prop_terms = 0.5, + update_prop = TRUE, + results = NULL, removals = NULL, skip = FALSE, id = rand_id("predictor_best") @@ -22,12 +25,22 @@ operations for this recipe.} \item{...}{One or more selector functions to choose variables for this step. See \code{\link[recipes:selections]{selections()}} for more details.} +\item{score}{...} + \item{role}{Not used by this step since no new variables are created.} \item{trained}{A logical to indicate if the quantities for preprocessing have been estimated.} -\item{threshold}{...} +\item{prop_terms}{The proportion of predictors that should be retained when +ordered by overall desirability. A value of \code{\link[hardhat:tune]{hardhat::tune()}} can also be +used.} + +\item{update_prop}{A logical: should \code{prop_terms} be updated so that at least +one predictor will be retained?} + +\item{results}{Fitted filtro objects. These values are not determined until +\code{\link[recipes:prep]{recipes::prep()}} is called.} \item{removals}{A character string that contains the names of columns that should be removed. These values are not determined until \code{\link[recipes:prep]{recipes::prep()}} @@ -68,20 +81,11 @@ returned with columns \code{terms} and \code{id}: } } -\section{Tuning Parameters}{ -This step has 1 tuning parameters: -\itemize{ -\item \code{threshold}: Threshold (type: double, default: 0.9) -} - -The underlying operation does not allow for case weights. -} - \examples{ library(recipes) rec <- recipe(mpg ~ ., data = mtcars) |> - step_predictor_best(all_predictors()) + step_predictor_best() prepped <- prep(rec) From d531ffcd22442aacb315cdb3247c685ed35f0630 Mon Sep 17 00:00:00 2001 From: Frances Lin <37535633+franceslinyc@users.noreply.github.com> Date: Mon, 25 Aug 2025 16:08:46 -0700 Subject: [PATCH 29/65] Prototype code; Have not been ran nor tested --- R/step-predictor-best.R | 68 ++++++++++++++++++++--- tests/testthat/test-step-predictor-best.R | 3 + 2 files changed, 64 insertions(+), 7 deletions(-) diff --git a/R/step-predictor-best.R b/R/step-predictor-best.R index 5538788..508505f 100644 --- a/R/step-predictor-best.R +++ b/R/step-predictor-best.R @@ -67,7 +67,7 @@ step_predictor_best <- function( recipe, step_predictor_best_new( terms = enquos(...), - score = rlang::enexpr(score), + score = rlang::enexpr(score), # Or score = score? role = role, trained = trained, prop_terms = prop_terms, @@ -114,7 +114,17 @@ step_predictor_best_new <- #' @export prep.step_predictor_best <- function(x, training, info = NULL, ...) { col_names <- recipes_eval_select(x$terms, training, info) - check_type(training[, col_names], types = c("double", "integer")) + check_type(training[, col_names], types = c("double", "integer", "factor")) + check_number_decimal( + x$prop_terms, + min = .Machine$double.eps, + max = 1, + arg = "prop_terms" + ) + + if (x$update_prop) { + x$prop_terms <- update_prop(length(col_names), x$prop_terms) + } wts <- get_case_weights(info, training) were_weights_used <- are_weights_used(wts, unsupervised = TRUE) @@ -122,15 +132,19 @@ prep.step_predictor_best <- function(x, training, info = NULL, ...) { wts <- NULL } + outcome_name <- pull_outcome_column_name(info) + if (length(col_names) > 1) { - filter <- character(0) + filter <- calculate_predictor_best( + score = x$score, + prop_terms = x$prop_terms, + outcome = outcome_name, + data = training[, c(outcome_name, col_names)] + ) } else { filter <- character(0) } - # TODO: generate these - score_objs <- NULL - step_predictor_best_new( terms = x$terms, score = x$score, @@ -139,13 +153,53 @@ prep.step_predictor_best <- function(x, training, info = NULL, ...) { results = score_objs, prop_terms = x$prop_terms, update_prop = x$update_prop, - removals = filter, + removals = removals, skip = x$skip, id = x$id, case_weights = were_weights_used ) } +calculate_predictor_best <- function( + score, + outcome = character(0), + data, + opts = list() +) { + score_function <- paste0("score_", score) + + opts <- make_opt_list(opts, score) + + fm <- stats::as.formula(paste(outcome, "~ .")) + + score_res <- compute_score( + score_function, + args = opts, + form = fm, + data = training[c(outcome_name, col_names)], + weights = wts + ) + + # ------------------------------------------------------------------------------ + # Fill in missings + # The current filtro::fill_safe_value() only applies to class_score, not df. + + # score_df <- # save for tidy method + # score_res |> + # filtro::fill_safe_value(return_results = TRUE) + + # ------------------------------------------------------------------------------ + # filter predictors + final_res <- score_df |> + dplyr::slice_max(score, prop = x$prop_terms, with_ties = TRUE) |> + dplyr::pull("predictor") + + if (length(final_res) == 0) { + # final_res <- fallback_pred() + } + final_res +} + #' @export bake.step_predictor_best <- function(object, new_data, ...) { new_data <- recipes_remove_cols(new_data, object) diff --git a/tests/testthat/test-step-predictor-best.R b/tests/testthat/test-step-predictor-best.R index eeb5889..5ebb876 100644 --- a/tests/testthat/test-step-predictor-best.R +++ b/tests/testthat/test-step-predictor-best.R @@ -3,6 +3,9 @@ skip() rec <- recipe(mpg ~ ., data = mtcars) |> step_predictor_best(all_predictors(), score = cor_pearson, prop_terms = 1 / 2) +prepped <- prep(rec) +res_bake <- bake(prepped, mtcars) +res_tidy <- tidy(prepped, 1) # Infrastructure --------------------------------------------------------------- test_that("bake method errors when needed non-standard role columns are missing", { From d4c0bd5fa076d2f725184e3dfe637797fbcf795f Mon Sep 17 00:00:00 2001 From: Frances Lin <37535633+franceslinyc@users.noreply.github.com> Date: Tue, 26 Aug 2025 10:52:54 -0700 Subject: [PATCH 30/65] Can't find method for `fill_safe_value(S3)`. --- R/step-predictor-best.R | 22 +++++++++------------- 1 file changed, 9 insertions(+), 13 deletions(-) diff --git a/R/step-predictor-best.R b/R/step-predictor-best.R index 508505f..13dea6b 100644 --- a/R/step-predictor-best.R +++ b/R/step-predictor-best.R @@ -42,9 +42,6 @@ #' @examples #' library(recipes) #' -#' rec <- recipe(mpg ~ ., data = mtcars) |> -#' step_predictor_best() -#' #' prepped <- prep(rec) #' #' bake(prepped, mtcars) @@ -162,31 +159,30 @@ prep.step_predictor_best <- function(x, training, info = NULL, ...) { calculate_predictor_best <- function( score, + prop_terms, outcome = character(0), - data, - opts = list() + data ) { score_function <- paste0("score_", score) - opts <- make_opt_list(opts, score) - fm <- stats::as.formula(paste(outcome, "~ .")) score_res <- compute_score( score_function, - args = opts, + args = list(), form = fm, - data = training[c(outcome_name, col_names)], - weights = wts + data = training[c(outcome_name, col_names)] + #weights = wts ) # ------------------------------------------------------------------------------ # Fill in missings + # The current filtro::fill_safe_value() only applies to class_score, not df. - # score_df <- # save for tidy method - # score_res |> - # filtro::fill_safe_value(return_results = TRUE) + score_df <- # save for tidy method + score_res |> + filtro::fill_safe_value(return_results = TRUE) # ------------------------------------------------------------------------------ # filter predictors From c8d808ed12a567253b9f94af610b5ebfe5da42de Mon Sep 17 00:00:00 2001 From: Frances Lin <37535633+franceslinyc@users.noreply.github.com> Date: Tue, 26 Aug 2025 15:11:46 -0700 Subject: [PATCH 31/65] Remove #' @examples for now --- R/step-predictor-best.R | 13 ++----------- man/step_predictor_best.Rd | 12 ------------ tests/testthat/test-step-predictor-best.R | 2 +- 3 files changed, 3 insertions(+), 24 deletions(-) diff --git a/R/step-predictor-best.R b/R/step-predictor-best.R index 13dea6b..f62381e 100644 --- a/R/step-predictor-best.R +++ b/R/step-predictor-best.R @@ -38,15 +38,6 @@ #' \item{terms}{character, the selectors or variables selected to be removed} #' \item{id}{character, id of this step} #' } -#' -#' @examples -#' library(recipes) -#' -#' prepped <- prep(rec) -#' -#' bake(prepped, mtcars) -#' -#' tidy(prepped, 1) step_predictor_best <- function( recipe, ..., @@ -171,14 +162,14 @@ calculate_predictor_best <- function( score_function, args = list(), form = fm, - data = training[c(outcome_name, col_names)] + data = training[c(outcome_name, col_names)] #, #weights = wts ) # ------------------------------------------------------------------------------ # Fill in missings - # The current filtro::fill_safe_value() only applies to class_score, not df. + # The current filtro::fill_safe_value() only applies to class_score, not df nor tibble. score_df <- # save for tidy method score_res |> diff --git a/man/step_predictor_best.Rd b/man/step_predictor_best.Rd index b783569..d7b7a6b 100644 --- a/man/step_predictor_best.Rd +++ b/man/step_predictor_best.Rd @@ -81,15 +81,3 @@ returned with columns \code{terms} and \code{id}: } } -\examples{ -library(recipes) - -rec <- recipe(mpg ~ ., data = mtcars) |> - step_predictor_best() - -prepped <- prep(rec) - -bake(prepped, mtcars) - -tidy(prepped, 1) -} diff --git a/tests/testthat/test-step-predictor-best.R b/tests/testthat/test-step-predictor-best.R index 5ebb876..1720659 100644 --- a/tests/testthat/test-step-predictor-best.R +++ b/tests/testthat/test-step-predictor-best.R @@ -1,5 +1,5 @@ skip() - +library(filtro) rec <- recipe(mpg ~ ., data = mtcars) |> step_predictor_best(all_predictors(), score = cor_pearson, prop_terms = 1 / 2) From f3d38fb3299f5eff3b26107227a0f16c8fc401a4 Mon Sep 17 00:00:00 2001 From: Frances Lin <37535633+franceslinyc@users.noreply.github.com> Date: Wed, 27 Aug 2025 15:33:49 -0700 Subject: [PATCH 32/65] Working; Still need to test/verify --- R/step-predictor-best.R | 16 ++++++++-------- tests/testthat/test-step-predictor-best.R | 7 ++++++- 2 files changed, 14 insertions(+), 9 deletions(-) diff --git a/R/step-predictor-best.R b/R/step-predictor-best.R index f62381e..dcc6701 100644 --- a/R/step-predictor-best.R +++ b/R/step-predictor-best.R @@ -46,7 +46,7 @@ step_predictor_best <- function( trained = FALSE, prop_terms = 0.5, update_prop = TRUE, - results = NULL, + #results = NULL, removals = NULL, skip = FALSE, id = rand_id("predictor_best") @@ -60,7 +60,7 @@ step_predictor_best <- function( trained = trained, prop_terms = prop_terms, update_prop = update_prop, - results = results, + #results = results, removals = removals, skip = skip, id = id, @@ -77,7 +77,7 @@ step_predictor_best_new <- trained, prop_terms, update_prop = update_prop, - results, + #results, removals, skip, id, @@ -91,7 +91,7 @@ step_predictor_best_new <- trained = trained, prop_terms = prop_terms, update_prop = update_prop, - results = results, + #results = results, removals = removals, skip = skip, id = id, @@ -138,10 +138,10 @@ prep.step_predictor_best <- function(x, training, info = NULL, ...) { score = x$score, role = x$role, trained = TRUE, - results = score_objs, + #results = score_objs, prop_terms = x$prop_terms, update_prop = x$update_prop, - removals = removals, + removals = filter, skip = x$skip, id = x$id, case_weights = were_weights_used @@ -162,7 +162,7 @@ calculate_predictor_best <- function( score_function, args = list(), form = fm, - data = training[c(outcome_name, col_names)] #, + data = data #, #weights = wts ) @@ -178,7 +178,7 @@ calculate_predictor_best <- function( # ------------------------------------------------------------------------------ # filter predictors final_res <- score_df |> - dplyr::slice_max(score, prop = x$prop_terms, with_ties = TRUE) |> + dplyr::slice_max(score, prop = prop_terms, with_ties = TRUE) |> dplyr::pull("predictor") if (length(final_res) == 0) { diff --git a/tests/testthat/test-step-predictor-best.R b/tests/testthat/test-step-predictor-best.R index 1720659..24148fc 100644 --- a/tests/testthat/test-step-predictor-best.R +++ b/tests/testthat/test-step-predictor-best.R @@ -1,7 +1,12 @@ skip() + library(filtro) rec <- recipe(mpg ~ ., data = mtcars) |> - step_predictor_best(all_predictors(), score = cor_pearson, prop_terms = 1 / 2) + step_predictor_best( + all_predictors(), + score = "cor_pearson", + prop_terms = 1 / 2 + ) prepped <- prep(rec) res_bake <- bake(prepped, mtcars) From b7ed1192cd30512c672f56894a00c8b348a4cc1f Mon Sep 17 00:00:00 2001 From: Frances Lin <37535633+franceslinyc@users.noreply.github.com> Date: Wed, 27 Aug 2025 15:43:33 -0700 Subject: [PATCH 33/65] Add test; Need more tests --- man/step_predictor_best.Rd | 7 ++-- tests/testthat/test-step-predictor-best.R | 41 +++++++++++++++++------ 2 files changed, 34 insertions(+), 14 deletions(-) diff --git a/man/step_predictor_best.Rd b/man/step_predictor_best.Rd index d7b7a6b..33377ba 100644 --- a/man/step_predictor_best.Rd +++ b/man/step_predictor_best.Rd @@ -12,7 +12,6 @@ step_predictor_best( trained = FALSE, prop_terms = 0.5, update_prop = TRUE, - results = NULL, removals = NULL, skip = FALSE, id = rand_id("predictor_best") @@ -39,9 +38,6 @@ used.} \item{update_prop}{A logical: should \code{prop_terms} be updated so that at least one predictor will be retained?} -\item{results}{Fitted filtro objects. These values are not determined until -\code{\link[recipes:prep]{recipes::prep()}} is called.} - \item{removals}{A character string that contains the names of columns that should be removed. These values are not determined until \code{\link[recipes:prep]{recipes::prep()}} is called.} @@ -53,6 +49,9 @@ outcome variable(s)). Care should be taken when using \code{skip = TRUE} as it may affect the computations for subsequent operations.} \item{id}{A character string that is unique to this step to identify it.} + +\item{results}{Fitted filtro objects. These values are not determined until +\code{\link[recipes:prep]{recipes::prep()}} is called.} } \value{ An updated version of \code{recipe} with the new step added to the diff --git a/tests/testthat/test-step-predictor-best.R b/tests/testthat/test-step-predictor-best.R index 24148fc..67fb5da 100644 --- a/tests/testthat/test-step-predictor-best.R +++ b/tests/testthat/test-step-predictor-best.R @@ -1,16 +1,37 @@ -skip() +test_that("step works", { + rec <- recipe(mpg ~ ., data = mtcars) |> + step_predictor_best( + all_predictors(), + score = "cor_pearson", + prop_terms = 1 / 2 + ) + + prepped <- prep(rec) + res_bake <- bake(prepped, mtcars) + res_tidy <- tidy(prepped, 1) + + cor_pearson_res <- filtro::score_cor_pearson |> + filtro::fit(mpg ~ ., data = mtcars) + + cor_pearson_res <- cor_pearson_res |> filtro::fill_safe_value() + exp <- cor_pearson_res@results |> + dplyr::slice_max(score, prop = 1 / 2, with_ties = TRUE) |> + dplyr::pull("predictor") + + expect_identical( + sort(setdiff(names(mtcars), names(res_bake))), + sort(exp) + ) -library(filtro) -rec <- recipe(mpg ~ ., data = mtcars) |> - step_predictor_best( - all_predictors(), - score = "cor_pearson", - prop_terms = 1 / 2 + expect_identical( + sort(res_tidy$terms), + sort(exp) ) +}) -prepped <- prep(rec) -res_bake <- bake(prepped, mtcars) -res_tidy <- tidy(prepped, 1) +# TODO Add more tests + +skip() # Infrastructure --------------------------------------------------------------- test_that("bake method errors when needed non-standard role columns are missing", { From d53b1188b44fc423116ec49188dd67f0df9aa29d Mon Sep 17 00:00:00 2001 From: topepo Date: Sat, 30 Aug 2025 18:38:35 -0400 Subject: [PATCH 34/65] version bumps --- DESCRIPTION | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index ed3b365..a36220a 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -21,9 +21,9 @@ Depends: recipes (>= 1.1.0) Imports: cli, - desirability2 (>= 0.1.0.9000), + desirability2 (>= 0.2.0), dplyr, - filtro, + filtro (>= 0.2.0), generics, ggplot2, hardhat (>= 1.4.1), @@ -46,9 +46,6 @@ Suggests: survival, testthat (>= 3.0.0), yardstick -Remotes: - tidymodels/desirability2, - tidymodels/filtro Config/Needs/website: tidyverse/tidytemplate, tidymodels Config/testthat/edition: 3 Config/usethis/last-upkeep: 2025-06-09 From 609e945f60dc10be7622c69fc4ff2fe39d360586 Mon Sep 17 00:00:00 2001 From: topepo Date: Sun, 31 Aug 2025 13:39:51 -0400 Subject: [PATCH 35/65] update docs --- R/step_predictor_desirability.R | 124 ++++++++++++++++++----------- man/step_predictor_desirability.Rd | 27 +++++-- 2 files changed, 96 insertions(+), 55 deletions(-) diff --git a/R/step_predictor_desirability.R b/R/step_predictor_desirability.R index b263ae7..0a2b95e 100644 --- a/R/step_predictor_desirability.R +++ b/R/step_predictor_desirability.R @@ -1,4 +1,4 @@ -#'Feature Selection +#' Multivariable Feature Selection #' #' `step_predictor_desirability()` creates a *specification* of a recipe step #' that uses one or more "score" functions to measure how how much each predictor @@ -60,7 +60,7 @@ #' #' ## Ties #' -#' Note that `dplyr::slice_max()` with the argument `with_ties = TRUE ` is used +#' Note that [dplyr::slice_max()] with the argument `with_ties = TRUE ` is used #' to select predictors. If there are many ties in overall desirability, the #' proportion selected can be larger than the value given to `prep_terms()`. #' @@ -78,9 +78,20 @@ #' #' For a trained recipe, the `tidy()` method will return a tibble with columns #' `terms` (the predictor names), `id`, columns for the estimated scores, and -#' the desirability results. The desirability columns will have the same name -#' as the scores with an additional prefix of `.d_`. The overall desirability -#' column is called `.d_overall`. +#' the desirability results. +#' +#' There are two versions of the score results. The columns prefixed with +#' `"score_1"` have been altered with their transformation (see the Details page +#' for each score) and have had missing values filled with "safe" values to +#' prevent them from being missing. The other set of scores lack the prefix and +#' are the original, raw score values. +#' +#' The desirability columns will have the same name as the scores with an +#' additional prefix of `.d_`. The overall desirability column is called +#' `.d_overall`. +#' +#' There is an additional local column called `retain` that notes whether the +#' predictor passed the filter and is retained after this step is executed. #' #' @return An updated version of `recipe` with the new step added to the #' sequence of any existing operations. When you @@ -111,13 +122,13 @@ #' # important. #' #' # The score_* objects here are from the filtro package. See Details above. -#' goals <- +#' goals <- #' desirability( #' maximize(score_xtab_pval_fisher), #' maximize(score_aov_pval) -#' ) +#' ) #' -#' example_data <- modeldata::ad_data +#' example_data <- modeldata::ad_data #' rec <- #' recipe(Class ~ ., data = example_data) |> #' step_predictor_desirability( @@ -196,12 +207,12 @@ step_predictor_desirability <- function( skip = FALSE, id = rand_id("predictor_desirability") ) { - if (!inherits(score, "desirability2::desirability_set")) { - cli::cli_abort( - "Please use the {.fn desirability} function in the {.pkg desirability2} + if (!inherits(score, "desirability2::desirability_set")) { + cli::cli_abort( + "Please use the {.fn desirability} function in the {.pkg desirability2} package to create an object to pass to {.arg score}." - ) - } + ) + } add_step( recipe, step_predictor_desirability_new( @@ -252,31 +263,36 @@ step_predictor_desirability_new <- #' @export prep.step_predictor_desirability <- function(x, training, info = NULL, ...) { - rlang::check_installed("desirability2") + rlang::check_installed("desirability2") col_names <- recipes_eval_select(x$terms, training, info) check_type(training[, col_names], types = c("double", "integer", "factor")) - check_number_decimal(x$prop_terms, min = .Machine$double.eps, max = 1, arg = "prop_terms") + check_number_decimal( + x$prop_terms, + min = .Machine$double.eps, + max = 1, + arg = "prop_terms" + ) if (length(col_names) < 2) { - res <- - step_predictor_desirability_new( - terms = x$terms, - score = score, - role = x$role, - trained = TRUE, - prop_terms = x$prop_terms, - update_prop = x$update_prop, - results = NULL, - removals = character(0), - skip = x$skip, - id = x$id, - case_weights = were_weights_used - ) - return(res) + res <- + step_predictor_desirability_new( + terms = x$terms, + score = score, + role = x$role, + trained = TRUE, + prop_terms = x$prop_terms, + update_prop = x$update_prop, + results = NULL, + removals = character(0), + skip = x$skip, + id = x$id, + case_weights = were_weights_used + ) + return(res) } if (x$update_prop) { - x$prop_terms <- update_prop(length(col_names), x$prop_terms) + x$prop_terms <- update_prop(length(col_names), x$prop_terms) } # First we check the _type_ of weight to see if it is used. Later, in @@ -293,33 +309,43 @@ prep.step_predictor_desirability <- function(x, training, info = NULL, ...) { fm <- stats::as.formula(fm) score_objs <- - purrr::map( - score_names, - ~ compute_score(.x, list(), fm, training[ c(outcome_name, col_names)], wts) - ) |> - filtro::fill_safe_values() # and then transform? + purrr::map( + score_names, + ~ compute_score(.x, list(), fm, training[c(outcome_name, col_names)], wts) + ) + raw_scores <- filtro::bind_scores(score_objs) + + score_objs <- + score_objs |> + filtro::fill_safe_values() # <- transforms too # The score names include "score_" but the column names don't rm_vec <- gsub("^score_", "", score_names) names(rm_vec) <- score_names score_objs <- dplyr::rename(score_objs, rm_vec) - # make desirability expression/eval quosre - score_objs <- desirability2::make_desirability_cols(x$score, score_objs) + # make desirability expression/eval quosure + score_df <- desirability2::make_desirability_cols(x$score, score_objs) keep_list <- - score_objs |> - dplyr::slice_max(.d_overall, prop = x$prop_terms, with_ties = TRUE) + score_df |> + dplyr::slice_max(.d_overall, prop = x$prop_terms, with_ties = TRUE) rm_list <- - dplyr::anti_join(score_objs, keep_list[, "predictor"], by = "predictor") |> - purrr::pluck("predictor") + dplyr::anti_join(score_df, keep_list[, "predictor"], by = "predictor") |> + purrr::pluck("predictor") + + score_df$retain <- score_df$predictor %in% rm_list + + score_df <- score_df |> + dplyr::full_join(raw_scores, by = c("outcome", "predictor")) |> + dplyr::relocate(retain, .after = "predictor") step_predictor_desirability_new( terms = x$terms, score = x$score, role = x$role, trained = TRUE, - results = score_objs, + results = score_df, prop_terms = x$prop_terms, update_prop = x$update_prop, removals = rm_list, @@ -336,7 +362,11 @@ bake.step_predictor_desirability <- function(object, new_data, ...) { } #' @export -print.step_predictor_desirability <- function(x, width = max(20, options()$width - 36), ...) { +print.step_predictor_desirability <- function( + x, + width = max(20, options()$width - 36), + ... +) { title <- "Feature selection via desirability functions on" print_step( x$removals, @@ -354,8 +384,8 @@ print.step_predictor_desirability <- function(x, width = max(20, options()$width tidy.step_predictor_desirability <- function(x, ...) { if (is_trained(x)) { res <- - x$results |> - dplyr::select(-outcome, terms = predictor) + x$results |> + dplyr::select(-outcome, terms = predictor) res$retained <- !(res$terms %in% x$removals) } else { term_names <- sel2char(x$terms) @@ -380,5 +410,5 @@ tunable.step_predictor_desirability <- function(x, ...) { #' @export required_pkgs.step_predictor_desirability <- function(x, ...) { - c("important", "filtro", "desirability2") + c("important", "filtro", "desirability2") } diff --git a/man/step_predictor_desirability.Rd b/man/step_predictor_desirability.Rd index 7845b9e..af26fea 100644 --- a/man/step_predictor_desirability.Rd +++ b/man/step_predictor_desirability.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/step_predictor_desirability.R \name{step_predictor_desirability} \alias{step_predictor_desirability} -\title{Feature Selection} +\title{Multivariable Feature Selection} \usage{ step_predictor_desirability( recipe, @@ -109,7 +109,7 @@ specifically referenced by name. To avoid this, see the advice in the \emph{Tips for saving recipes and filtering columns} section of \link[recipes:selections]{recipes::selections}. \subsection{Ties}{ -Note that \code{dplyr::slice_max()} with the argument \code{with_ties = TRUE } is used +Note that \code{\link[dplyr:slice]{dplyr::slice_max()}} with the argument \code{with_ties = TRUE } is used to select predictors. If there are many ties in overall desirability, the proportion selected can be larger than the value given to \code{prep_terms()}. } @@ -129,9 +129,20 @@ other role). A full example is below. For a trained recipe, the \code{tidy()} method will return a tibble with columns \code{terms} (the predictor names), \code{id}, columns for the estimated scores, and -the desirability results. The desirability columns will have the same name -as the scores with an additional prefix of \code{.d_}. The overall desirability -column is called \code{.d_overall}. +the desirability results. + +There are two versions of the score results. The columns prefixed with +\code{"score_1"} have been altered with their transformation (see the Details page +for each score) and have had missing values filled with "safe" values to +prevent them from being missing. The other set of scores lack the prefix and +are the original, raw score values. + +The desirability columns will have the same name as the scores with an +additional prefix of \code{.d_}. The overall desirability column is called +\code{.d_overall}. + +There is an additional local column called \code{retain} that notes whether the +predictor passed the filter and is retained after this step is executed. } } \examples{ @@ -149,13 +160,13 @@ if (rlang::is_installed("modeldata")) { # important. # The score_* objects here are from the filtro package. See Details above. - goals <- + goals <- desirability( maximize(score_xtab_pval_fisher), maximize(score_aov_pval) - ) + ) - example_data <- modeldata::ad_data + example_data <- modeldata::ad_data rec <- recipe(Class ~ ., data = example_data) |> step_predictor_desirability( From ccd256704266578790687cc829a4220c24f67749 Mon Sep 17 00:00:00 2001 From: topepo Date: Sun, 31 Aug 2025 13:53:18 -0400 Subject: [PATCH 36/65] standardize some documentation --- R/step-predictor-best.R | 23 ++++++++--------------- R/step-predictor-retain.R | 16 ++++++++-------- R/step_predictor_desirability.R | 2 +- man/step_predictor_best.Rd | 16 ++++++---------- man/step_predictor_desirability.Rd | 2 +- man/step_predictor_retain.Rd | 14 +++++++------- 6 files changed, 31 insertions(+), 42 deletions(-) diff --git a/R/step-predictor-best.R b/R/step-predictor-best.R index dcc6701..f17ed0c 100644 --- a/R/step-predictor-best.R +++ b/R/step-predictor-best.R @@ -1,22 +1,15 @@ -#' Feature Selection +#' Supervised Feature Selection via Choosing the Top Predictors #' #' `step_predictor_best()` creates a *specification* of a recipe step that will #' perform feature selection by ... #' -#' @inheritParams recipes::step_center -#' @param score ... -#' @param prop_terms The proportion of predictors that should be retained when -#' ordered by overall desirability. A value of [hardhat::tune()] can also be -#' used. -#' @param update_prop A logical: should `prop_terms` be updated so that at least -#' one predictor will be retained? -#' @param results Fitted filtro objects. These values are not determined until -#' [recipes::prep()] is called. -#' @param removals A character string that contains the names of columns that -#' should be removed. These values are not determined until [recipes::prep()] -#' is called. -#' @return An updated version of `recipe` with the new step added to the -#' sequence of any existing operations. +#' @inheritParams step_predictor_desirability +#' +#' @param score The name of a single score function from the \pkg{filtro} +#' package, such as [filtro::score_imp_rf()], [filtro:: score_roc_auc()], etc. +#' See the Details and Examples sections below. This argument *should be named* +#' when used. +#' #' @export #' #' @details diff --git a/R/step-predictor-retain.R b/R/step-predictor-retain.R index 7c35988..83f4e1f 100644 --- a/R/step-predictor-retain.R +++ b/R/step-predictor-retain.R @@ -1,15 +1,15 @@ -#' Feature Selection +#' Supervised Feature Selection via A Single Filter #' #' `step_predictor_retain()` creates a *specification* of a recipe step that will #' perform feature selection by ... #' -#' @inheritParams recipes::step_center -#' @param score ... -#' @param removals A character string that contains the names of columns that -#' should be removed. These values are not determined until [recipes::prep()] -#' is called. -#' @return An updated version of `recipe` with the new step added to the -#' sequence of any existing operations. +#' @inheritParams step_predictor_desirability +#' @param score A valid R expression that produces a logical result. The +#' equation can contain the names of one or more score functions from the +#' \pkg{filtro} package, such as [filtro::score_imp_rf()], +#' [filtro:: score_roc_auc()]. See the Details and Examples sections below. +#' This argument *should be named* when used. +#' #' @export #' #' @details diff --git a/R/step_predictor_desirability.R b/R/step_predictor_desirability.R index 0a2b95e..c941d46 100644 --- a/R/step_predictor_desirability.R +++ b/R/step_predictor_desirability.R @@ -1,4 +1,4 @@ -#' Multivariable Feature Selection +#' Supervised Multivariable Feature Selection via Desirability Functions #' #' `step_predictor_desirability()` creates a *specification* of a recipe step #' that uses one or more "score" functions to measure how how much each predictor diff --git a/man/step_predictor_best.Rd b/man/step_predictor_best.Rd index 33377ba..870a723 100644 --- a/man/step_predictor_best.Rd +++ b/man/step_predictor_best.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/step-predictor-best.R \name{step_predictor_best} \alias{step_predictor_best} -\title{Feature Selection} +\title{Supervised Feature Selection via Choosing the Top Predictors} \usage{ step_predictor_best( recipe, @@ -24,7 +24,10 @@ operations for this recipe.} \item{...}{One or more selector functions to choose variables for this step. See \code{\link[recipes:selections]{selections()}} for more details.} -\item{score}{...} +\item{score}{The name of a single score function from the \pkg{filtro} +package, such as \code{\link[filtro:score_imp_rf]{filtro::score_imp_rf()}}, \code{\link[filtro:score_roc_auc]{filtro:: score_roc_auc()}}, etc. +See the Details and Examples sections below. This argument \emph{should be named} +when used.} \item{role}{Not used by this step since no new variables are created.} @@ -38,7 +41,7 @@ used.} \item{update_prop}{A logical: should \code{prop_terms} be updated so that at least one predictor will be retained?} -\item{removals}{A character string that contains the names of columns that +\item{removals}{A character string that contains the names of predictors that should be removed. These values are not determined until \code{\link[recipes:prep]{recipes::prep()}} is called.} @@ -49,13 +52,6 @@ outcome variable(s)). Care should be taken when using \code{skip = TRUE} as it may affect the computations for subsequent operations.} \item{id}{A character string that is unique to this step to identify it.} - -\item{results}{Fitted filtro objects. These values are not determined until -\code{\link[recipes:prep]{recipes::prep()}} is called.} -} -\value{ -An updated version of \code{recipe} with the new step added to the -sequence of any existing operations. } \description{ \code{step_predictor_best()} creates a \emph{specification} of a recipe step that will diff --git a/man/step_predictor_desirability.Rd b/man/step_predictor_desirability.Rd index af26fea..28212d9 100644 --- a/man/step_predictor_desirability.Rd +++ b/man/step_predictor_desirability.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/step_predictor_desirability.R \name{step_predictor_desirability} \alias{step_predictor_desirability} -\title{Multivariable Feature Selection} +\title{Supervised Multivariable Feature Selection via Desirability Functions} \usage{ step_predictor_desirability( recipe, diff --git a/man/step_predictor_retain.Rd b/man/step_predictor_retain.Rd index 9f74e90..d0805c0 100644 --- a/man/step_predictor_retain.Rd +++ b/man/step_predictor_retain.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/step-predictor-retain.R \name{step_predictor_retain} \alias{step_predictor_retain} -\title{Feature Selection} +\title{Supervised Feature Selection via A Single Filter} \usage{ step_predictor_retain( recipe, @@ -22,14 +22,18 @@ operations for this recipe.} \item{...}{One or more selector functions to choose variables for this step. See \code{\link[recipes:selections]{selections()}} for more details.} -\item{score}{...} +\item{score}{A valid R expression that produces a logical result. The +equation can contain the names of one or more score functions from the +\pkg{filtro} package, such as \code{\link[filtro:score_imp_rf]{filtro::score_imp_rf()}}, +\code{\link[filtro:score_roc_auc]{filtro:: score_roc_auc()}}. See the Details and Examples sections below. +This argument \emph{should be named} when used.} \item{role}{Not used by this step since no new variables are created.} \item{trained}{A logical to indicate if the quantities for preprocessing have been estimated.} -\item{removals}{A character string that contains the names of columns that +\item{removals}{A character string that contains the names of predictors that should be removed. These values are not determined until \code{\link[recipes:prep]{recipes::prep()}} is called.} @@ -41,10 +45,6 @@ may affect the computations for subsequent operations.} \item{id}{A character string that is unique to this step to identify it.} } -\value{ -An updated version of \code{recipe} with the new step added to the -sequence of any existing operations. -} \description{ \code{step_predictor_retain()} creates a \emph{specification} of a recipe step that will perform feature selection by ... From c67615b665574436998f2431eb16fa9210c808b3 Mon Sep 17 00:00:00 2001 From: topepo Date: Tue, 2 Sep 2025 10:49:36 -0400 Subject: [PATCH 37/65] importFrom("S7", "@") to avoid errors --- NAMESPACE | 1 + R/important-package.R | 5 +++++ 2 files changed, 6 insertions(+) diff --git a/NAMESPACE b/NAMESPACE index 2d8aae2..ef0fab7 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -23,6 +23,7 @@ export(required_pkgs) export(step_predictor_best) export(step_predictor_desirability) export(step_predictor_retain) +if (getRversion() < "4.3.0") importFrom("S7", "@") import(recipes) import(rlang) importFrom(generics,augment) diff --git a/R/important-package.R b/R/important-package.R index f35b04b..41c7546 100644 --- a/R/important-package.R +++ b/R/important-package.R @@ -36,5 +36,10 @@ utils::globalVariables( "score_objs" ) ) + +# enable usage of @name in package code +#' @rawNamespace if (getRversion() < "4.3.0") importFrom("S7", "@") +NULL + ## usethis namespace: end NULL From faaa7f6b3750ced82855d003571679b55823d04a Mon Sep 17 00:00:00 2001 From: topepo Date: Tue, 2 Sep 2025 10:54:58 -0400 Subject: [PATCH 38/65] formally import S7 --- DESCRIPTION | 1 + 1 file changed, 1 insertion(+) diff --git a/DESCRIPTION b/DESCRIPTION index a36220a..3a5472a 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -29,6 +29,7 @@ Imports: hardhat (>= 1.4.1), purrr, rlang (>= 1.1.0), + S7, tibble, tidyr, tune, From c1b8b7e16ea51ad948ee2bf774821fb6a8e6a01d Mon Sep 17 00:00:00 2001 From: topepo Date: Tue, 2 Sep 2025 13:44:52 -0400 Subject: [PATCH 39/65] update dials info --- R/step-predictor-best.R | 2 +- R/step_predictor_desirability.R | 4 ++-- man/step_predictor_desirability.Rd | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/R/step-predictor-best.R b/R/step-predictor-best.R index f17ed0c..6221c66 100644 --- a/R/step-predictor-best.R +++ b/R/step-predictor-best.R @@ -222,7 +222,7 @@ tunable.step_predictor_best <- function(x, ...) { tibble::tibble( name = "prop_terms", call_info = list( - list(pkg = "dials", fun = "threshold") + list(pkg = "dials", fun = "prop_terms") ), source = "recipe", component = "step_predictor_best", diff --git a/R/step_predictor_desirability.R b/R/step_predictor_desirability.R index c941d46..10628c0 100644 --- a/R/step_predictor_desirability.R +++ b/R/step_predictor_desirability.R @@ -186,7 +186,7 @@ #' library(dplyr) #' library(ggplot2) #' -#' # The selection did not substantually change with these case weights +#' # The selection did not substantially change with these case weights #' full_join(predictor_scores, predictor_scores_wts, by = "terms") |> #' ggplot(aes(.d_overall, .d_overall_weighted)) + #' geom_abline(col = "darkgreen", lty = 2) + @@ -400,7 +400,7 @@ tunable.step_predictor_desirability <- function(x, ...) { tibble::tibble( name = "prop_terms", call_info = list( - list(pkg = "dials", fun = "threshold") + list(pkg = "dials", fun = "prop_terms") ), source = "recipe", component = "step_predictor_desirability", diff --git a/man/step_predictor_desirability.Rd b/man/step_predictor_desirability.Rd index 28212d9..5af06ad 100644 --- a/man/step_predictor_desirability.Rd +++ b/man/step_predictor_desirability.Rd @@ -224,7 +224,7 @@ if (rlang::is_installed("modeldata")) { library(dplyr) library(ggplot2) - # The selection did not substantually change with these case weights + # The selection did not substantially change with these case weights full_join(predictor_scores, predictor_scores_wts, by = "terms") |> ggplot(aes(.d_overall, .d_overall_weighted)) + geom_abline(col = "darkgreen", lty = 2) + From 950d87958a821db810e674f0fc4cc96d228ab8a2 Mon Sep 17 00:00:00 2001 From: topepo Date: Tue, 2 Sep 2025 13:54:17 -0400 Subject: [PATCH 40/65] update required_pkgs --- R/step-predictor-best.R | 3 ++- R/step-predictor-retain.R | 6 ++++++ R/step_predictor_desirability.R | 8 ++++++++ man/required_pkgs.important.Rd | 25 +++++++++++++++++++++++++ 4 files changed, 41 insertions(+), 1 deletion(-) create mode 100644 man/required_pkgs.important.Rd diff --git a/R/step-predictor-best.R b/R/step-predictor-best.R index 6221c66..b006948 100644 --- a/R/step-predictor-best.R +++ b/R/step-predictor-best.R @@ -230,7 +230,8 @@ tunable.step_predictor_best <- function(x, ...) { ) } +#' @rdname required_pkgs.important #' @export -required_pkgs.step_predictor_desirability <- function(x, ...) { +required_pkgs.step_predictor_best <- function(x, ...) { c("important", "filtro") } diff --git a/R/step-predictor-retain.R b/R/step-predictor-retain.R index 83f4e1f..2f5ba04 100644 --- a/R/step-predictor-retain.R +++ b/R/step-predictor-retain.R @@ -237,3 +237,9 @@ tidy.step_predictor_retain <- function(x, ...) { res$id <- x$id res } + +#' @rdname required_pkgs.important +#' @export +required_pkgs.step_predictor_desirability <- function(x, ...) { + c("important", "filtro") +} diff --git a/R/step_predictor_desirability.R b/R/step_predictor_desirability.R index 10628c0..a3552e1 100644 --- a/R/step_predictor_desirability.R +++ b/R/step_predictor_desirability.R @@ -408,6 +408,14 @@ tunable.step_predictor_desirability <- function(x, ...) { ) } +#' S3 methods for tracking which additional packages are needed for steps. +#' +#' Recipe-adjacent packages always list themselves as a required package so that +#' the steps can function properly within parallel processing schemes. +#' @param x A recipe step +#' @return A character vector +#' @name required_pkgs.important +#' @keywords internal #' @export required_pkgs.step_predictor_desirability <- function(x, ...) { c("important", "filtro", "desirability2") diff --git a/man/required_pkgs.important.Rd b/man/required_pkgs.important.Rd new file mode 100644 index 0000000..2282276 --- /dev/null +++ b/man/required_pkgs.important.Rd @@ -0,0 +1,25 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/step-predictor-best.R, +% R/step-predictor-retain.R, R/step_predictor_desirability.R +\name{required_pkgs.step_predictor_desirability} +\alias{required_pkgs.step_predictor_desirability} +\alias{required_pkgs.important} +\title{S3 methods for tracking which additional packages are needed for steps.} +\usage{ +\method{required_pkgs}{step_predictor_desirability}(x, ...) + +\method{required_pkgs}{step_predictor_desirability}(x, ...) + +\method{required_pkgs}{step_predictor_desirability}(x, ...) +} +\arguments{ +\item{x}{A recipe step} +} +\value{ +A character vector +} +\description{ +Recipe-adjacent packages always list themselves as a required package so that +the steps can function properly within parallel processing schemes. +} +\keyword{internal} From d3bd281a4277572fd08da43e38f6db1d5fd4b584 Mon Sep 17 00:00:00 2001 From: topepo Date: Wed, 3 Sep 2025 12:05:01 -0400 Subject: [PATCH 41/65] put score names in print method --- NAMESPACE | 2 ++ R/important-package.R | 4 +++- R/step-predictor-best.R | 5 ++++- R/step-predictor-retain.R | 6 +++++- R/step_predictor_desirability.R | 9 +++++++-- inst/WORDLIST | 1 + man/required_pkgs.important.Rd | 5 +++-- man/step_predictor_desirability.Rd | 2 +- tests/testthat/_snaps/step-predictor-retain.md | 8 ++++---- 9 files changed, 30 insertions(+), 12 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index ef0fab7..e8190a3 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -10,6 +10,7 @@ S3method(prep,step_predictor_retain) S3method(print,step_predictor_best) S3method(print,step_predictor_desirability) S3method(print,step_predictor_retain) +S3method(required_pkgs,step_predictor_best) S3method(required_pkgs,step_predictor_desirability) S3method(tidy,step_predictor_best) S3method(tidy,step_predictor_desirability) @@ -26,6 +27,7 @@ export(step_predictor_retain) if (getRversion() < "4.3.0") importFrom("S7", "@") import(recipes) import(rlang) +importFrom(S7,check_is_S7) importFrom(generics,augment) importFrom(generics,required_pkgs) importFrom(ggplot2,autoplot) diff --git a/R/important-package.R b/R/important-package.R index 41c7546..b82270d 100644 --- a/R/important-package.R +++ b/R/important-package.R @@ -6,6 +6,7 @@ #' @import rlang #' @importFrom stats sd predict as.formula #' @importFrom hardhat extract_fit_parsnip +#' @importFrom S7 check_is_S7 #' @importFrom ggplot2 autoplot #' @export @@ -33,7 +34,8 @@ utils::globalVariables( "score", ".d_overall", "outcome", - "score_objs" + "score_objs", + "retain" ) ) diff --git a/R/step-predictor-best.R b/R/step-predictor-best.R index b006948..bf88b00 100644 --- a/R/step-predictor-best.R +++ b/R/step-predictor-best.R @@ -192,7 +192,10 @@ print.step_predictor_best <- function( width = max(20, options()$width - 36), ... ) { - title <- "Feature selection on " + scores <- unique(x$score) + title <- cli::format_inline( + "Feature selection via {.code {scores}} on" + ) print_step( x$removals, x$terms, diff --git a/R/step-predictor-retain.R b/R/step-predictor-retain.R index 2f5ba04..fd4b583 100644 --- a/R/step-predictor-retain.R +++ b/R/step-predictor-retain.R @@ -213,7 +213,11 @@ print.step_predictor_retain <- function( width = max(20, options()$width - 36), ... ) { - title <- "Feature selection on " + scores <- unique(all.vars(x$score)) + + title <- cli::format_inline( + "Feature selection using {.and {.code {scores}}} on" + ) print_step( x$removals, x$terms, diff --git a/R/step_predictor_desirability.R b/R/step_predictor_desirability.R index a3552e1..3adea4f 100644 --- a/R/step_predictor_desirability.R +++ b/R/step_predictor_desirability.R @@ -1,4 +1,4 @@ -#' Supervised Multivariable Feature Selection via Desirability Functions +#' Supervised Multivariate Feature Selection via Desirability Functions #' #' `step_predictor_desirability()` creates a *specification* of a recipe step #' that uses one or more "score" functions to measure how how much each predictor @@ -367,7 +367,12 @@ print.step_predictor_desirability <- function( width = max(20, options()$width - 36), ... ) { - title <- "Feature selection via desirability functions on" + scores <- purrr::map_chr(x$score@variables, ~ gsub("score_", "", .x)) + scores <- unique(scores) + + title <- cli::format_inline( + "Feature selection via desirability functions ({.code {scores}}) on" + ) print_step( x$removals, x$terms, diff --git a/inst/WORDLIST b/inst/WORDLIST index 61a8479..0c320f8 100644 --- a/inst/WORDLIST +++ b/inst/WORDLIST @@ -5,6 +5,7 @@ Codecov ORCID PBC Suich +desirabilities doi funder importances diff --git a/man/required_pkgs.important.Rd b/man/required_pkgs.important.Rd index 2282276..63c8644 100644 --- a/man/required_pkgs.important.Rd +++ b/man/required_pkgs.important.Rd @@ -1,12 +1,13 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/step-predictor-best.R, % R/step-predictor-retain.R, R/step_predictor_desirability.R -\name{required_pkgs.step_predictor_desirability} +\name{required_pkgs.step_predictor_best} +\alias{required_pkgs.step_predictor_best} \alias{required_pkgs.step_predictor_desirability} \alias{required_pkgs.important} \title{S3 methods for tracking which additional packages are needed for steps.} \usage{ -\method{required_pkgs}{step_predictor_desirability}(x, ...) +\method{required_pkgs}{step_predictor_best}(x, ...) \method{required_pkgs}{step_predictor_desirability}(x, ...) diff --git a/man/step_predictor_desirability.Rd b/man/step_predictor_desirability.Rd index 5af06ad..3e297db 100644 --- a/man/step_predictor_desirability.Rd +++ b/man/step_predictor_desirability.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/step_predictor_desirability.R \name{step_predictor_desirability} \alias{step_predictor_desirability} -\title{Supervised Multivariable Feature Selection via Desirability Functions} +\title{Supervised Multivariate Feature Selection via Desirability Functions} \usage{ step_predictor_desirability( recipe, diff --git a/tests/testthat/_snaps/step-predictor-retain.md b/tests/testthat/_snaps/step-predictor-retain.md index 64e120a..90df8cb 100644 --- a/tests/testthat/_snaps/step-predictor-retain.md +++ b/tests/testthat/_snaps/step-predictor-retain.md @@ -12,7 +12,7 @@ predictor: 10 -- Operations - * Feature selection on: + * Feature selection using on: --- @@ -31,7 +31,7 @@ Training data contained 32 data points and no incomplete rows. -- Operations - * Feature selection on: | Trained + * Feature selection using on: | Trained # printing @@ -47,7 +47,7 @@ predictor: 10 -- Operations - * Feature selection on: all_predictors() + * Feature selection using `cor_pearson` on: all_predictors() --- @@ -66,7 +66,7 @@ Training data contained 32 data points and no incomplete rows. -- Operations - * Feature selection on: cyl, disp, hp, wt | Trained + * Feature selection using `cor_pearson` on: cyl, disp, hp, wt | Trained # bad args From 87e006f9813cc5a963f24575a66ac0fddc94c231 Mon Sep 17 00:00:00 2001 From: topepo Date: Wed, 3 Sep 2025 13:39:39 -0400 Subject: [PATCH 42/65] not needed --- man/rmd/tunable-args.Rmd | 40 ---------------------------------------- 1 file changed, 40 deletions(-) delete mode 100644 man/rmd/tunable-args.Rmd diff --git a/man/rmd/tunable-args.Rmd b/man/rmd/tunable-args.Rmd deleted file mode 100644 index d19d0ad..0000000 --- a/man/rmd/tunable-args.Rmd +++ /dev/null @@ -1,40 +0,0 @@ -```{r} -#| include: false -get_dials <- function(x) { - if (any(names(x) == "range")) { - cl <- rlang::call2(x$fun, .ns = x$pkg, range = x$range) - } else { - cl <- rlang::call2(x$fun, .ns = x$pkg) - } - rlang::eval_tidy(cl) -} - -get_param_list <- function(x) { - args <- formals(x) - params <- getS3method("tunable", x)(list()) |> - dplyr::mutate( - default = args[name], - dials = purrr::map(call_info, get_dials), - label = purrr::map_chr(dials, \(.x) .x$label), - type = purrr::map_chr(dials, \(.x) .x$type), - item = glue::glue("- `{name}`: {label} (type: {type}, default: {default})\n\n") - ) - - params$item -} -``` - -# Tuning Parameters - -```{r} -#| echo: false -param <- get_param_list(step) -``` - -This step has `r length(param)` tuning parameters: - -```{r} -#| echo: false -#| results: asis -param -``` From 4f3a0ccfd5c89abfc98604ec0829872285aedb66 Mon Sep 17 00:00:00 2001 From: topepo Date: Wed, 3 Sep 2025 13:42:50 -0400 Subject: [PATCH 43/65] document existing filtro scores --- R/step-predictor-best.R | 53 ++++++++++++++++++---- R/step-predictor-retain.R | 52 +++++++++++++++++---- R/step_predictor_desirability.R | 5 +- man/rmd/filtro-scores.Rmd | 34 ++++++++++++++ man/step_predictor_best.Rd | 72 +++++++++++++++++++++++++---- man/step_predictor_desirability.Rd | 25 +++++++++- man/step_predictor_retain.Rd | 73 +++++++++++++++++++++++++----- 7 files changed, 273 insertions(+), 41 deletions(-) create mode 100644 man/rmd/filtro-scores.Rmd diff --git a/R/step-predictor-best.R b/R/step-predictor-best.R index bf88b00..c1cf986 100644 --- a/R/step-predictor-best.R +++ b/R/step-predictor-best.R @@ -1,12 +1,14 @@ #' Supervised Feature Selection via Choosing the Top Predictors #' -#' `step_predictor_best()` creates a *specification* of a recipe step that will -#' perform feature selection by ... +#' `step_predictor_best()` creates a *specification* of a recipe step that uses +#' a single scoring function to measure how much each predictor is related to +#' the outcome value. This step retains a proportion of the most important +#' predictors, and this proportion can be tuned. #' #' @inheritParams step_predictor_desirability #' #' @param score The name of a single score function from the \pkg{filtro} -#' package, such as [filtro::score_imp_rf()], [filtro:: score_roc_auc()], etc. +#' package, such as `"imp_rf"` (for [filtro::score_imp_rf()]), etc. #' See the Details and Examples sections below. This argument *should be named* #' when used. #' @@ -14,13 +16,46 @@ #' #' @details #' -#' This step ... +#' ```{r child = "man/rmd/filtro-scores.Rmd"} +#' ``` #' -#' This step can potentially remove columns from the data set. This may -#' cause issues for subsequent steps in your recipe if the missing columns are -#' specifically referenced by name. To avoid this, see the advice in the -#' _Tips for saving recipes and filtering columns_ section of -#' [recipes::selections]. +#' Some important notes: +#' +#' - Scores that are p-values are automatically transformed by \pkg{filtro} to +#' be in the format `-log10(pvalue)` so that a p-value of 0.1 is converted to +#' 1.0. For these, use the `maximize()` goal. +#' +#' - Other scores are also transformed in the data. For example, the correlation +#' scores given to the recipe step are in absolute value format. See the +#' \pkg{filtro} documentation for each score. +#' +#' - You can use some in-line functions using base R functions. For example, +#' `maximize(max(score_cor_spearman))`. +#' +#' - If a predictor cannot be computed for all scores, it is given a "fallback +#' value" that will prevent it from being excluded for this reason. +#' +#' This step can potentially remove columns from the data set. This may cause +#' issues for subsequent steps in your recipe if the missing columns are +#' specifically referenced by name. To avoid this, see the advice in the _Tips +#' for saving recipes and filtering columns_ section of [recipes::selections]. +#' +#' +#' ## Ties +#' +#' Note that [dplyr::slice_max()] with the argument `with_ties = TRUE ` is used +#' to select predictors. If there are many ties in overall desirability, the +#' proportion selected can be larger than the value given to `prep_terms()`. +#' +#' ## Case Weights +#' +#' Case weights can be used by some scoring functions. To learn more, load the +#' \pkg{filtro} package and check the `case_weights` property of the score object +#' (see Examples below). For a recipe, use one of the tidymodels case weight +#' functions such as [hardhat::importance_weights()] or +#' [hardhat::frequency_weights], to assign the correct data type to the vector of case +#' weights. A recipe will then interpret that class to be a case weight (and no +#' other role). A full example is below. #' #' # Tidying #' diff --git a/R/step-predictor-retain.R b/R/step-predictor-retain.R index fd4b583..16da51c 100644 --- a/R/step-predictor-retain.R +++ b/R/step-predictor-retain.R @@ -1,7 +1,9 @@ #' Supervised Feature Selection via A Single Filter #' -#' `step_predictor_retain()` creates a *specification* of a recipe step that will -#' perform feature selection by ... +#' `step_predictor_retain()` creates a *specification* of a recipe step that +#' uses a logical statement that includes one or more scoring functions to +#' measure how much each predictor is related to the outcome value. This step +#' retains the predictors that pass the logical statement. #' #' @inheritParams step_predictor_desirability #' @param score A valid R expression that produces a logical result. The @@ -14,15 +16,45 @@ #' #' @details #' -#' This step ... +#' The `score` should be valid R syntax that produces a logical result and +#' should not use external data. The list of variables that can be used is in +#' the section below. #' -#' This step can potentially remove columns from the data set. This may -#' cause issues for subsequent steps in your recipe if the missing columns are -#' specifically referenced by name. To avoid this, see the advice in the -#' _Tips for saving recipes and filtering columns_ section of -#' [recipes::selections]. +#' ```{r child = "man/rmd/filtro-scores.Rmd"} +#' ``` #' -#' # Tidying +#' Some important notes: +#' +#' - Scores that are p-values are automatically transformed by \pkg{filtro} to +#' be in the format `-log10(pvalue)` so that a p-value of 0.1 is converted to +#' 1.0. For these, use the `maximize()` goal. +#' +#' - Other scores are also transformed in the data. For example, the correlation +#' scores given to the recipe step are in absolute value format. See the +#' \pkg{filtro} documentation for each score. +#' +#' - You can use some in-line functions using base R functions. For example, +#' `maximize(max(score_cor_spearman))`. +#' +#' - If a predictor cannot be computed for all scores, it is given a "fallback +#' value" that will prevent it from being excluded for this reason. +#' +#' This step can potentially remove columns from the data set. This may cause +#' issues for subsequent steps in your recipe if the missing columns are +#' specifically referenced by name. To avoid this, see the advice in the _Tips +#' for saving recipes and filtering columns_ section of [recipes::selections]. +#' +#' ## Case Weights +#' +#' Case weights can be used by some scoring functions. To learn more, load the +#' \pkg{filtro} package and check the `case_weights` property of the score object +#' (see Examples below). For a recipe, use one of the tidymodels case weight +#' functions such as [hardhat::importance_weights()] or +#' [hardhat::frequency_weights], to assign the correct data type to the vector of case +#' weights. A recipe will then interpret that class to be a case weight (and no +#' other role). A full example is below. +#' +#' ## Tidy method #' #' When you [`tidy()`][recipes::tidy.recipe] this step, a tibble::tibble is #' returned with columns `terms` and `id`: @@ -40,7 +72,7 @@ #' rec <- recipe(mpg ~ ., data = mtcars) |> #' step_predictor_retain( #' all_predictors(), -#' score = cor_pearson >= 0.75 & cor_spearman >= 0.75 +#' score = cor_pearson >= 0.75 | cor_spearman >= 0.75 #' ) #' #' prepped <- prep(rec) diff --git a/R/step_predictor_desirability.R b/R/step_predictor_desirability.R index 3adea4f..ced8031 100644 --- a/R/step_predictor_desirability.R +++ b/R/step_predictor_desirability.R @@ -1,7 +1,7 @@ #' Supervised Multivariate Feature Selection via Desirability Functions #' #' `step_predictor_desirability()` creates a *specification* of a recipe step -#' that uses one or more "score" functions to measure how how much each predictor +#' that uses one or more "score" functions to measure how much each predictor #' is related to the outcome value. These scores are combined into a composite #' value using user-specified _desirability_ functions and a proportion of the #' most desirable predictors are retained. @@ -58,6 +58,9 @@ #' specifically referenced by name. To avoid this, see the advice in the _Tips #' for saving recipes and filtering columns_ section of [recipes::selections]. #' +#' ```{r child = "man/rmd/filtro-scores.Rmd"} +#' ``` +#' #' ## Ties #' #' Note that [dplyr::slice_max()] with the argument `with_ties = TRUE ` is used diff --git a/man/rmd/filtro-scores.Rmd b/man/rmd/filtro-scores.Rmd new file mode 100644 index 0000000..48c76fb --- /dev/null +++ b/man/rmd/filtro-scores.Rmd @@ -0,0 +1,34 @@ +```{r} +#| include: false +library(filtro) +scores <- ls(pattern = "^score_", envir = asNamespace("filtro")) +scores <- sort(scores) +no_prefix <- gsub("^score_", "", scores) + +f_ver <- packageDescription("filtro")$Version +``` + +## Scoring Functions + +As of version `r f_ver` of the \pkg{filtro} package, the following score functions are available: + +\itemize{ + +```{r} +#| echo: FALSE +#| results: asis + +cat( + paste0( + " \\item \\code{", + no_prefix, + "} (\\code{\\link[filtro:score_", + no_prefix, + "]{documentation}})", + collapse = "\n" +) +) +cat("\n\n") +``` + +} diff --git a/man/step_predictor_best.Rd b/man/step_predictor_best.Rd index 870a723..4c583df 100644 --- a/man/step_predictor_best.Rd +++ b/man/step_predictor_best.Rd @@ -25,7 +25,7 @@ operations for this recipe.} See \code{\link[recipes:selections]{selections()}} for more details.} \item{score}{The name of a single score function from the \pkg{filtro} -package, such as \code{\link[filtro:score_imp_rf]{filtro::score_imp_rf()}}, \code{\link[filtro:score_roc_auc]{filtro:: score_roc_auc()}}, etc. +package, such as \code{"imp_rf"} (for \code{\link[filtro:score_imp_rf]{filtro::score_imp_rf()}}), etc. See the Details and Examples sections below. This argument \emph{should be named} when used.} @@ -54,17 +54,71 @@ may affect the computations for subsequent operations.} \item{id}{A character string that is unique to this step to identify it.} } \description{ -\code{step_predictor_best()} creates a \emph{specification} of a recipe step that will -perform feature selection by ... +\code{step_predictor_best()} creates a \emph{specification} of a recipe step that uses +a single scoring function to measure how much each predictor is related to +the outcome value. This step retains a proportion of the most important +predictors, and this proportion can be tuned. } \details{ -This step ... +\subsection{Scoring Functions}{ -This step can potentially remove columns from the data set. This may -cause issues for subsequent steps in your recipe if the missing columns are -specifically referenced by name. To avoid this, see the advice in the -\emph{Tips for saving recipes and filtering columns} section of -\link[recipes:selections]{recipes::selections}. +As of version 0.2.0 of the \pkg{filtro} package, the following score functions are available: + +\itemize{ + +\item \code{aov_fstat} (\code{\link[filtro:score_aov_fstat]{documentation}}) +\item \code{aov_pval} (\code{\link[filtro:score_aov_pval]{documentation}}) +\item \code{cor_pearson} (\code{\link[filtro:score_cor_pearson]{documentation}}) +\item \code{cor_spearman} (\code{\link[filtro:score_cor_spearman]{documentation}}) +\item \code{gain_ratio} (\code{\link[filtro:score_gain_ratio]{documentation}}) +\item \code{imp_rf} (\code{\link[filtro:score_imp_rf]{documentation}}) +\item \code{imp_rf_conditional} (\code{\link[filtro:score_imp_rf_conditional]{documentation}}) +\item \code{imp_rf_oblique} (\code{\link[filtro:score_imp_rf_oblique]{documentation}}) +\item \code{info_gain} (\code{\link[filtro:score_info_gain]{documentation}}) +\item \code{roc_auc} (\code{\link[filtro:score_roc_auc]{documentation}}) +\item \code{sym_uncert} (\code{\link[filtro:score_sym_uncert]{documentation}}) +\item \code{xtab_pval_chisq} (\code{\link[filtro:score_xtab_pval_chisq]{documentation}}) +\item \code{xtab_pval_fisher} (\code{\link[filtro:score_xtab_pval_fisher]{documentation}}) + +} + +Some important notes: +\itemize{ +\item Scores that are p-values are automatically transformed by \pkg{filtro} to +be in the format \code{-log10(pvalue)} so that a p-value of 0.1 is converted to +1.0. For these, use the \code{maximize()} goal. +\item Other scores are also transformed in the data. For example, the correlation +scores given to the recipe step are in absolute value format. See the +\pkg{filtro} documentation for each score. +\item You can use some in-line functions using base R functions. For example, +\code{maximize(max(score_cor_spearman))}. +\item If a predictor cannot be computed for all scores, it is given a "fallback +value" that will prevent it from being excluded for this reason. +} + +This step can potentially remove columns from the data set. This may cause +issues for subsequent steps in your recipe if the missing columns are +specifically referenced by name. To avoid this, see the advice in the \emph{Tips +for saving recipes and filtering columns} section of \link[recipes:selections]{recipes::selections}. +} + +\subsection{Ties}{ + +Note that \code{\link[dplyr:slice]{dplyr::slice_max()}} with the argument \code{with_ties = TRUE } is used +to select predictors. If there are many ties in overall desirability, the +proportion selected can be larger than the value given to \code{prep_terms()}. +} + +\subsection{Case Weights}{ + +Case weights can be used by some scoring functions. To learn more, load the +\pkg{filtro} package and check the \code{case_weights} property of the score object +(see Examples below). For a recipe, use one of the tidymodels case weight +functions such as \code{\link[hardhat:importance_weights]{hardhat::importance_weights()}} or +\link[hardhat:frequency_weights]{hardhat::frequency_weights}, to assign the correct data type to the vector of case +weights. A recipe will then interpret that class to be a case weight (and no +other role). A full example is below. +} } \section{Tidying}{ When you \code{\link[recipes:tidy.recipe]{tidy()}} this step, a tibble::tibble is diff --git a/man/step_predictor_desirability.Rd b/man/step_predictor_desirability.Rd index 3e297db..c238a20 100644 --- a/man/step_predictor_desirability.Rd +++ b/man/step_predictor_desirability.Rd @@ -71,7 +71,7 @@ Once trained, additional columns are included (see Details section). } \description{ \code{step_predictor_desirability()} creates a \emph{specification} of a recipe step -that uses one or more "score" functions to measure how how much each predictor +that uses one or more "score" functions to measure how much each predictor is related to the outcome value. These scores are combined into a composite value using user-specified \emph{desirability} functions and a proportion of the most desirable predictors are retained. @@ -107,6 +107,29 @@ This step can potentially remove columns from the data set. This may cause issues for subsequent steps in your recipe if the missing columns are specifically referenced by name. To avoid this, see the advice in the \emph{Tips for saving recipes and filtering columns} section of \link[recipes:selections]{recipes::selections}. +\subsection{Scoring Functions}{ + +As of version 0.2.0 of the \pkg{filtro} package, the following score functions are available: + +\itemize{ + +\item \code{aov_fstat} (\code{\link[filtro:score_aov_fstat]{documentation}}) +\item \code{aov_pval} (\code{\link[filtro:score_aov_pval]{documentation}}) +\item \code{cor_pearson} (\code{\link[filtro:score_cor_pearson]{documentation}}) +\item \code{cor_spearman} (\code{\link[filtro:score_cor_spearman]{documentation}}) +\item \code{gain_ratio} (\code{\link[filtro:score_gain_ratio]{documentation}}) +\item \code{imp_rf} (\code{\link[filtro:score_imp_rf]{documentation}}) +\item \code{imp_rf_conditional} (\code{\link[filtro:score_imp_rf_conditional]{documentation}}) +\item \code{imp_rf_oblique} (\code{\link[filtro:score_imp_rf_oblique]{documentation}}) +\item \code{info_gain} (\code{\link[filtro:score_info_gain]{documentation}}) +\item \code{roc_auc} (\code{\link[filtro:score_roc_auc]{documentation}}) +\item \code{sym_uncert} (\code{\link[filtro:score_sym_uncert]{documentation}}) +\item \code{xtab_pval_chisq} (\code{\link[filtro:score_xtab_pval_chisq]{documentation}}) +\item \code{xtab_pval_fisher} (\code{\link[filtro:score_xtab_pval_fisher]{documentation}}) + +} +} + \subsection{Ties}{ Note that \code{\link[dplyr:slice]{dplyr::slice_max()}} with the argument \code{with_ties = TRUE } is used diff --git a/man/step_predictor_retain.Rd b/man/step_predictor_retain.Rd index d0805c0..041d200 100644 --- a/man/step_predictor_retain.Rd +++ b/man/step_predictor_retain.Rd @@ -46,19 +46,70 @@ may affect the computations for subsequent operations.} \item{id}{A character string that is unique to this step to identify it.} } \description{ -\code{step_predictor_retain()} creates a \emph{specification} of a recipe step that will -perform feature selection by ... +\code{step_predictor_retain()} creates a \emph{specification} of a recipe step that +uses a logical statement that includes one or more scoring functions to +measure how much each predictor is related to the outcome value. This step +retains the predictors that pass the logical statement. } \details{ -This step ... +The \code{score} should be valid R syntax that produces a logical result and +should not use external data. The list of variables that can be used is in +the section below. +\subsection{Scoring Functions}{ -This step can potentially remove columns from the data set. This may -cause issues for subsequent steps in your recipe if the missing columns are -specifically referenced by name. To avoid this, see the advice in the -\emph{Tips for saving recipes and filtering columns} section of -\link[recipes:selections]{recipes::selections}. +As of version 0.2.0 of the \pkg{filtro} package, the following score functions are available: + +\itemize{ + +\item \code{aov_fstat} (\code{\link[filtro:score_aov_fstat]{documentation}}) +\item \code{aov_pval} (\code{\link[filtro:score_aov_pval]{documentation}}) +\item \code{cor_pearson} (\code{\link[filtro:score_cor_pearson]{documentation}}) +\item \code{cor_spearman} (\code{\link[filtro:score_cor_spearman]{documentation}}) +\item \code{gain_ratio} (\code{\link[filtro:score_gain_ratio]{documentation}}) +\item \code{imp_rf} (\code{\link[filtro:score_imp_rf]{documentation}}) +\item \code{imp_rf_conditional} (\code{\link[filtro:score_imp_rf_conditional]{documentation}}) +\item \code{imp_rf_oblique} (\code{\link[filtro:score_imp_rf_oblique]{documentation}}) +\item \code{info_gain} (\code{\link[filtro:score_info_gain]{documentation}}) +\item \code{roc_auc} (\code{\link[filtro:score_roc_auc]{documentation}}) +\item \code{sym_uncert} (\code{\link[filtro:score_sym_uncert]{documentation}}) +\item \code{xtab_pval_chisq} (\code{\link[filtro:score_xtab_pval_chisq]{documentation}}) +\item \code{xtab_pval_fisher} (\code{\link[filtro:score_xtab_pval_fisher]{documentation}}) + +} + +Some important notes: +\itemize{ +\item Scores that are p-values are automatically transformed by \pkg{filtro} to +be in the format \code{-log10(pvalue)} so that a p-value of 0.1 is converted to +1.0. For these, use the \code{maximize()} goal. +\item Other scores are also transformed in the data. For example, the correlation +scores given to the recipe step are in absolute value format. See the +\pkg{filtro} documentation for each score. +\item You can use some in-line functions using base R functions. For example, +\code{maximize(max(score_cor_spearman))}. +\item If a predictor cannot be computed for all scores, it is given a "fallback +value" that will prevent it from being excluded for this reason. +} + +This step can potentially remove columns from the data set. This may cause +issues for subsequent steps in your recipe if the missing columns are +specifically referenced by name. To avoid this, see the advice in the \emph{Tips +for saving recipes and filtering columns} section of \link[recipes:selections]{recipes::selections}. } -\section{Tidying}{ + +\subsection{Case Weights}{ + +Case weights can be used by some scoring functions. To learn more, load the +\pkg{filtro} package and check the \code{case_weights} property of the score object +(see Examples below). For a recipe, use one of the tidymodels case weight +functions such as \code{\link[hardhat:importance_weights]{hardhat::importance_weights()}} or +\link[hardhat:frequency_weights]{hardhat::frequency_weights}, to assign the correct data type to the vector of case +weights. A recipe will then interpret that class to be a case weight (and no +other role). A full example is below. +} + +\subsection{Tidy method}{ + When you \code{\link[recipes:tidy.recipe]{tidy()}} this step, a tibble::tibble is returned with columns \code{terms} and \code{id}: @@ -69,14 +120,14 @@ returned with columns \code{terms} and \code{id}: The underlying operation does not allow for case weights. } - +} \examples{ library(recipes) rec <- recipe(mpg ~ ., data = mtcars) |> step_predictor_retain( all_predictors(), - score = cor_pearson >= 0.75 & cor_spearman >= 0.75 + score = cor_pearson >= 0.75 | cor_spearman >= 0.75 ) prepped <- prep(rec) From a3a06d0e63d02e183b88b1603eedff1746cf7070 Mon Sep 17 00:00:00 2001 From: topepo Date: Wed, 3 Sep 2025 13:48:44 -0400 Subject: [PATCH 44/65] ensure that transformed scores are used --- R/step-predictor-best.R | 2 +- R/step-predictor-retain.R | 2 +- R/step_predictor_desirability.R | 2 +- tests/testthat/test-step-predictor-best.R | 3 ++- 4 files changed, 5 insertions(+), 4 deletions(-) diff --git a/R/step-predictor-best.R b/R/step-predictor-best.R index c1cf986..c5e9dd1 100644 --- a/R/step-predictor-best.R +++ b/R/step-predictor-best.R @@ -201,7 +201,7 @@ calculate_predictor_best <- function( score_df <- # save for tidy method score_res |> - filtro::fill_safe_value(return_results = TRUE) + filtro::fill_safe_value(return_results = TRUE, transform = TRUE) # ------------------------------------------------------------------------------ # filter predictors diff --git a/R/step-predictor-retain.R b/R/step-predictor-retain.R index 16da51c..0977bad 100644 --- a/R/step-predictor-retain.R +++ b/R/step-predictor-retain.R @@ -205,7 +205,7 @@ calculate_predictor_retain <- function( score_df <- # save for tidy method score_res |> - filtro::fill_safe_values() + filtro::fill_safe_values(transform = TRUE) # ------------------------------------------------------------------------------ # filter predictors diff --git a/R/step_predictor_desirability.R b/R/step_predictor_desirability.R index ced8031..8916a9f 100644 --- a/R/step_predictor_desirability.R +++ b/R/step_predictor_desirability.R @@ -320,7 +320,7 @@ prep.step_predictor_desirability <- function(x, training, info = NULL, ...) { score_objs <- score_objs |> - filtro::fill_safe_values() # <- transforms too + filtro::fill_safe_values(transform = TRUE) # The score names include "score_" but the column names don't rm_vec <- gsub("^score_", "", score_names) diff --git a/tests/testthat/test-step-predictor-best.R b/tests/testthat/test-step-predictor-best.R index 67fb5da..d8eafe4 100644 --- a/tests/testthat/test-step-predictor-best.R +++ b/tests/testthat/test-step-predictor-best.R @@ -13,7 +13,8 @@ test_that("step works", { cor_pearson_res <- filtro::score_cor_pearson |> filtro::fit(mpg ~ ., data = mtcars) - cor_pearson_res <- cor_pearson_res |> filtro::fill_safe_value() + cor_pearson_res <- cor_pearson_res |> + filtro::fill_safe_value(transform = TRUE) exp <- cor_pearson_res@results |> dplyr::slice_max(score, prop = 1 / 2, with_ties = TRUE) |> dplyr::pull("predictor") From 9217b227385bca3422329ebccfaf0e9380281aa7 Mon Sep 17 00:00:00 2001 From: topepo Date: Wed, 3 Sep 2025 15:14:54 -0400 Subject: [PATCH 45/65] harmonize the output objects --- R/important-package.R | 3 +- R/recipe_utils.R | 112 ++++++++++-------- R/step-predictor-best.R | 90 +++++++++----- R/step-predictor-retain.R | 61 ++++++---- R/step_predictor_desirability.R | 26 ++-- man/step_predictor_best.Rd | 20 ++++ man/step_predictor_desirability.Rd | 8 +- man/step_predictor_retain.Rd | 5 + .../testthat/_snaps/step-predictor-retain.md | 8 +- tests/testthat/test-step-predictor-retain.R | 52 ++++---- 10 files changed, 231 insertions(+), 154 deletions(-) diff --git a/R/important-package.R b/R/important-package.R index b82270d..e5a78e2 100644 --- a/R/important-package.R +++ b/R/important-package.R @@ -35,7 +35,8 @@ utils::globalVariables( ".d_overall", "outcome", "score_objs", - "retain" + ".removed", + "terms" ) ) diff --git a/R/recipe_utils.R b/R/recipe_utils.R index 752a0bb..98f94fd 100644 --- a/R/recipe_utils.R +++ b/R/recipe_utils.R @@ -1,76 +1,90 @@ # For an expression or quosure, list the user-specificied score functions extract_score_names <- function(x, call = rlang::caller_env()) { - if (rlang::is_quosure(x)) { - if (rlang::quo_is_missing(x)) { - cli::cli_abort("{.arg score} cannot be empty.", call = call) - } - x <- rlang::quo_get_expr(x) - } - res <- all.vars(x) - check_score_names(res, call) + if (rlang::is_quosure(x)) { + if (rlang::quo_is_missing(x)) { + cli::cli_abort("{.arg score} cannot be empty.", call = call) + } + x <- rlang::quo_get_expr(x) + } + res <- all.vars(x) + check_score_names(res, call) } check_score_names <- function(x, call) { - x <- unique(x) - res <- grep("^score_", x, value = TRUE) - if (length(res) == 0) { - cli::cli_abort("No score objects were found in {.arg score}.", call = call) - } - res + x <- unique(x) + res <- grep("^score_", x, value = TRUE) + if (length(res) == 0) { + cli::cli_abort("No score objects were found in {.arg score}.", call = call) + } + res } # TODO this should go into recipes pull_outcome_column_name <- function(x) { - outcome_name <- x$variable[x$role == "outcome"] - num_outcomes <- length(outcome_name) - if (num_outcomes != 1) { - cli::cli_abort("One column should have a role of {.val outcome}.") - } - outcome_name + outcome_name <- x$variable[x$role == "outcome"] + num_outcomes <- length(outcome_name) + if (num_outcomes != 1) { + cli::cli_abort("One column should have a role of {.val outcome}.") + } + outcome_name } check_weights <- function(object, weights) { - if (object@case_weights & !is.null(weights)) { - res <- weights - } else { - res <- NULL - } - res + if (object@case_weights & !is.null(weights)) { + res <- weights + } else { + res <- NULL + } + res } compute_score <- function(score, args, form, data, weights = NULL) { - score_obj <- find_score_object(score) + score_obj <- find_score_object(score) - # Process case weights - weights <- check_weights(score_obj, weights) - if (!is.null(weights)) { - args$case_weights <- weights - } + # Process case weights + weights <- check_weights(score_obj, weights) + if (!is.null(weights)) { + args$case_weights <- weights + } - cl <- rlang::call2( - "fit", - .ns = "generics", - object = quote(score_obj), - formula = quote(form), - data = quote(data) - ) - cl <- rlang::call_modify(cl, !!!args) + cl <- rlang::call2( + "fit", + .ns = "generics", + object = quote(score_obj), + formula = quote(form), + data = quote(data) + ) + cl <- rlang::call_modify(cl, !!!args) - res <- try(rlang::eval_tidy(cl), silent = TRUE) + res <- try(rlang::eval_tidy(cl), silent = TRUE) - # if error return all NA - res + # if error return all NA + res } # Temporary solution find_score_object <- function(x) { - utils::getFromNamespace(x, "filtro") + utils::getFromNamespace(x, "filtro") } update_prop <- function(num_cols, prop) { - min_prop <- 1/num_cols - if (prop < min_prop) { - prop <- min_prop + 2 * .Machine$double.eps - } - prop + min_prop <- 1 / num_cols + if (prop < min_prop) { + prop <- min_prop + 2 * .Machine$double.eps + } + prop +} + +tidy_filtro_rec <- function(x, ...) { + if (is_trained(x)) { + res <- + x$results |> + dplyr::select(-outcome, terms = predictor) |> + dplyr::relocate(.removed, .after = c(terms)) + } else { + term_names <- sel2char(x$terms) + res <- tibble::tibble(terms = term_names) + } + res$id <- x$id + res } diff --git a/R/step-predictor-best.R b/R/step-predictor-best.R index c5e9dd1..0972137 100644 --- a/R/step-predictor-best.R +++ b/R/step-predictor-best.R @@ -66,6 +66,20 @@ #' \item{terms}{character, the selectors or variables selected to be removed} #' \item{id}{character, id of this step} #' } +#' @examples +#' library(recipes) +#' +#' rec <- recipe(mpg ~ ., data = mtcars) |> +#' step_predictor_best( +#' all_predictors(), +#' score = "cor_spearman" +#' ) +#' +#' prepped <- prep(rec) +#' +#' bake(prepped, mtcars) +#' +#' tidy(prepped, 1) step_predictor_best <- function( recipe, ..., @@ -74,7 +88,7 @@ step_predictor_best <- function( trained = FALSE, prop_terms = 0.5, update_prop = TRUE, - #results = NULL, + results = NULL, removals = NULL, skip = FALSE, id = rand_id("predictor_best") @@ -88,7 +102,7 @@ step_predictor_best <- function( trained = trained, prop_terms = prop_terms, update_prop = update_prop, - #results = results, + results = results, removals = removals, skip = skip, id = id, @@ -105,7 +119,7 @@ step_predictor_best_new <- trained, prop_terms, update_prop = update_prop, - #results, + results, removals, skip, id, @@ -119,7 +133,7 @@ step_predictor_best_new <- trained = trained, prop_terms = prop_terms, update_prop = update_prop, - #results = results, + results = results, removals = removals, skip = skip, id = id, @@ -151,14 +165,24 @@ prep.step_predictor_best <- function(x, training, info = NULL, ...) { outcome_name <- pull_outcome_column_name(info) if (length(col_names) > 1) { - filter <- calculate_predictor_best( - score = x$score, - prop_terms = x$prop_terms, - outcome = outcome_name, - data = training[, c(outcome_name, col_names)] + filter_res <- list( + raw = tibble::tibble( + outcome = character(0), + predictor = character(0), + score = double(0) + ), + removals = character(0) ) } else { - filter <- character(0) + filter_res <- list( + tibble::tibble( + outcome = character(0), + predictor = character(0), + score = double(0), + .removed = logical(0) + ), + removals = character(0) + ) } step_predictor_best_new( @@ -166,10 +190,10 @@ prep.step_predictor_best <- function(x, training, info = NULL, ...) { score = x$score, role = x$role, trained = TRUE, - #results = score_objs, + results = filter_res$raw, prop_terms = x$prop_terms, update_prop = x$update_prop, - removals = filter, + removals = filter_res$removals, skip = x$skip, id = x$id, case_weights = were_weights_used @@ -199,20 +223,37 @@ calculate_predictor_best <- function( # The current filtro::fill_safe_value() only applies to class_score, not df nor tibble. - score_df <- # save for tidy method + score_df <- score_res |> filtro::fill_safe_value(return_results = TRUE, transform = TRUE) # ------------------------------------------------------------------------------ # filter predictors - final_res <- score_df |> - dplyr::slice_max(score, prop = prop_terms, with_ties = TRUE) |> - dplyr::pull("predictor") - if (length(final_res) == 0) { - # final_res <- fallback_pred() + if (score_res@direction == "maximize") { + keepers <- score_df |> + dplyr::slice_max(score, prop = prop_terms, with_ties = TRUE) + fallback_col <- score_df$predictor[which.max(score_df$score)[1]] + } else { + keepers <- score_df |> + dplyr::slice_min(score, prop = prop_terms, with_ties = TRUE) + fallback_col <- score_df$predictor[which.min(score_df$score)[1]] + } + keepers <- keepers |> dplyr::pull("predictor") + + if (length(keepers) == 0) { + keepers <- score_df$predictors[score_df$predictors != fallback_col] } - final_res + + removals <- setdiff(score_df$predictor, keepers) + + raw_res <- score_res@results |> dplyr::select(outcome, predictor, score) + raw_res$.removed <- raw_res$predictor %in% removals + + list( + raw = raw_res, + removals = removals + ) } #' @export @@ -244,16 +285,7 @@ print.step_predictor_best <- function( #' @usage NULL #' @export -tidy.step_predictor_best <- function(x, ...) { - if (is_trained(x)) { - res <- tibble::tibble(terms = unname(x$removals)) - } else { - term_names <- sel2char(x$terms) - res <- tibble::tibble(terms = term_names) - } - res$id <- x$id - res -} +tidy.step_predictor_best <- tidy_filtro_rec #' @export tunable.step_predictor_best <- function(x, ...) { diff --git a/R/step-predictor-retain.R b/R/step-predictor-retain.R index 0977bad..dc8b512 100644 --- a/R/step-predictor-retain.R +++ b/R/step-predictor-retain.R @@ -86,6 +86,7 @@ step_predictor_retain <- function( score, role = NA, trained = FALSE, + results = NULL, removals = NULL, skip = FALSE, id = rand_id("predictor_retain") @@ -97,6 +98,7 @@ step_predictor_retain <- function( role = role, trained = trained, score = rlang::enexpr(score), + results = results, removals = removals, skip = skip, id = id, @@ -111,6 +113,7 @@ step_predictor_retain_new <- role, trained, score, + results, removals, skip, id, @@ -122,6 +125,7 @@ step_predictor_retain_new <- role = role, trained = trained, score = score, + results = results, removals = removals, skip = skip, id = id, @@ -143,13 +147,20 @@ prep.step_predictor_retain <- function(x, training, info = NULL, ...) { outcome_name <- pull_outcome_column_name(info) if (length(col_names) > 1) { - filter <- calculate_predictor_retain( + filter_res <- calculate_predictor_retain( xpr = x$score, outcome = outcome_name, data = training[, c(outcome_name, col_names)] ) } else { - filter <- character(0) + filter_res <- list( + raw = tibble::tibble( + outcome = character(0), + predictor = character(0), + .removed = logical(0) + ), + removals = character(0) + ) } step_predictor_retain_new( @@ -157,7 +168,8 @@ prep.step_predictor_retain <- function(x, training, info = NULL, ...) { role = x$role, trained = TRUE, score = x$score, - removals = filter, + results = filter_res$raw, + removals = filter_res$removals, skip = x$skip, id = x$id, case_weights = were_weights_used @@ -210,17 +222,27 @@ calculate_predictor_retain <- function( # ------------------------------------------------------------------------------ # filter predictors - final_res <- score_df |> dplyr::filter(!!xpr) |> purrr::pluck("predictor") + keepers <- score_df |> dplyr::filter(!!xpr) |> purrr::pluck("predictor") - if (length(final_res) == 0) { - # final_res <- fallback_pred() - } - final_res -} + # if (length(keepers) == 0) { + # first_score <- all.vars(xpr)[1] + # first_score_obj <- score_res[[first_score]] + # + # if (first_score_obj@direction == "maximize") { + # keepers <- score_df$predictor[which.max(score_df[[first_score]])[1]] + # } else { + # keepers <- score_df$predictor[which.min(score_df[[first_score]])[1]] + # } + # } + removals <- setdiff(score_df$predictor, keepers) -fallback_pred <- function(scores) { - # get individual ranks - # save best average rank + raw_res <- filtro::bind_scores(score_res) + raw_res$.removed <- raw_res$predictor %in% removals + + list( + raw = raw_res, + removals = removals + ) } make_opt_list <- function(opts, scores) { @@ -247,8 +269,10 @@ print.step_predictor_retain <- function( ) { scores <- unique(all.vars(x$score)) + word <- ifelse(x$trained, "removing", "for") + title <- cli::format_inline( - "Feature selection using {.and {.code {scores}}} on" + "Feature selection using {.and {.code {scores}}} {word}" ) print_step( x$removals, @@ -263,16 +287,7 @@ print.step_predictor_retain <- function( #' @usage NULL #' @export -tidy.step_predictor_retain <- function(x, ...) { - if (is_trained(x)) { - res <- tibble::tibble(terms = unname(x$removals)) - } else { - term_names <- sel2char(x$terms) - res <- tibble::tibble(terms = term_names) - } - res$id <- x$id - res -} +tidy.step_predictor_retain <- tidy_filtro_rec #' @rdname required_pkgs.important #' @export diff --git a/R/step_predictor_desirability.R b/R/step_predictor_desirability.R index 8916a9f..a31ee55 100644 --- a/R/step_predictor_desirability.R +++ b/R/step_predictor_desirability.R @@ -142,12 +142,12 @@ #' rec #' #' # Now evaluate the predictors and rank them via desirability: -#' rec_trained <- prep(rec) -#' rec_trained +#' prepped <- prep(rec) +#' prepped #' #' # Use the tidy() method to get the results: -#' predictor_scores <- tidy(rec_trained, number = 1) -#' mean(predictor_scores$retained) +#' predictor_scores <- tidy(prepped, number = 1) +#' mean(predictor_scores$.removed) #' predictor_scores #' #' # -------------------------------------------------------------------------- @@ -337,11 +337,11 @@ prep.step_predictor_desirability <- function(x, training, info = NULL, ...) { dplyr::anti_join(score_df, keep_list[, "predictor"], by = "predictor") |> purrr::pluck("predictor") - score_df$retain <- score_df$predictor %in% rm_list + score_df$.removed <- score_df$predictor %in% rm_list score_df <- score_df |> dplyr::full_join(raw_scores, by = c("outcome", "predictor")) |> - dplyr::relocate(retain, .after = "predictor") + dplyr::relocate(.removed, .after = "predictor") step_predictor_desirability_new( terms = x$terms, @@ -389,19 +389,7 @@ print.step_predictor_desirability <- function( #' @usage NULL #' @export -tidy.step_predictor_desirability <- function(x, ...) { - if (is_trained(x)) { - res <- - x$results |> - dplyr::select(-outcome, terms = predictor) - res$retained <- !(res$terms %in% x$removals) - } else { - term_names <- sel2char(x$terms) - res <- tibble::tibble(terms = term_names) - } - res$id <- x$id - res -} +tidy.step_predictor_desirability <- tidy_filtro_rec #' @export tunable.step_predictor_desirability <- function(x, ...) { diff --git a/man/step_predictor_best.Rd b/man/step_predictor_best.Rd index 4c583df..858ce9a 100644 --- a/man/step_predictor_best.Rd +++ b/man/step_predictor_best.Rd @@ -12,6 +12,7 @@ step_predictor_best( trained = FALSE, prop_terms = 0.5, update_prop = TRUE, + results = NULL, removals = NULL, skip = FALSE, id = rand_id("predictor_best") @@ -41,6 +42,10 @@ used.} \item{update_prop}{A logical: should \code{prop_terms} be updated so that at least one predictor will be retained?} +\item{results}{A data frame of score and desirability values for each +predictor evaluated. These values are not determined until \code{\link[recipes:prep]{recipes::prep()}} +is called.} + \item{removals}{A character string that contains the names of predictors that should be removed. These values are not determined until \code{\link[recipes:prep]{recipes::prep()}} is called.} @@ -130,3 +135,18 @@ returned with columns \code{terms} and \code{id}: } } +\examples{ +library(recipes) + +rec <- recipe(mpg ~ ., data = mtcars) |> + step_predictor_best( + all_predictors(), + score = "cor_spearman" + ) + +prepped <- prep(rec) + +bake(prepped, mtcars) + +tidy(prepped, 1) +} diff --git a/man/step_predictor_desirability.Rd b/man/step_predictor_desirability.Rd index c238a20..fdcacf0 100644 --- a/man/step_predictor_desirability.Rd +++ b/man/step_predictor_desirability.Rd @@ -200,12 +200,12 @@ if (rlang::is_installed("modeldata")) { rec # Now evaluate the predictors and rank them via desirability: - rec_trained <- prep(rec) - rec_trained + prepped <- prep(rec) + prepped # Use the tidy() method to get the results: - predictor_scores <- tidy(rec_trained, number = 1) - mean(predictor_scores$retained) + predictor_scores <- tidy(prepped, number = 1) + mean(predictor_scores$.removed) predictor_scores # -------------------------------------------------------------------------- diff --git a/man/step_predictor_retain.Rd b/man/step_predictor_retain.Rd index 041d200..23d6c88 100644 --- a/man/step_predictor_retain.Rd +++ b/man/step_predictor_retain.Rd @@ -10,6 +10,7 @@ step_predictor_retain( score, role = NA, trained = FALSE, + results = NULL, removals = NULL, skip = FALSE, id = rand_id("predictor_retain") @@ -33,6 +34,10 @@ This argument \emph{should be named} when used.} \item{trained}{A logical to indicate if the quantities for preprocessing have been estimated.} +\item{results}{A data frame of score and desirability values for each +predictor evaluated. These values are not determined until \code{\link[recipes:prep]{recipes::prep()}} +is called.} + \item{removals}{A character string that contains the names of predictors that should be removed. These values are not determined until \code{\link[recipes:prep]{recipes::prep()}} is called.} diff --git a/tests/testthat/_snaps/step-predictor-retain.md b/tests/testthat/_snaps/step-predictor-retain.md index 90df8cb..195ac52 100644 --- a/tests/testthat/_snaps/step-predictor-retain.md +++ b/tests/testthat/_snaps/step-predictor-retain.md @@ -12,7 +12,7 @@ predictor: 10 -- Operations - * Feature selection using on: + * Feature selection using for: --- @@ -31,7 +31,7 @@ Training data contained 32 data points and no incomplete rows. -- Operations - * Feature selection using on: | Trained + * Feature selection using removing: | Trained # printing @@ -47,7 +47,7 @@ predictor: 10 -- Operations - * Feature selection using `cor_pearson` on: all_predictors() + * Feature selection using `cor_pearson` for: all_predictors() --- @@ -66,7 +66,7 @@ Training data contained 32 data points and no incomplete rows. -- Operations - * Feature selection using `cor_pearson` on: cyl, disp, hp, wt | Trained + * Feature selection using `cor_pearson` removing: drat qsec, ... | Trained # bad args diff --git a/tests/testthat/test-step-predictor-retain.R b/tests/testthat/test-step-predictor-retain.R index 43b00a1..7aa1d29 100644 --- a/tests/testthat/test-step-predictor-retain.R +++ b/tests/testthat/test-step-predictor-retain.R @@ -18,7 +18,7 @@ test_that("step works", { cor_pearson_res, cor_spearman_res )) |> - dplyr::filter(abs(cor_pearson) >= 0.75 & abs(cor_spearman) >= 0.6) |> + dplyr::filter(abs(cor_pearson) < 0.75 | abs(cor_spearman) < 0.6) |> dplyr::pull(predictor) expect_identical( @@ -26,41 +26,36 @@ test_that("step works", { sort(exp) ) expect_identical( - sort(res_tidy$terms), + sort(res_tidy$terms[res_tidy$.removed]), sort(exp) ) + expect_named( + res_tidy, + c("terms", ".removed", "cor_pearson", "cor_spearman", "id") + ) }) -test_that("step allows for no removals", { +test_that("EVERYTHING MUST GO", { rec <- recipe(mpg ~ ., data = mtcars) |> step_predictor_retain( all_predictors(), - score = abs(cor_pearson) >= 0.99 & abs(cor_spearman) >= 0.99 + score = abs(cor_pearson) >= Inf & abs(cor_spearman) >= Inf ) prepped <- prep(rec) res_bake <- bake(prepped, mtcars) res_tidy <- tidy(prepped, 1) - cor_pearson_res <- filtro::score_cor_pearson |> - filtro::fit(mpg ~ ., data = mtcars) - cor_spearman_res <- filtro::score_cor_spearman |> - filtro::fit(mpg ~ ., data = mtcars) - - exp <- filtro::fill_safe_values(list( - cor_pearson_res, - cor_spearman_res - )) |> - dplyr::filter(abs(cor_pearson) >= 0.99 & abs(cor_spearman) >= 0.99) |> - dplyr::pull(predictor) - expect_identical( sort(setdiff(names(mtcars), names(res_bake))), - sort(exp) + sort(names(mtcars)[-1]) ) - expect_identical( - sort(res_tidy$terms), - sort(exp) + expect_true( + all(res_tidy$.removed) + ) + expect_named( + res_tidy, + c("terms", ".removed", "cor_pearson", "cor_spearman", "id") ) }) @@ -79,16 +74,16 @@ test_that("allows for one score", { filtro::fit(mpg ~ ., data = mtcars) exp <- cor_pearson_res@results |> - dplyr::filter(abs(score) >= 0.7) |> + dplyr::filter(abs(score) < 0.7) |> dplyr::pull(predictor) expect_identical( sort(setdiff(names(mtcars), names(res_bake))), sort(exp) ) - expect_identical( - sort(res_tidy$terms), - sort(exp) + expect_named( + res_tidy, + c("terms", ".removed", "cor_pearson", "id") ) }) @@ -134,7 +129,14 @@ test_that("empty selection tidy method works", { rec <- prep(rec, mtcars) - expect_identical(tidy(rec, number = 1), expect) + expect_identical( + tidy(rec, number = 1), + tibble::tibble( + terms = character(0), + .removed = logical(0), + id = character(0) + ) + ) }) test_that("printing", { From 1f90ecf4922809a22fe030b64cae146136c048a0 Mon Sep 17 00:00:00 2001 From: Emil Hvitfeldt Date: Wed, 3 Sep 2025 12:37:57 -0700 Subject: [PATCH 46/65] use any_of() in tidy_filtro_rec --- R/recipe_utils.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/recipe_utils.R b/R/recipe_utils.R index 98f94fd..cb0b8fb 100644 --- a/R/recipe_utils.R +++ b/R/recipe_utils.R @@ -80,7 +80,7 @@ tidy_filtro_rec <- function(x, ...) { res <- x$results |> dplyr::select(-outcome, terms = predictor) |> - dplyr::relocate(.removed, .after = c(terms)) + dplyr::relocate(dplyr::any_of(".removed"), .after = c(terms)) } else { term_names <- sel2char(x$terms) res <- tibble::tibble(terms = term_names) From f188d05c91db3f7cced694a49007203dad843050 Mon Sep 17 00:00:00 2001 From: topepo Date: Wed, 3 Sep 2025 16:49:14 -0400 Subject: [PATCH 47/65] fix Rproj file --- important.Rproj | 1 + 1 file changed, 1 insertion(+) diff --git a/important.Rproj b/important.Rproj index e65709f..792a875 100644 --- a/important.Rproj +++ b/important.Rproj @@ -19,3 +19,4 @@ StripTrailingWhitespace: Yes BuildType: Package PackageUseDevtools: Yes PackageInstallArgs: --no-multiarch --with-keep.source +PackageRoxygenize: rd,collate,namespace From bbc01782b7e9f3e2934b78b2f1219416621fb968 Mon Sep 17 00:00:00 2001 From: topepo Date: Wed, 3 Sep 2025 16:49:28 -0400 Subject: [PATCH 48/65] move some docs around --- R/step_predictor_desirability.R | 5 ++-- man/step_predictor_desirability.Rd | 37 +++++++++++++++--------------- 2 files changed, 20 insertions(+), 22 deletions(-) diff --git a/R/step_predictor_desirability.R b/R/step_predictor_desirability.R index a31ee55..51d9251 100644 --- a/R/step_predictor_desirability.R +++ b/R/step_predictor_desirability.R @@ -37,6 +37,8 @@ #' define _goals_ for each score and pass that to the recipe in the `score` #' argument. #' +#' ```{r child = "man/rmd/filtro-scores.Rmd"} +#' ``` #' Some important notes: #' #' - Scores that are p-values are automatically transformed by \pkg{filtro} to @@ -58,9 +60,6 @@ #' specifically referenced by name. To avoid this, see the advice in the _Tips #' for saving recipes and filtering columns_ section of [recipes::selections]. #' -#' ```{r child = "man/rmd/filtro-scores.Rmd"} -#' ``` -#' #' ## Ties #' #' Note that [dplyr::slice_max()] with the argument `with_ties = TRUE ` is used diff --git a/man/step_predictor_desirability.Rd b/man/step_predictor_desirability.Rd index fdcacf0..635cd68 100644 --- a/man/step_predictor_desirability.Rd +++ b/man/step_predictor_desirability.Rd @@ -88,25 +88,6 @@ in \code{\link[desirability2:d_overall]{desirability2::d_overall()}} and \code{\ To define desirabilities, use \code{\link[desirability2:desirability]{desirability2::desirability()}} function to define \emph{goals} for each score and pass that to the recipe in the \code{score} argument. - -Some important notes: -\itemize{ -\item Scores that are p-values are automatically transformed by \pkg{filtro} to -be in the format \code{-log10(pvalue)} so that a p-value of 0.1 is converted to -1.0. For these, use the \code{maximize()} goal. -\item Other scores are also transformed in the data. For example, the correlation -scores given to the recipe step are in absolute value format. See the -\pkg{filtro} documentation for each score. -\item You can use some in-line functions using base R functions. For example, -\code{maximize(max(score_cor_spearman))}. -\item If a predictor cannot be computed for all scores, it is given a "fallback -value" that will prevent it from being excluded for this reason. -} - -This step can potentially remove columns from the data set. This may cause -issues for subsequent steps in your recipe if the missing columns are -specifically referenced by name. To avoid this, see the advice in the \emph{Tips -for saving recipes and filtering columns} section of \link[recipes:selections]{recipes::selections}. \subsection{Scoring Functions}{ As of version 0.2.0 of the \pkg{filtro} package, the following score functions are available: @@ -128,6 +109,24 @@ As of version 0.2.0 of the \pkg{filtro} package, the following score functions a \item \code{xtab_pval_fisher} (\code{\link[filtro:score_xtab_pval_fisher]{documentation}}) } +Some important notes: +\itemize{ +\item Scores that are p-values are automatically transformed by \pkg{filtro} to +be in the format \code{-log10(pvalue)} so that a p-value of 0.1 is converted to +1.0. For these, use the \code{maximize()} goal. +\item Other scores are also transformed in the data. For example, the correlation +scores given to the recipe step are in absolute value format. See the +\pkg{filtro} documentation for each score. +\item You can use some in-line functions using base R functions. For example, +\code{maximize(max(score_cor_spearman))}. +\item If a predictor cannot be computed for all scores, it is given a "fallback +value" that will prevent it from being excluded for this reason. +} + +This step can potentially remove columns from the data set. This may cause +issues for subsequent steps in your recipe if the missing columns are +specifically referenced by name. To avoid this, see the advice in the \emph{Tips +for saving recipes and filtering columns} section of \link[recipes:selections]{recipes::selections}. } \subsection{Ties}{ From a976bb9ede2e21c992af63631bc9585fa70c56a7 Mon Sep 17 00:00:00 2001 From: topepo Date: Wed, 3 Sep 2025 20:26:54 -0400 Subject: [PATCH 49/65] add code back in --- R/step-predictor-best.R | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/R/step-predictor-best.R b/R/step-predictor-best.R index 0972137..116ac62 100644 --- a/R/step-predictor-best.R +++ b/R/step-predictor-best.R @@ -165,13 +165,11 @@ prep.step_predictor_best <- function(x, training, info = NULL, ...) { outcome_name <- pull_outcome_column_name(info) if (length(col_names) > 1) { - filter_res <- list( - raw = tibble::tibble( - outcome = character(0), - predictor = character(0), - score = double(0) - ), - removals = character(0) + filter_res <- calculate_predictor_best( + score = x$score, + prop_terms = x$prop_terms, + outcome = outcome_name, + data = training[, c(outcome_name, col_names)] ) } else { filter_res <- list( From 502deb2b12a80660c267546490fc2b6381191d50 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=98topepo=E2=80=99?= Date: Wed, 3 Sep 2025 21:26:04 -0400 Subject: [PATCH 50/65] more work on step_predictors_best --- R/recipe_utils.R | 12 ++-- R/step-predictor-best.R | 11 ++-- tests/testthat/_snaps/step-predictor-best.md | 68 ++++++++++++++++++++ tests/testthat/test-step-predictor-best.R | 20 +++--- 4 files changed, 94 insertions(+), 17 deletions(-) create mode 100644 tests/testthat/_snaps/step-predictor-best.md diff --git a/R/recipe_utils.R b/R/recipe_utils.R index cb0b8fb..738d93a 100644 --- a/R/recipe_utils.R +++ b/R/recipe_utils.R @@ -77,10 +77,14 @@ update_prop <- function(num_cols, prop) { tidy_filtro_rec <- function(x, ...) { if (is_trained(x)) { - res <- - x$results |> - dplyr::select(-outcome, terms = predictor) |> - dplyr::relocate(dplyr::any_of(".removed"), .after = c(terms)) + if (is.null(x$results)) { + res <- tibble::tibble(terms = character(), id = character()) + } else { + res <- + x$results |> + dplyr::select(-outcome, terms = predictor) |> + dplyr::relocate(dplyr::any_of(".removed"), .after = c(terms)) + } } else { term_names <- sel2char(x$terms) res <- tibble::tibble(terms = term_names) diff --git a/R/step-predictor-best.R b/R/step-predictor-best.R index 116ac62..bddf28f 100644 --- a/R/step-predictor-best.R +++ b/R/step-predictor-best.R @@ -266,10 +266,13 @@ print.step_predictor_best <- function( width = max(20, options()$width - 36), ... ) { - scores <- unique(x$score) - title <- cli::format_inline( - "Feature selection via {.code {scores}} on" - ) + if (identical(x$score, rlang::enexpr())) { + title <- cli::format_inline("Feature selection on") + } else { + scores <- unique(x$score) + title <- cli::format_inline("Feature selection via {.code {scores}} on") + } + print_step( x$removals, x$terms, diff --git a/tests/testthat/_snaps/step-predictor-best.md b/tests/testthat/_snaps/step-predictor-best.md new file mode 100644 index 0000000..d7fbf6b --- /dev/null +++ b/tests/testthat/_snaps/step-predictor-best.md @@ -0,0 +1,68 @@ +# empty printing + + Code + rec + Message + + -- Recipe ---------------------------------------------------------------------- + + -- Inputs + Number of variables by role + outcome: 1 + predictor: 10 + + -- Operations + * Feature selection on: + +--- + + Code + rec + Message + + -- Recipe ---------------------------------------------------------------------- + + -- Inputs + Number of variables by role + outcome: 1 + predictor: 10 + + -- Training information + Training data contained 32 data points and no incomplete rows. + + -- Operations + * Feature selection on: | Trained + +# printing + + Code + print(rec) + Message + + -- Recipe ---------------------------------------------------------------------- + + -- Inputs + Number of variables by role + predictor: 11 + + -- Operations + * Feature selection on: all_predictors() + +--- + + Code + prep(rec) + Condition + Error in `step_predictor_best()`: + Caused by error in `pull_outcome_column_name()`: + ! One column should have a role of "outcome". + +# bad args + + Code + prep(step_predictor_best(recipe(mpg ~ ., mtcars), all_predictors(), prop_terms = 2)) + Condition + Error in `step_predictor_best()`: + Caused by error in `prep()`: + ! `prop_terms` must be a number between 2.22044604925031e-16 and 1, not the number 2. + diff --git a/tests/testthat/test-step-predictor-best.R b/tests/testthat/test-step-predictor-best.R index d8eafe4..12a4582 100644 --- a/tests/testthat/test-step-predictor-best.R +++ b/tests/testthat/test-step-predictor-best.R @@ -21,18 +21,19 @@ test_that("step works", { expect_identical( sort(setdiff(names(mtcars), names(res_bake))), - sort(exp) + sort(setdiff(names(mtcars)[-1], exp)) ) expect_identical( - sort(res_tidy$terms), - sort(exp) + sort(res_tidy$terms[res_tidy$.removed]), + sort(setdiff(names(mtcars)[-1], exp)) + ) + expect_named( + res_tidy, + c("terms", ".removed", "score", "id") ) }) -# TODO Add more tests - -skip() # Infrastructure --------------------------------------------------------------- test_that("bake method errors when needed non-standard role columns are missing", { @@ -84,13 +85,13 @@ test_that("printing", { step_predictor_best(all_predictors()) expect_snapshot(print(rec)) - expect_snapshot(prep(rec)) + expect_snapshot(prep(rec), error = TRUE) # Emil: is this intended? }) test_that("tunable is setup to work with extract_parameter_set_dials", { skip_if_not_installed("dials") rec <- recipe(~., data = mtcars) |> - step_predictor_best(all_predictors(), threshold = hardhat::tune()) + step_predictor_best(all_predictors(), prop_terms = hardhat::tune()) params <- extract_parameter_set_dials(rec) @@ -101,13 +102,14 @@ test_that("tunable is setup to work with extract_parameter_set_dials", { test_that("bad args", { expect_snapshot( recipe(mpg ~ ., mtcars) |> - step_predictor_best(all_predictors(), threshold = 2) |> + step_predictor_best(all_predictors(), prop_terms = 2) |> prep(), error = TRUE ) }) test_that("0 and 1 rows data work in bake method", { + skip("Emil: unsure if this is an intended error") data <- mtcars rec <- recipe(~., data) |> step_predictor_best(all_numeric_predictors()) |> From 6ea9a165a5cfd6bf023a9ec0f318d2a50023bd3b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=98topepo=E2=80=99?= Date: Wed, 3 Sep 2025 22:05:02 -0400 Subject: [PATCH 51/65] tests for desirability step --- R/step_predictor_desirability.R | 12 +- .../_snaps/step-predictor-desirability.md | 55 +++++ tests/testthat/helper-objects.R | 6 + .../test-step-predictor-desirability.R | 194 ++++++++++++++---- 4 files changed, 227 insertions(+), 40 deletions(-) create mode 100644 tests/testthat/_snaps/step-predictor-desirability.md diff --git a/R/step_predictor_desirability.R b/R/step_predictor_desirability.R index 51d9251..6557d3c 100644 --- a/R/step_predictor_desirability.R +++ b/R/step_predictor_desirability.R @@ -329,9 +329,15 @@ prep.step_predictor_desirability <- function(x, training, info = NULL, ...) { # make desirability expression/eval quosure score_df <- desirability2::make_desirability_cols(x$score, score_objs) - keep_list <- - score_df |> - dplyr::slice_max(.d_overall, prop = x$prop_terms, with_ties = TRUE) + bad_news <- purrr::map_lgl(score_df$.d_overall, ~ identical(.x, 0.0)) + if (all(bad_news)) { + keep_list <- score_df[0,] + } else { + keep_list <- + score_df |> + dplyr::slice_max(.d_overall, prop = x$prop_terms, with_ties = TRUE) + } + rm_list <- dplyr::anti_join(score_df, keep_list[, "predictor"], by = "predictor") |> purrr::pluck("predictor") diff --git a/tests/testthat/_snaps/step-predictor-desirability.md b/tests/testthat/_snaps/step-predictor-desirability.md new file mode 100644 index 0000000..c8a1c2f --- /dev/null +++ b/tests/testthat/_snaps/step-predictor-desirability.md @@ -0,0 +1,55 @@ +# empty printing + + Code + step_predictor_desirability(rec) + Condition + Error in `step_predictor_desirability()`: + ! argument "score" is missing, with no default + +# printing + + Code + print(rec) + Message + + -- Recipe ---------------------------------------------------------------------- + + -- Inputs + Number of variables by role + outcome: 1 + predictor: 10 + + -- Operations + * Feature selection via desirability functions (`cor_pearson` and + `cor_spearman`) on: all_predictors() + +--- + + Code + prep(rec) + Message + + -- Recipe ---------------------------------------------------------------------- + + -- Inputs + Number of variables by role + outcome: 1 + predictor: 10 + + -- Training information + Training data contained 32 data points and no incomplete rows. + + -- Operations + * Feature selection via desirability functions (`cor_pearson` and + `cor_spearman`) on: drat, qsec, am, gear, carb | Trained + +# bad args + + Code + prep(step_predictor_desirability(recipe(mpg ~ ., mtcars), all_predictors(), + score = goals, prop_terms = 2)) + Condition + Error in `step_predictor_desirability()`: + Caused by error in `prep()`: + ! `prop_terms` must be a number between 2.22044604925031e-16 and 1, not the number 2. + diff --git a/tests/testthat/helper-objects.R b/tests/testthat/helper-objects.R index 1466704..69a4b92 100644 --- a/tests/testthat/helper-objects.R +++ b/tests/testthat/helper-objects.R @@ -100,3 +100,9 @@ POTATO <- function(x) { rlang::enquo(x) } +# ------------------------------------------------------------------------------ + +goals <- + desirability2::desirability( + maximize(score_cor_pearson), + constrain(score_cor_spearman, low = 0.7, high = 1)) diff --git a/tests/testthat/test-step-predictor-desirability.R b/tests/testthat/test-step-predictor-desirability.R index 55826bb..19acc51 100644 --- a/tests/testthat/test-step-predictor-desirability.R +++ b/tests/testthat/test-step-predictor-desirability.R @@ -1,5 +1,117 @@ +test_that("step works", { + set.seed(1) + rec <- recipe(mpg ~., data = mtcars) |> + step_predictor_desirability( + all_predictors(), + score = goals, + prop_terms = 0.2 + ) + + prepped <- prep(rec) + + res_bake <- bake(prepped, mtcars) + # res_tidy <- tidy(prepped, 1) + + score_res <- + list( + filtro::score_cor_pearson |> filtro::fit(mpg ~ ., data = mtcars), + filtro::score_cor_spearman |> filtro::fit(mpg ~ ., data = mtcars) + ) |> + filtro::fill_safe_values(transform = TRUE) |> + dplyr::mutate( + d_pearson = desirability2::d_max(cor_pearson, use_data = TRUE), + d_spearman = desirability2::d_box(cor_spearman, low = 0.7, high = 1.0), + d_overall = desirability2::d_overall(dplyr::across(dplyr::starts_with("d_"))) + ) + + retained <- score_res |> dplyr::slice_max(d_overall, prop = 0.2, with_ties = TRUE) + + expect_identical( + sort(setdiff(names(mtcars), names(res_bake))), + sort(setdiff(names(mtcars)[-1], retained$predictor)) + ) + + # expect_identical( + # sort(res_tidy$terms[res_tidy$.removed]), + # sort(setdiff(names(mtcars)[-1], exp)) + # ) + # expect_named( + # res_tidy, + # c("terms", ".removed", "score", "id") + # ) +}) + +test_that("EVERYTHING MUST GO", { + bad_goals <- + desirability2::desirability( + constrain(score_cor_spearman, low = 2, high = 3)) + + set.seed(1) + rec <- recipe(mpg ~., data = mtcars) |> + step_predictor_desirability( + all_predictors(), + score = bad_goals, + prop_terms = 0.2 + ) + + prepped <- prep(rec) + + res_bake <- bake(prepped, mtcars) + res_tidy <- tidy(prepped, 1) + + expect_identical( + names(res_bake), + "mpg" + ) + + expect_identical( + sort(res_tidy$terms[res_tidy$.removed]), + sort(names(mtcars)[-1]) + ) + # TODO max fix + # expect_named( + # res_tidy, + # c("terms", ".removed", "score", "id") + # ) + +}) + +test_that("keep everything", { + easy_goals <- + desirability2::desirability( + constrain(score_cor_spearman, low = -2, high = 3)) + + set.seed(1) + rec <- recipe(mpg ~., data = mtcars) |> + step_predictor_desirability( + all_predictors(), + score = easy_goals, + prop_terms = 0.2 + ) + + prepped <- prep(rec) + + res_bake <- bake(prepped, mtcars) + res_tidy <- tidy(prepped, 1) + + expect_identical( + sort(names(res_bake)), + sort(names(mtcars)) + ) + + expect_identical( + sort(res_tidy$terms[res_tidy$.removed]), + character(0) + ) + # TODO max fix + # expect_named( + # res_tidy, + # c("terms", ".removed", "score", "id") + # ) + +}) + # Infrastructure --------------------------------------------------------------- -skip("not yet!") test_that("bake method errors when needed non-standard role columns are missing", { # Here for completeness @@ -9,65 +121,73 @@ test_that("bake method errors when needed non-standard role columns are missing" test_that("empty printing", { rec <- recipe(mpg ~ ., mtcars) - rec <- step_predictor_desirability(rec) - - expect_snapshot(rec) - - rec <- prep(rec, mtcars) - - expect_snapshot(rec) + expect_snapshot(step_predictor_desirability(rec), error = TRUE) + # TODO should this error? + # expect_snapshot(rec) + # + # rec <- prep(rec, mtcars) + # + # expect_snapshot(rec) }) test_that("empty selection prep/bake is a no-op", { rec1 <- recipe(mpg ~ ., mtcars) - rec2 <- step_predictor_desirability(rec1) - - rec1 <- prep(rec1, mtcars) - rec2 <- prep(rec2, mtcars) - - baked1 <- bake(rec1, mtcars) - baked2 <- bake(rec2, mtcars) - - expect_identical(baked1, baked2) + # TODO should this error? + # rec2 <- step_predictor_desirability(rec1) + # + # rec1 <- prep(rec1, mtcars) + # rec2 <- prep(rec2, mtcars) + # + # baked1 <- bake(rec1, mtcars) + # baked2 <- bake(rec2, mtcars) + # + # expect_identical(baked1, baked2) }) test_that("empty selection tidy method works", { rec <- recipe(mpg ~ ., mtcars) - rec <- step_predictor_desirability(rec) - - expect <- tibble(terms = character(), id = character()) - - expect_identical(tidy(rec, number = 1), expect) - - rec <- prep(rec, mtcars) - - expect_identical(tidy(rec, number = 1), expect) + # TODO should this error? + # rec <- step_predictor_desirability(rec) + # + # expect <- tibble(terms = character(), id = character()) + # + # expect_identical(tidy(rec, number = 1), expect) + # + # rec <- prep(rec, mtcars) + # + # expect_identical(tidy(rec, number = 1), expect) }) test_that("printing", { set.seed(1) - rec <- recipe(~., data = mtcars) |> - step_predictor_desirability(all_predictors()) + rec <- recipe(mpg ~., data = mtcars) |> + step_predictor_desirability( + all_predictors(), + score = goals + ) expect_snapshot(print(rec)) expect_snapshot(prep(rec)) }) test_that("tunable is setup to work with extract_parameter_set_dials", { - skip_if_not_installed("dials") - rec <- recipe(~., data = mtcars) |> - step_predictor_desirability(all_predictors(), threshold = hardhat::tune()) + skip_if_not_installed("dials") + + rec <- recipe( ~ ., data = mtcars) |> + step_predictor_desirability(all_predictors(), + score = goals, + prop_terms = hardhat::tune()) - params <- extract_parameter_set_dials(rec) + params <- extract_parameter_set_dials(rec) - expect_s3_class(params, "parameters") - expect_identical(nrow(params), 1L) + expect_s3_class(params, "parameters") + expect_identical(nrow(params), 1L) }) test_that("bad args", { expect_snapshot( recipe(mpg ~ ., mtcars) |> - step_predictor_desirability(all_predictors(), threshold = 2) |> + step_predictor_desirability(all_predictors(), score = goals, prop_terms = 2) |> prep(), error = TRUE ) @@ -75,8 +195,8 @@ test_that("bad args", { test_that("0 and 1 rows data work in bake method", { data <- mtcars - rec <- recipe(~., data) |> - step_predictor_desirability(all_numeric_predictors()) |> + rec <- recipe(mpg ~., data) |> + step_predictor_desirability(all_numeric_predictors(), score = goals,) |> prep() expect_identical( From 0b4d090ff2f281b26dc341183eedbd880fef3303 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=98topepo=E2=80=99?= Date: Wed, 3 Sep 2025 22:46:41 -0400 Subject: [PATCH 52/65] updates for step_predictor_desirability --- R/recipe_utils.R | 6 + R/step_predictor_desirability.R | 138 +++++++++--------- man/step_predictor_desirability.Rd | 93 ++++++------ .../_snaps/step-predictor-desirability.md | 4 + tests/testthat/helper-objects.R | 4 +- .../test-step-predictor-desirability.R | 74 +++++++--- 6 files changed, 185 insertions(+), 134 deletions(-) diff --git a/R/recipe_utils.R b/R/recipe_utils.R index 738d93a..21001ae 100644 --- a/R/recipe_utils.R +++ b/R/recipe_utils.R @@ -92,3 +92,9 @@ tidy_filtro_rec <- function(x, ...) { res$id <- x$id res } + +all_scores_missing <- function(x) { + scores <- dplyr::select(x, -outcome, -predictor) + all_missing <- purrr::map_lgl(scores, ~ all(is.na(.x))) + all(all_missing) +} diff --git a/R/step_predictor_desirability.R b/R/step_predictor_desirability.R index 6557d3c..9a9e673 100644 --- a/R/step_predictor_desirability.R +++ b/R/step_predictor_desirability.R @@ -50,7 +50,7 @@ #' \pkg{filtro} documentation for each score. #' #' - You can use some in-line functions using base R functions. For example, -#' `maximize(max(score_cor_spearman))`. +#' `maximize(max(cor_spearman))`. #' #' - If a predictor cannot be computed for all scores, it is given a "fallback #' value" that will prevent it from being excluded for this reason. @@ -109,6 +109,8 @@ #' @seealso [desirability2::desirability()] #' @references Derringer, G. and Suich, R. (1980), Simultaneous Optimization of #' Several Response Variables. _Journal of Quality Technology_, 12, 214-219. +#' +#' [https://desirability2.tidymodels.org/reference/inline_desirability.html](https://desirability2.tidymodels.org/reference/inline_desirability.html) #' @examples #' library(recipes) #' library(desirability2) @@ -125,10 +127,10 @@ #' #' # The score_* objects here are from the filtro package. See Details above. #' goals <- -#' desirability( -#' maximize(score_xtab_pval_fisher), -#' maximize(score_aov_pval) -#' ) +#' desirability( +#' maximize(xtab_pval_fisher), +#' maximize(aov_pval) +#' ) #' #' example_data <- modeldata::ad_data #' rec <- @@ -136,7 +138,7 @@ #' step_predictor_desirability( #' all_predictors(), #' score = goals, -#' prop_terms = 1/2 +#' prop_terms = 1 / 2 #' ) #' rec #' @@ -149,53 +151,54 @@ #' mean(predictor_scores$.removed) #' predictor_scores #' -#' # -------------------------------------------------------------------------- -#' -#' # Case-weight example: use the hardhat package to create the appropriate type -#' # of case weights. Here, we'll increase the weights for the minority class and -#' # add them to the data frame. -#' -#' library(hardhat) +#' # -------------------------------------------------------------------------- #' -#' example_weights <- example_data -#' weights <- ifelse(example_data$Class == "Impaired", 5, 1) -#' example_weights$weights <- importance_weights(weights) +#' # Case-weight example: use the hardhat package to create the appropriate type +#' # of case weights. Here, we'll increase the weights for the minority class and +#' # add them to the data frame. #' -#' # To see if the scores can use case weights, load the filtro package and -#' # check the `case_weights` property: +#' library(hardhat) #' -#' library(filtro) +#' example_weights <- example_data +#' weights <- ifelse(example_data$Class == "Impaired", 5, 1) +#' example_weights$weights <- importance_weights(weights) #' -#' score_xtab_pval_fisher@case_weights -#' score_aov_pval@case_weights +#' # To see if the scores can use case weights, load the filtro package and +#' # check the `case_weights` property: #' -#' # The recipe will automatically find the case weights and will -#' # not treat them as predictors. -#' rec_wts <- -#' recipe(Class ~ ., data = example_weights) |> -#' step_predictor_desirability( -#' all_predictors(), -#' score = goals, -#' prop_terms = 1/2 -#' ) |> -#' prep() -#' rec_wts +#' library(filtro) #' -#' predictor_scores_wts <- -#' tidy(rec_wts, number = 1) |> -#' select(terms, .d_overall_weighted = .d_overall) +#' score_xtab_pval_fisher@case_weights +#' score_aov_pval@case_weights #' -#' library(dplyr) -#' library(ggplot2) -#' -#' # The selection did not substantially change with these case weights -#' full_join(predictor_scores, predictor_scores_wts, by = "terms") |> -#' ggplot(aes(.d_overall, .d_overall_weighted)) + -#' geom_abline(col = "darkgreen", lty = 2) + -#' geom_point(alpha = 1 / 2) + -#' coord_fixed(ratio = 1) + -#' labs(x = "Unweighted", y = "Class Weighted") +#' # The recipe will automatically find the case weights and will +#' # not treat them as predictors. +#' rec_wts <- +#' recipe(Class ~ ., data = example_weights) |> +#' step_predictor_desirability( +#' all_predictors(), +#' score = goals, +#' prop_terms = 1 / 2 +#' ) |> +#' prep() +#' rec_wts +#' +#' predictor_scores_wts <- +#' tidy(rec_wts, number = 1) |> +#' select(terms, .d_overall_weighted = .d_overall) +#' +#' library(dplyr) +#' library(ggplot2) +#' +#' # The selection did not substantially change with these case weights +#' full_join(predictor_scores, predictor_scores_wts, by = "terms") |> +#' ggplot(aes(.d_overall, .d_overall_weighted)) + +#' geom_abline(col = "darkgreen", lty = 2) + +#' geom_point(alpha = 1 / 2) + +#' coord_fixed(ratio = 1) + +#' labs(x = "Unweighted", y = "Class Weighted") #' } +#' step_predictor_desirability <- function( recipe, ..., @@ -305,7 +308,9 @@ prep.step_predictor_desirability <- function(x, training, info = NULL, ...) { wts <- NULL } - score_names <- check_score_names(unlist(x$score@variables)) + var_names <- unique(unlist(x$score@variables)) + score_names <- paste0("score_", var_names) + score_names <- check_score_names(score_names) outcome_name <- pull_outcome_column_name(info) fm <- paste(outcome_name, "~ .") fm <- stats::as.formula(fm) @@ -317,25 +322,25 @@ prep.step_predictor_desirability <- function(x, training, info = NULL, ...) { ) raw_scores <- filtro::bind_scores(score_objs) - score_objs <- - score_objs |> - filtro::fill_safe_values(transform = TRUE) - - # The score names include "score_" but the column names don't - rm_vec <- gsub("^score_", "", score_names) - names(rm_vec) <- score_names - score_objs <- dplyr::rename(score_objs, rm_vec) - - # make desirability expression/eval quosure - score_df <- desirability2::make_desirability_cols(x$score, score_objs) + score_df <- + score_objs |> + filtro::fill_safe_values(transform = TRUE) - bad_news <- purrr::map_lgl(score_df$.d_overall, ~ identical(.x, 0.0)) - if (all(bad_news)) { - keep_list <- score_df[0,] + if (all_scores_missing(raw_scores)) { + cli::cli_warn("All score computations failed; skipping feature selection.") + keep_list <- score_df } else { - keep_list <- - score_df |> - dplyr::slice_max(.d_overall, prop = x$prop_terms, with_ties = TRUE) + # make desirability expression/eval quosure + score_df <- desirability2::make_desirability_cols(x$score, score_df) + + bad_news <- purrr::map_lgl(score_df$.d_overall, ~ identical(.x, 0.0)) + if (all(bad_news)) { + keep_list <- score_df[0,] + } else { + keep_list <- + score_df |> + dplyr::slice_max(.d_overall, prop = x$prop_terms, with_ties = TRUE) + } } rm_list <- @@ -345,8 +350,10 @@ prep.step_predictor_desirability <- function(x, training, info = NULL, ...) { score_df$.removed <- score_df$predictor %in% rm_list score_df <- score_df |> + dplyr::select(outcome, predictor, .removed, dplyr::starts_with(".d_")) |> dplyr::full_join(raw_scores, by = c("outcome", "predictor")) |> - dplyr::relocate(.removed, .after = "predictor") + dplyr::relocate(.removed, .after = "predictor") |> + dplyr::relocate(dplyr::starts_with(".d_"), .after = dplyr::everything()) step_predictor_desirability_new( terms = x$terms, @@ -375,8 +382,7 @@ print.step_predictor_desirability <- function( width = max(20, options()$width - 36), ... ) { - scores <- purrr::map_chr(x$score@variables, ~ gsub("score_", "", .x)) - scores <- unique(scores) + scores <- unique(x$score@variables) title <- cli::format_inline( "Feature selection via desirability functions ({.code {scores}}) on" diff --git a/man/step_predictor_desirability.Rd b/man/step_predictor_desirability.Rd index 635cd68..0f66774 100644 --- a/man/step_predictor_desirability.Rd +++ b/man/step_predictor_desirability.Rd @@ -118,7 +118,7 @@ be in the format \code{-log10(pvalue)} so that a p-value of 0.1 is converted to scores given to the recipe step are in absolute value format. See the \pkg{filtro} documentation for each score. \item You can use some in-line functions using base R functions. For example, -\code{maximize(max(score_cor_spearman))}. +\code{maximize(max(cor_spearman))}. \item If a predictor cannot be computed for all scores, it is given a "fallback value" that will prevent it from being excluded for this reason. } @@ -183,10 +183,10 @@ if (rlang::is_installed("modeldata")) { # The score_* objects here are from the filtro package. See Details above. goals <- - desirability( - maximize(score_xtab_pval_fisher), - maximize(score_aov_pval) - ) + desirability( + maximize(xtab_pval_fisher), + maximize(aov_pval) + ) example_data <- modeldata::ad_data rec <- @@ -194,7 +194,7 @@ if (rlang::is_installed("modeldata")) { step_predictor_desirability( all_predictors(), score = goals, - prop_terms = 1/2 + prop_terms = 1 / 2 ) rec @@ -207,57 +207,60 @@ if (rlang::is_installed("modeldata")) { mean(predictor_scores$.removed) predictor_scores - # -------------------------------------------------------------------------- - - # Case-weight example: use the hardhat package to create the appropriate type - # of case weights. Here, we'll increase the weights for the minority class and - # add them to the data frame. - - library(hardhat) + # -------------------------------------------------------------------------- - example_weights <- example_data - weights <- ifelse(example_data$Class == "Impaired", 5, 1) - example_weights$weights <- importance_weights(weights) + # Case-weight example: use the hardhat package to create the appropriate type + # of case weights. Here, we'll increase the weights for the minority class and + # add them to the data frame. - # To see if the scores can use case weights, load the filtro package and - # check the `case_weights` property: + library(hardhat) - library(filtro) + example_weights <- example_data + weights <- ifelse(example_data$Class == "Impaired", 5, 1) + example_weights$weights <- importance_weights(weights) - score_xtab_pval_fisher@case_weights - score_aov_pval@case_weights + # To see if the scores can use case weights, load the filtro package and + # check the `case_weights` property: - # The recipe will automatically find the case weights and will - # not treat them as predictors. - rec_wts <- - recipe(Class ~ ., data = example_weights) |> - step_predictor_desirability( - all_predictors(), - score = goals, - prop_terms = 1/2 - ) |> - prep() - rec_wts + library(filtro) - predictor_scores_wts <- - tidy(rec_wts, number = 1) |> - select(terms, .d_overall_weighted = .d_overall) + score_xtab_pval_fisher@case_weights + score_aov_pval@case_weights - library(dplyr) - library(ggplot2) - - # The selection did not substantially change with these case weights - full_join(predictor_scores, predictor_scores_wts, by = "terms") |> - ggplot(aes(.d_overall, .d_overall_weighted)) + - geom_abline(col = "darkgreen", lty = 2) + - geom_point(alpha = 1 / 2) + - coord_fixed(ratio = 1) + - labs(x = "Unweighted", y = "Class Weighted") + # The recipe will automatically find the case weights and will + # not treat them as predictors. + rec_wts <- + recipe(Class ~ ., data = example_weights) |> + step_predictor_desirability( + all_predictors(), + score = goals, + prop_terms = 1 / 2 + ) |> + prep() + rec_wts + + predictor_scores_wts <- + tidy(rec_wts, number = 1) |> + select(terms, .d_overall_weighted = .d_overall) + + library(dplyr) + library(ggplot2) + + # The selection did not substantially change with these case weights + full_join(predictor_scores, predictor_scores_wts, by = "terms") |> + ggplot(aes(.d_overall, .d_overall_weighted)) + + geom_abline(col = "darkgreen", lty = 2) + + geom_point(alpha = 1 / 2) + + coord_fixed(ratio = 1) + + labs(x = "Unweighted", y = "Class Weighted") } + } \references{ Derringer, G. and Suich, R. (1980), Simultaneous Optimization of Several Response Variables. \emph{Journal of Quality Technology}, 12, 214-219. + +\url{https://desirability2.tidymodels.org/reference/inline_desirability.html} } \seealso{ \code{\link[desirability2:desirability]{desirability2::desirability()}} diff --git a/tests/testthat/_snaps/step-predictor-desirability.md b/tests/testthat/_snaps/step-predictor-desirability.md index c8a1c2f..c2f96b9 100644 --- a/tests/testthat/_snaps/step-predictor-desirability.md +++ b/tests/testthat/_snaps/step-predictor-desirability.md @@ -1,3 +1,7 @@ +# wrong score type + + All score computations failed; skipping feature selection. + # empty printing Code diff --git a/tests/testthat/helper-objects.R b/tests/testthat/helper-objects.R index 69a4b92..67633cc 100644 --- a/tests/testthat/helper-objects.R +++ b/tests/testthat/helper-objects.R @@ -104,5 +104,5 @@ POTATO <- function(x) { goals <- desirability2::desirability( - maximize(score_cor_pearson), - constrain(score_cor_spearman, low = 0.7, high = 1)) + maximize(cor_pearson), + constrain(cor_spearman, low = 0.7, high = 1)) diff --git a/tests/testthat/test-step-predictor-desirability.R b/tests/testthat/test-step-predictor-desirability.R index 19acc51..2f1930e 100644 --- a/tests/testthat/test-step-predictor-desirability.R +++ b/tests/testthat/test-step-predictor-desirability.R @@ -10,7 +10,7 @@ test_that("step works", { prepped <- prep(rec) res_bake <- bake(prepped, mtcars) - # res_tidy <- tidy(prepped, 1) + res_tidy <- tidy(prepped, 1) score_res <- list( @@ -31,20 +31,21 @@ test_that("step works", { sort(setdiff(names(mtcars)[-1], retained$predictor)) ) - # expect_identical( - # sort(res_tidy$terms[res_tidy$.removed]), - # sort(setdiff(names(mtcars)[-1], exp)) - # ) - # expect_named( - # res_tidy, - # c("terms", ".removed", "score", "id") - # ) + expect_identical( + sort(res_tidy$terms[res_tidy$.removed]), + sort(setdiff(names(mtcars)[-1], retained$predictor)) + ) + expect_named( + res_tidy, + c("terms", ".removed", "cor_pearson", "cor_spearman", ".d_max_cor_pearson", + ".d_box_cor_spearman", ".d_overall", "id") + ) }) test_that("EVERYTHING MUST GO", { bad_goals <- desirability2::desirability( - constrain(score_cor_spearman, low = 2, high = 3)) + constrain(cor_spearman, low = 2, high = 3)) set.seed(1) rec <- recipe(mpg ~., data = mtcars) |> @@ -68,18 +69,50 @@ test_that("EVERYTHING MUST GO", { sort(res_tidy$terms[res_tidy$.removed]), sort(names(mtcars)[-1]) ) - # TODO max fix - # expect_named( - # res_tidy, - # c("terms", ".removed", "score", "id") - # ) + expect_named( + res_tidy, + c("terms", ".removed", "cor_spearman", ".d_box_cor_spearman", ".d_overall", "id") + ) }) +test_that("wrong score type", { + wrong_goals <- + desirability2::desirability( + maximize(xtab_pval_fisher)) + + set.seed(1) + rec <- recipe(mpg ~., data = mtcars) |> + step_predictor_desirability( + all_predictors(), + score = wrong_goals, + prop_terms = 0.2 + ) + + expect_snapshot_warning(prepped <- prep(rec)) + + res_bake <- bake(prepped, mtcars) + res_tidy <- tidy(prepped, 1) + + expect_identical( + sort(names(res_bake)), + sort(names(mtcars)) + ) + + expect_identical( + sort(res_tidy$terms[res_tidy$.removed]), + character(0) + ) + expect_named( + res_tidy, + c("terms", ".removed", "xtab_pval_fisher", "id") + ) +}) + test_that("keep everything", { easy_goals <- desirability2::desirability( - constrain(score_cor_spearman, low = -2, high = 3)) + constrain(cor_spearman, low = -2, high = 3)) set.seed(1) rec <- recipe(mpg ~., data = mtcars) |> @@ -103,11 +136,10 @@ test_that("keep everything", { sort(res_tidy$terms[res_tidy$.removed]), character(0) ) - # TODO max fix - # expect_named( - # res_tidy, - # c("terms", ".removed", "score", "id") - # ) + expect_named( + res_tidy, + c("terms", ".removed", "cor_spearman", ".d_box_cor_spearman", ".d_overall", "id") + ) }) From 098275135ac1f01b0b3985b76327b78cc453a922 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=98topepo=E2=80=99?= Date: Wed, 3 Sep 2025 23:07:58 -0400 Subject: [PATCH 53/65] dials skips and unquote dplyr::pull() --- R/step-predictor-best.R | 2 +- R/step-predictor-retain.R | 2 +- R/step_predictor_desirability.R | 2 +- tests/testthat/test-step-predictor-best.R | 4 ++-- tests/testthat/test-step-predictor-desirability.R | 2 +- tests/testthat/test-step-predictor-retain.R | 10 +++++----- 6 files changed, 11 insertions(+), 11 deletions(-) diff --git a/R/step-predictor-best.R b/R/step-predictor-best.R index bddf28f..58e750c 100644 --- a/R/step-predictor-best.R +++ b/R/step-predictor-best.R @@ -237,7 +237,7 @@ calculate_predictor_best <- function( dplyr::slice_min(score, prop = prop_terms, with_ties = TRUE) fallback_col <- score_df$predictor[which.min(score_df$score)[1]] } - keepers <- keepers |> dplyr::pull("predictor") + keepers <- keepers |> dplyr::pull(predictor) if (length(keepers) == 0) { keepers <- score_df$predictors[score_df$predictors != fallback_col] diff --git a/R/step-predictor-retain.R b/R/step-predictor-retain.R index dc8b512..fe2e39f 100644 --- a/R/step-predictor-retain.R +++ b/R/step-predictor-retain.R @@ -222,7 +222,7 @@ calculate_predictor_retain <- function( # ------------------------------------------------------------------------------ # filter predictors - keepers <- score_df |> dplyr::filter(!!xpr) |> purrr::pluck("predictor") + keepers <- score_df |> dplyr::filter(!!xpr) |> dplyr::pull(predictor) # if (length(keepers) == 0) { # first_score <- all.vars(xpr)[1] diff --git a/R/step_predictor_desirability.R b/R/step_predictor_desirability.R index 9a9e673..37e72f3 100644 --- a/R/step_predictor_desirability.R +++ b/R/step_predictor_desirability.R @@ -345,7 +345,7 @@ prep.step_predictor_desirability <- function(x, training, info = NULL, ...) { rm_list <- dplyr::anti_join(score_df, keep_list[, "predictor"], by = "predictor") |> - purrr::pluck("predictor") + dplyr::pull(predictor) score_df$.removed <- score_df$predictor %in% rm_list diff --git a/tests/testthat/test-step-predictor-best.R b/tests/testthat/test-step-predictor-best.R index 12a4582..2c307a8 100644 --- a/tests/testthat/test-step-predictor-best.R +++ b/tests/testthat/test-step-predictor-best.R @@ -17,7 +17,7 @@ test_that("step works", { filtro::fill_safe_value(transform = TRUE) exp <- cor_pearson_res@results |> dplyr::slice_max(score, prop = 1 / 2, with_ties = TRUE) |> - dplyr::pull("predictor") + dplyr::pull(predictor) expect_identical( sort(setdiff(names(mtcars), names(res_bake))), @@ -89,7 +89,7 @@ test_that("printing", { }) test_that("tunable is setup to work with extract_parameter_set_dials", { - skip_if_not_installed("dials") + skip_if_not_installed("dials", minimum_version = "1.4.1.9000") rec <- recipe(~., data = mtcars) |> step_predictor_best(all_predictors(), prop_terms = hardhat::tune()) diff --git a/tests/testthat/test-step-predictor-desirability.R b/tests/testthat/test-step-predictor-desirability.R index 2f1930e..acebbdc 100644 --- a/tests/testthat/test-step-predictor-desirability.R +++ b/tests/testthat/test-step-predictor-desirability.R @@ -203,7 +203,7 @@ test_that("printing", { }) test_that("tunable is setup to work with extract_parameter_set_dials", { - skip_if_not_installed("dials") + skip_if_not_installed("dials", minimum_version = "1.4.1.9000") rec <- recipe( ~ ., data = mtcars) |> step_predictor_desirability(all_predictors(), diff --git a/tests/testthat/test-step-predictor-retain.R b/tests/testthat/test-step-predictor-retain.R index 7aa1d29..8a6982a 100644 --- a/tests/testthat/test-step-predictor-retain.R +++ b/tests/testthat/test-step-predictor-retain.R @@ -15,11 +15,11 @@ test_that("step works", { filtro::fit(mpg ~ ., data = mtcars) exp <- filtro::fill_safe_values(list( - cor_pearson_res, - cor_spearman_res + cor_pearson_res, + cor_spearman_res )) |> - dplyr::filter(abs(cor_pearson) < 0.75 | abs(cor_spearman) < 0.6) |> - dplyr::pull(predictor) + dplyr::filter(abs(cor_pearson) < 0.75 | abs(cor_spearman) < 0.6) |> + dplyr::pull(predictor) expect_identical( sort(setdiff(names(mtcars), names(res_bake))), @@ -75,7 +75,7 @@ test_that("allows for one score", { exp <- cor_pearson_res@results |> dplyr::filter(abs(score) < 0.7) |> - dplyr::pull(predictor) + dplyr::pull(predictor) expect_identical( sort(setdiff(names(mtcars), names(res_bake))), From 8461c890b4a986ab30b1ff19ae3bb7da3f520a64 Mon Sep 17 00:00:00 2001 From: topepo Date: Thu, 4 Sep 2025 06:56:27 -0400 Subject: [PATCH 54/65] enable and test case weights --- R/step-predictor-best.R | 26 +- R/step-predictor-retain.R | 22 +- tests/testthat/helper-objects.R | 46 ++- tests/testthat/test-step-predictor-best.R | 38 +- .../test-step-predictor-desirability.R | 353 ++++++++++-------- tests/testthat/test-step-predictor-retain.R | 35 +- 6 files changed, 319 insertions(+), 201 deletions(-) diff --git a/R/step-predictor-best.R b/R/step-predictor-best.R index 58e750c..5c60a55 100644 --- a/R/step-predictor-best.R +++ b/R/step-predictor-best.R @@ -156,8 +156,10 @@ prep.step_predictor_best <- function(x, training, info = NULL, ...) { x$prop_terms <- update_prop(length(col_names), x$prop_terms) } + # First we check the _type_ of weight to see if it is used. Later, in + # `compute_score()`, we check to see if the score supports case weights. wts <- get_case_weights(info, training) - were_weights_used <- are_weights_used(wts, unsupervised = TRUE) + were_weights_used <- are_weights_used(wts, unsupervised = FALSE) if (isFALSE(were_weights_used)) { wts <- NULL } @@ -169,7 +171,8 @@ prep.step_predictor_best <- function(x, training, info = NULL, ...) { score = x$score, prop_terms = x$prop_terms, outcome = outcome_name, - data = training[, c(outcome_name, col_names)] + data = training[, c(outcome_name, col_names)], + weights = wts ) } else { filter_res <- list( @@ -202,7 +205,8 @@ calculate_predictor_best <- function( score, prop_terms, outcome = character(0), - data + data, + weights ) { score_function <- paste0("score_", score) @@ -212,8 +216,8 @@ calculate_predictor_best <- function( score_function, args = list(), form = fm, - data = data #, - #weights = wts + data = data, + weights = weights ) # ------------------------------------------------------------------------------ @@ -266,12 +270,12 @@ print.step_predictor_best <- function( width = max(20, options()$width - 36), ... ) { - if (identical(x$score, rlang::enexpr())) { - title <- cli::format_inline("Feature selection on") - } else { - scores <- unique(x$score) - title <- cli::format_inline("Feature selection via {.code {scores}} on") - } + if (identical(x$score, rlang::enexpr())) { + title <- cli::format_inline("Feature selection on") + } else { + scores <- unique(x$score) + title <- cli::format_inline("Feature selection via {.code {scores}} on") + } print_step( x$removals, diff --git a/R/step-predictor-retain.R b/R/step-predictor-retain.R index fe2e39f..c8b6bb3 100644 --- a/R/step-predictor-retain.R +++ b/R/step-predictor-retain.R @@ -138,8 +138,10 @@ prep.step_predictor_retain <- function(x, training, info = NULL, ...) { col_names <- recipes_eval_select(x$terms, training, info) check_type(training[, col_names], types = c("double", "integer")) + # First we check the _type_ of weight to see if it is used. Later, in + # `compute_score()`, we check to see if the score supports case weights. wts <- get_case_weights(info, training) - were_weights_used <- are_weights_used(wts, unsupervised = TRUE) + were_weights_used <- are_weights_used(wts, unsupervised = FALSE) if (isFALSE(were_weights_used)) { wts <- NULL } @@ -150,7 +152,8 @@ prep.step_predictor_retain <- function(x, training, info = NULL, ...) { filter_res <- calculate_predictor_retain( xpr = x$score, outcome = outcome_name, - data = training[, c(outcome_name, col_names)] + data = training[, c(outcome_name, col_names)], + weights = wts ) } else { filter_res <- list( @@ -183,6 +186,7 @@ calculate_predictor_retain <- function( xpr, outcome = character(0), data, + weights, opts = list() ) { all_scores <- unique(all.vars(xpr)) @@ -208,7 +212,8 @@ calculate_predictor_retain <- function( opts, compute_score, form = fm, - data = data + data = data, + weights = weights ) names(score_res) <- all_scores @@ -223,17 +228,6 @@ calculate_predictor_retain <- function( # filter predictors keepers <- score_df |> dplyr::filter(!!xpr) |> dplyr::pull(predictor) - - # if (length(keepers) == 0) { - # first_score <- all.vars(xpr)[1] - # first_score_obj <- score_res[[first_score]] - # - # if (first_score_obj@direction == "maximize") { - # keepers <- score_df$predictor[which.max(score_df[[first_score]])[1]] - # } else { - # keepers <- score_df$predictor[which.min(score_df[[first_score]])[1]] - # } - # } removals <- setdiff(score_df$predictor, keepers) raw_res <- filtro::bind_scores(score_res) diff --git a/tests/testthat/helper-objects.R b/tests/testthat/helper-objects.R index 67633cc..c088160 100644 --- a/tests/testthat/helper-objects.R +++ b/tests/testthat/helper-objects.R @@ -13,14 +13,14 @@ suppressPackageStartupMessages(library(desirability2)) CO2_ex <- CO2 |> dplyr::select(-Plant, -Treatment) co2_rec <- recipes::recipe(uptake ~ ., data = CO2_ex) |> - recipes::step_dummy(recipes::all_factor_predictors()) + recipes::step_dummy(recipes::all_factor_predictors()) reg_f_wflow <- workflows::workflow(uptake ~ ., parsnip::linear_reg()) reg_r_wflow <- workflows::workflow(co2_rec, parsnip::linear_reg()) reg_v_wflow <- - workflows::workflow() |> - workflows::add_model(parsnip::linear_reg()) |> - workflows::add_variables(outcomes = uptake, predictors = c(everything())) + workflows::workflow() |> + workflows::add_model(parsnip::linear_reg()) |> + workflows::add_variables(outcomes = uptake, predictors = c(everything())) reg_1d_wflow <- workflows::workflow(uptake ~ conc, parsnip::linear_reg()) reg_f_fit <- workflows:::fit.workflow(reg_f_wflow, CO2_ex) @@ -35,9 +35,9 @@ reg_mtr <- yardstick::metric_set(yardstick::rsq, yardstick::mae) cls_f_wflow <- workflows::workflow(Class ~ ., parsnip::logistic_reg()) cls_v_wflow <- - workflows::workflow() |> - workflows::add_model(parsnip::logistic_reg()) |> - workflows::add_variables(outcomes = Class, predictors = c(everything())) + workflows::workflow() |> + workflows::add_model(parsnip::logistic_reg()) |> + workflows::add_variables(outcomes = Class, predictors = c(everything())) cls_1d_wflow <- workflows::workflow(Class ~ tau, parsnip::logistic_reg()) if (rlang::is_installed("modeldata")) { @@ -48,9 +48,9 @@ if (rlang::is_installed("modeldata")) { dplyr::select(Class, tau, p_tau, VEGF, MMP10, Genotype, male) ad_rec <- - recipes::recipe(Class ~ ., data = ad_data_small) |> - recipes::step_pca(tau, p_tau, VEGF, MMP10, num_comp = 2) |> - recipes::step_dummy(recipes::all_factor_predictors()) + recipes::recipe(Class ~ ., data = ad_data_small) |> + recipes::step_pca(tau, p_tau, VEGF, MMP10, num_comp = 2) |> + recipes::step_dummy(recipes::all_factor_predictors()) cls_r_wflow <- workflows::workflow(ad_rec, parsnip::logistic_reg()) @@ -60,7 +60,11 @@ if (rlang::is_installed("modeldata")) { cls_1d_fit <- workflows:::fit.workflow(cls_1d_wflow, ad_data_small) } -cls_mtr <- yardstick::metric_set(yardstick::brier_class, yardstick::kap, yardstick::mcc) +cls_mtr <- yardstick::metric_set( + yardstick::brier_class, + yardstick::kap, + yardstick::mcc +) # ------------------------------------------------------------------------------ # survival examples @@ -79,7 +83,10 @@ if (rlang::is_installed("censored")) { srv_times <- (1:4) / 4 } -srv_mtr <- yardstick::metric_set(yardstick::concordance_survival, yardstick::roc_auc_survival) +srv_mtr <- yardstick::metric_set( + yardstick::concordance_survival, + yardstick::roc_auc_survival +) # ------------------------------------------------------------------------------ @@ -97,12 +104,15 @@ ex_seed <- # for recipes POTATO <- function(x) { - rlang::enquo(x) + rlang::enquo(x) } -# ------------------------------------------------------------------------------ - goals <- - desirability2::desirability( - maximize(cor_pearson), - constrain(cor_spearman, low = 0.7, high = 1)) + desirability2::desirability( + maximize(cor_pearson), + constrain(cor_spearman, low = 0.7, high = 1) + ) + +mtcars_wts <- mtcars +.wts <- seq(0, 1, length.out = 32) +mtcars_wts$case_weights <- hardhat::importance_weights(.wts) diff --git a/tests/testthat/test-step-predictor-best.R b/tests/testthat/test-step-predictor-best.R index 2c307a8..d6fece7 100644 --- a/tests/testthat/test-step-predictor-best.R +++ b/tests/testthat/test-step-predictor-best.R @@ -17,7 +17,7 @@ test_that("step works", { filtro::fill_safe_value(transform = TRUE) exp <- cor_pearson_res@results |> dplyr::slice_max(score, prop = 1 / 2, with_ties = TRUE) |> - dplyr::pull(predictor) + dplyr::pull(predictor) expect_identical( sort(setdiff(names(mtcars), names(res_bake))), @@ -25,15 +25,41 @@ test_that("step works", { ) expect_identical( - sort(res_tidy$terms[res_tidy$.removed]), - sort(setdiff(names(mtcars)[-1], exp)) + sort(res_tidy$terms[res_tidy$.removed]), + sort(setdiff(names(mtcars)[-1], exp)) ) expect_named( - res_tidy, - c("terms", ".removed", "score", "id") + res_tidy, + c("terms", ".removed", "score", "id") ) }) +test_that("case weights work", { + unweighted <- recipe(mpg ~ ., data = mtcars) |> + step_predictor_best( + all_predictors(), + score = "cor_pearson", + prop_terms = 1 / 2 + ) |> + prep() + + weighted <- recipe(mpg ~ ., data = mtcars_wts) |> + step_predictor_best( + all_predictors(), + score = "cor_pearson", + prop_terms = 1 / 2 + ) |> + prep() + + expect_snapshot(print(weighted)) + + unweighted_res <- tidy(unweighted, number = 1) |> + dplyr::select(terms, unweighted = score) + weighted_res <- tidy(weighted, number = 1) |> + dplyr::select(terms, weighted = score) + both_res <- dplyr::full_join(unweighted_res, weighted_res, by = "terms") + expect_false(isTRUE(all.equal(both_res$weighted, both_res$unweighted))) +}) # Infrastructure --------------------------------------------------------------- test_that("bake method errors when needed non-standard role columns are missing", { @@ -109,7 +135,7 @@ test_that("bad args", { }) test_that("0 and 1 rows data work in bake method", { - skip("Emil: unsure if this is an intended error") + skip("Emil: unsure if this is an intended error") data <- mtcars rec <- recipe(~., data) |> step_predictor_best(all_numeric_predictors()) |> diff --git a/tests/testthat/test-step-predictor-desirability.R b/tests/testthat/test-step-predictor-desirability.R index acebbdc..fb1706b 100644 --- a/tests/testthat/test-step-predictor-desirability.R +++ b/tests/testthat/test-step-predictor-desirability.R @@ -1,146 +1,199 @@ test_that("step works", { - set.seed(1) - rec <- recipe(mpg ~., data = mtcars) |> - step_predictor_desirability( - all_predictors(), - score = goals, - prop_terms = 0.2 - ) - - prepped <- prep(rec) - - res_bake <- bake(prepped, mtcars) - res_tidy <- tidy(prepped, 1) - - score_res <- - list( - filtro::score_cor_pearson |> filtro::fit(mpg ~ ., data = mtcars), - filtro::score_cor_spearman |> filtro::fit(mpg ~ ., data = mtcars) - ) |> - filtro::fill_safe_values(transform = TRUE) |> - dplyr::mutate( - d_pearson = desirability2::d_max(cor_pearson, use_data = TRUE), - d_spearman = desirability2::d_box(cor_spearman, low = 0.7, high = 1.0), - d_overall = desirability2::d_overall(dplyr::across(dplyr::starts_with("d_"))) - ) - - retained <- score_res |> dplyr::slice_max(d_overall, prop = 0.2, with_ties = TRUE) - - expect_identical( - sort(setdiff(names(mtcars), names(res_bake))), - sort(setdiff(names(mtcars)[-1], retained$predictor)) - ) - - expect_identical( - sort(res_tidy$terms[res_tidy$.removed]), - sort(setdiff(names(mtcars)[-1], retained$predictor)) - ) - expect_named( - res_tidy, - c("terms", ".removed", "cor_pearson", "cor_spearman", ".d_max_cor_pearson", - ".d_box_cor_spearman", ".d_overall", "id") - ) + set.seed(1) + rec <- recipe(mpg ~ ., data = mtcars) |> + step_predictor_desirability( + all_predictors(), + score = goals, + prop_terms = 0.2 + ) + + prepped <- prep(rec) + + res_bake <- bake(prepped, mtcars) + res_tidy <- tidy(prepped, 1) + + score_res <- + list( + filtro::score_cor_pearson |> filtro::fit(mpg ~ ., data = mtcars), + filtro::score_cor_spearman |> filtro::fit(mpg ~ ., data = mtcars) + ) |> + filtro::fill_safe_values(transform = TRUE) |> + dplyr::mutate( + d_pearson = desirability2::d_max(cor_pearson, use_data = TRUE), + d_spearman = desirability2::d_box(cor_spearman, low = 0.7, high = 1.0), + d_overall = desirability2::d_overall(dplyr::across(dplyr::starts_with( + "d_" + ))) + ) + + retained <- score_res |> + dplyr::slice_max(d_overall, prop = 0.2, with_ties = TRUE) + + expect_identical( + sort(setdiff(names(mtcars), names(res_bake))), + sort(setdiff(names(mtcars)[-1], retained$predictor)) + ) + + expect_identical( + sort(res_tidy$terms[res_tidy$.removed]), + sort(setdiff(names(mtcars)[-1], retained$predictor)) + ) + expect_named( + res_tidy, + c( + "terms", + ".removed", + "cor_pearson", + "cor_spearman", + ".d_max_cor_pearson", + ".d_box_cor_spearman", + ".d_overall", + "id" + ) + ) }) test_that("EVERYTHING MUST GO", { - bad_goals <- - desirability2::desirability( - constrain(cor_spearman, low = 2, high = 3)) - - set.seed(1) - rec <- recipe(mpg ~., data = mtcars) |> - step_predictor_desirability( - all_predictors(), - score = bad_goals, - prop_terms = 0.2 - ) - - prepped <- prep(rec) - - res_bake <- bake(prepped, mtcars) - res_tidy <- tidy(prepped, 1) - - expect_identical( - names(res_bake), - "mpg" - ) - - expect_identical( - sort(res_tidy$terms[res_tidy$.removed]), - sort(names(mtcars)[-1]) - ) - expect_named( - res_tidy, - c("terms", ".removed", "cor_spearman", ".d_box_cor_spearman", ".d_overall", "id") - ) + bad_goals <- + desirability2::desirability( + constrain(cor_spearman, low = 2, high = 3) + ) + + set.seed(1) + rec <- recipe(mpg ~ ., data = mtcars) |> + step_predictor_desirability( + all_predictors(), + score = bad_goals, + prop_terms = 0.2 + ) + + prepped <- prep(rec) + + res_bake <- bake(prepped, mtcars) + res_tidy <- tidy(prepped, 1) + expect_identical( + names(res_bake), + "mpg" + ) + + expect_identical( + sort(res_tidy$terms[res_tidy$.removed]), + sort(names(mtcars)[-1]) + ) + expect_named( + res_tidy, + c( + "terms", + ".removed", + "cor_spearman", + ".d_box_cor_spearman", + ".d_overall", + "id" + ) + ) }) test_that("wrong score type", { - wrong_goals <- - desirability2::desirability( - maximize(xtab_pval_fisher)) - - set.seed(1) - rec <- recipe(mpg ~., data = mtcars) |> - step_predictor_desirability( - all_predictors(), - score = wrong_goals, - prop_terms = 0.2 - ) - - expect_snapshot_warning(prepped <- prep(rec)) - - res_bake <- bake(prepped, mtcars) - res_tidy <- tidy(prepped, 1) - - expect_identical( - sort(names(res_bake)), - sort(names(mtcars)) - ) - - expect_identical( - sort(res_tidy$terms[res_tidy$.removed]), - character(0) - ) - expect_named( - res_tidy, - c("terms", ".removed", "xtab_pval_fisher", "id") - ) + wrong_goals <- + desirability2::desirability( + maximize(xtab_pval_fisher) + ) + + set.seed(1) + rec <- recipe(mpg ~ ., data = mtcars) |> + step_predictor_desirability( + all_predictors(), + score = wrong_goals, + prop_terms = 0.2 + ) + + expect_snapshot_warning(prepped <- prep(rec)) + + res_bake <- bake(prepped, mtcars) + res_tidy <- tidy(prepped, 1) + + expect_identical( + sort(names(res_bake)), + sort(names(mtcars)) + ) + + expect_identical( + sort(res_tidy$terms[res_tidy$.removed]), + character(0) + ) + expect_named( + res_tidy, + c("terms", ".removed", "xtab_pval_fisher", "id") + ) }) test_that("keep everything", { - easy_goals <- - desirability2::desirability( - constrain(cor_spearman, low = -2, high = 3)) - - set.seed(1) - rec <- recipe(mpg ~., data = mtcars) |> - step_predictor_desirability( - all_predictors(), - score = easy_goals, - prop_terms = 0.2 - ) - - prepped <- prep(rec) - - res_bake <- bake(prepped, mtcars) - res_tidy <- tidy(prepped, 1) - - expect_identical( - sort(names(res_bake)), - sort(names(mtcars)) - ) - - expect_identical( - sort(res_tidy$terms[res_tidy$.removed]), - character(0) - ) - expect_named( - res_tidy, - c("terms", ".removed", "cor_spearman", ".d_box_cor_spearman", ".d_overall", "id") - ) + easy_goals <- + desirability2::desirability( + constrain(cor_spearman, low = -2, high = 3) + ) + + set.seed(1) + rec <- recipe(mpg ~ ., data = mtcars) |> + step_predictor_desirability( + all_predictors(), + score = easy_goals, + prop_terms = 0.2 + ) + + prepped <- prep(rec) + + res_bake <- bake(prepped, mtcars) + res_tidy <- tidy(prepped, 1) + + expect_identical( + sort(names(res_bake)), + sort(names(mtcars)) + ) + + expect_identical( + sort(res_tidy$terms[res_tidy$.removed]), + character(0) + ) + expect_named( + res_tidy, + c( + "terms", + ".removed", + "cor_spearman", + ".d_box_cor_spearman", + ".d_overall", + "id" + ) + ) +}) + +test_that("case weights work", { + unweighted <- recipe(mpg ~ ., data = mtcars) |> + step_predictor_desirability( + all_predictors(), + score = goals, + prop_terms = 0.2 + ) |> + prep() + + weighted <- recipe(mpg ~ ., data = mtcars_wts) |> + step_predictor_desirability( + all_predictors(), + score = goals, + prop_terms = 0.2 + ) |> + prep() + + expect_snapshot(print(weighted)) + unweighted_res <- tidy(unweighted, number = 1) |> + dplyr::select(terms, unweighted = cor_spearman) + weighted_res <- tidy(weighted, number = 1) |> + dplyr::select(terms, weighted = cor_spearman) + both_res <- dplyr::full_join(unweighted_res, weighted_res, by = "terms") + expect_false(isTRUE(all.equal(both_res$weighted, both_res$unweighted))) }) # Infrastructure --------------------------------------------------------------- @@ -192,34 +245,40 @@ test_that("empty selection tidy method works", { test_that("printing", { set.seed(1) - rec <- recipe(mpg ~., data = mtcars) |> - step_predictor_desirability( - all_predictors(), - score = goals - ) + rec <- recipe(mpg ~ ., data = mtcars) |> + step_predictor_desirability( + all_predictors(), + score = goals + ) expect_snapshot(print(rec)) expect_snapshot(prep(rec)) }) test_that("tunable is setup to work with extract_parameter_set_dials", { - skip_if_not_installed("dials", minimum_version = "1.4.1.9000") + skip_if_not_installed("dials", minimum_version = "1.4.1.9000") - rec <- recipe( ~ ., data = mtcars) |> - step_predictor_desirability(all_predictors(), - score = goals, - prop_terms = hardhat::tune()) + rec <- recipe(~., data = mtcars) |> + step_predictor_desirability( + all_predictors(), + score = goals, + prop_terms = hardhat::tune() + ) - params <- extract_parameter_set_dials(rec) + params <- extract_parameter_set_dials(rec) - expect_s3_class(params, "parameters") - expect_identical(nrow(params), 1L) + expect_s3_class(params, "parameters") + expect_identical(nrow(params), 1L) }) test_that("bad args", { expect_snapshot( recipe(mpg ~ ., mtcars) |> - step_predictor_desirability(all_predictors(), score = goals, prop_terms = 2) |> + step_predictor_desirability( + all_predictors(), + score = goals, + prop_terms = 2 + ) |> prep(), error = TRUE ) @@ -227,8 +286,8 @@ test_that("bad args", { test_that("0 and 1 rows data work in bake method", { data <- mtcars - rec <- recipe(mpg ~., data) |> - step_predictor_desirability(all_numeric_predictors(), score = goals,) |> + rec <- recipe(mpg ~ ., data) |> + step_predictor_desirability(all_numeric_predictors(), score = goals, ) |> prep() expect_identical( diff --git a/tests/testthat/test-step-predictor-retain.R b/tests/testthat/test-step-predictor-retain.R index 8a6982a..f52b3aa 100644 --- a/tests/testthat/test-step-predictor-retain.R +++ b/tests/testthat/test-step-predictor-retain.R @@ -15,11 +15,11 @@ test_that("step works", { filtro::fit(mpg ~ ., data = mtcars) exp <- filtro::fill_safe_values(list( - cor_pearson_res, - cor_spearman_res + cor_pearson_res, + cor_spearman_res )) |> - dplyr::filter(abs(cor_pearson) < 0.75 | abs(cor_spearman) < 0.6) |> - dplyr::pull(predictor) + dplyr::filter(abs(cor_pearson) < 0.75 | abs(cor_spearman) < 0.6) |> + dplyr::pull(predictor) expect_identical( sort(setdiff(names(mtcars), names(res_bake))), @@ -75,7 +75,7 @@ test_that("allows for one score", { exp <- cor_pearson_res@results |> dplyr::filter(abs(score) < 0.7) |> - dplyr::pull(predictor) + dplyr::pull(predictor) expect_identical( sort(setdiff(names(mtcars), names(res_bake))), @@ -87,6 +87,31 @@ test_that("allows for one score", { ) }) +test_that("case weights work", { + unweighted <- recipe(mpg ~ ., data = mtcars) |> + step_predictor_retain( + all_predictors(), + score = abs(cor_pearson) >= 0.75 & abs(cor_spearman) >= 0.6 + ) |> + prep() + + weighted <- recipe(mpg ~ ., data = mtcars_wts) |> + step_predictor_retain( + all_predictors(), + score = abs(cor_pearson) >= 0.75 & abs(cor_spearman) >= 0.6 + ) |> + prep() + + expect_snapshot(print(weighted)) + + unweighted_res <- tidy(unweighted, number = 1) |> + dplyr::select(terms, unweighted = cor_spearman) + weighted_res <- tidy(weighted, number = 1) |> + dplyr::select(terms, weighted = cor_spearman) + both_res <- dplyr::full_join(unweighted_res, weighted_res, by = "terms") + expect_false(isTRUE(all.equal(both_res$weighted, both_res$unweighted))) +}) + # Infrastructure --------------------------------------------------------------- test_that("bake method errors when needed non-standard role columns are missing", { From 48601b9f89f56c1de25fa9d410767d4a7a95a10f Mon Sep 17 00:00:00 2001 From: topepo Date: Thu, 4 Sep 2025 06:57:00 -0400 Subject: [PATCH 55/65] try to prevent errors on older versions of R --- tests/testthat/helper-objects.R | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/testthat/helper-objects.R b/tests/testthat/helper-objects.R index c088160..e1dfdac 100644 --- a/tests/testthat/helper-objects.R +++ b/tests/testthat/helper-objects.R @@ -6,6 +6,7 @@ suppressPackageStartupMessages(library(parsnip)) # imported by tune suppressPackageStartupMessages(library(yardstick)) # imported by tune suppressPackageStartupMessages(library(filtro)) suppressPackageStartupMessages(library(desirability2)) +suppressPackageStartupMessages(library(S7)) # ------------------------------------------------------------------------------ # regression examples From 32b9baec31549b9bf2b5c262c2261d7615cec6bc Mon Sep 17 00:00:00 2001 From: topepo Date: Thu, 4 Sep 2025 07:37:45 -0400 Subject: [PATCH 56/65] more edge case tests --- R/step-predictor-best.R | 10 ++-- R/step_predictor_desirability.R | 38 +++++++------- tests/testthat/test-step-predictor-best.R | 55 +++++++++++++++++++++ tests/testthat/test-step-predictor-retain.R | 28 +++++++++++ 4 files changed, 106 insertions(+), 25 deletions(-) diff --git a/R/step-predictor-best.R b/R/step-predictor-best.R index 5c60a55..aded810 100644 --- a/R/step-predictor-best.R +++ b/R/step-predictor-best.R @@ -145,9 +145,11 @@ step_predictor_best_new <- prep.step_predictor_best <- function(x, training, info = NULL, ...) { col_names <- recipes_eval_select(x$terms, training, info) check_type(training[, col_names], types = c("double", "integer", "factor")) + + bottom <- ifelse(x$update_prop, .Machine$double.eps, 0.0) check_number_decimal( x$prop_terms, - min = .Machine$double.eps, + min = bottom, max = 1, arg = "prop_terms" ) @@ -235,18 +237,12 @@ calculate_predictor_best <- function( if (score_res@direction == "maximize") { keepers <- score_df |> dplyr::slice_max(score, prop = prop_terms, with_ties = TRUE) - fallback_col <- score_df$predictor[which.max(score_df$score)[1]] } else { keepers <- score_df |> dplyr::slice_min(score, prop = prop_terms, with_ties = TRUE) - fallback_col <- score_df$predictor[which.min(score_df$score)[1]] } keepers <- keepers |> dplyr::pull(predictor) - if (length(keepers) == 0) { - keepers <- score_df$predictors[score_df$predictors != fallback_col] - } - removals <- setdiff(score_df$predictor, keepers) raw_res <- score_res@results |> dplyr::select(outcome, predictor, score) diff --git a/R/step_predictor_desirability.R b/R/step_predictor_desirability.R index 37e72f3..22f1c1c 100644 --- a/R/step_predictor_desirability.R +++ b/R/step_predictor_desirability.R @@ -271,9 +271,11 @@ prep.step_predictor_desirability <- function(x, training, info = NULL, ...) { rlang::check_installed("desirability2") col_names <- recipes_eval_select(x$terms, training, info) check_type(training[, col_names], types = c("double", "integer", "factor")) + + bottom <- ifelse(x$update_prop, .Machine$double.eps, 0.0) check_number_decimal( x$prop_terms, - min = .Machine$double.eps, + min = bottom, max = 1, arg = "prop_terms" ) @@ -323,37 +325,37 @@ prep.step_predictor_desirability <- function(x, training, info = NULL, ...) { raw_scores <- filtro::bind_scores(score_objs) score_df <- - score_objs |> - filtro::fill_safe_values(transform = TRUE) + score_objs |> + filtro::fill_safe_values(transform = TRUE) if (all_scores_missing(raw_scores)) { - cli::cli_warn("All score computations failed; skipping feature selection.") - keep_list <- score_df + cli::cli_warn("All score computations failed; skipping feature selection.") + keep_list <- score_df } else { - # make desirability expression/eval quosure - score_df <- desirability2::make_desirability_cols(x$score, score_df) + # make desirability expression/eval quosure + score_df <- desirability2::make_desirability_cols(x$score, score_df) - bad_news <- purrr::map_lgl(score_df$.d_overall, ~ identical(.x, 0.0)) - if (all(bad_news)) { - keep_list <- score_df[0,] - } else { - keep_list <- - score_df |> - dplyr::slice_max(.d_overall, prop = x$prop_terms, with_ties = TRUE) - } + bad_news <- purrr::map_lgl(score_df$.d_overall, ~ identical(.x, 0.0)) + if (all(bad_news)) { + keep_list <- score_df[0, ] + } else { + keep_list <- + score_df |> + dplyr::slice_max(.d_overall, prop = x$prop_terms, with_ties = TRUE) + } } rm_list <- dplyr::anti_join(score_df, keep_list[, "predictor"], by = "predictor") |> - dplyr::pull(predictor) + dplyr::pull(predictor) score_df$.removed <- score_df$predictor %in% rm_list score_df <- score_df |> - dplyr::select(outcome, predictor, .removed, dplyr::starts_with(".d_")) |> + dplyr::select(outcome, predictor, .removed, dplyr::starts_with(".d_")) |> dplyr::full_join(raw_scores, by = c("outcome", "predictor")) |> dplyr::relocate(.removed, .after = "predictor") |> - dplyr::relocate(dplyr::starts_with(".d_"), .after = dplyr::everything()) + dplyr::relocate(dplyr::starts_with(".d_"), .after = dplyr::everything()) step_predictor_desirability_new( terms = x$terms, diff --git a/tests/testthat/test-step-predictor-best.R b/tests/testthat/test-step-predictor-best.R index d6fece7..83c33f3 100644 --- a/tests/testthat/test-step-predictor-best.R +++ b/tests/testthat/test-step-predictor-best.R @@ -34,6 +34,61 @@ test_that("step works", { ) }) +test_that("EVERYTHING MUST GO", { + rec <- recipe(mpg ~ ., data = mtcars) |> + step_predictor_best( + all_predictors(), + score = "cor_pearson", + prop_terms = 0.0, + update_prop = FALSE + ) + + prepped <- prep(rec) + res_bake <- bake(prepped, mtcars) + res_tidy <- tidy(prepped, 1) + + expect_identical( + sort(setdiff(names(mtcars), names(res_bake))), + sort(names(mtcars)[-1]) + ) + expect_true( + all(res_tidy$.removed) + ) + expect_named( + res_tidy, + c("terms", ".removed", "score", "id") + ) +}) + +test_that("keep everything", { + set.seed(1) + rec <- recipe(mpg ~ ., data = mtcars) |> + step_predictor_best( + all_predictors(), + score = "cor_pearson", + prop_terms = 1 + ) + + prepped <- prep(rec) + + res_bake <- bake(prepped, mtcars) + res_tidy <- tidy(prepped, 1) + + expect_identical( + sort(names(res_bake)), + sort(names(mtcars)) + ) + + expect_identical( + sort(res_tidy$terms[res_tidy$.removed]), + character(0) + ) + expect_named( + res_tidy, + c("terms", ".removed", "score", "id") + ) +}) + test_that("case weights work", { unweighted <- recipe(mpg ~ ., data = mtcars) |> step_predictor_best( diff --git a/tests/testthat/test-step-predictor-retain.R b/tests/testthat/test-step-predictor-retain.R index f52b3aa..f659265 100644 --- a/tests/testthat/test-step-predictor-retain.R +++ b/tests/testthat/test-step-predictor-retain.R @@ -59,6 +59,34 @@ test_that("EVERYTHING MUST GO", { ) }) +test_that("keep everything", { + set.seed(1) + rec <- recipe(mpg ~ ., data = mtcars) |> + step_predictor_retain( + all_predictors(), + score = abs(cor_pearson) >= -1 & abs(cor_spearman) >= -1 + ) + + prepped <- prep(rec) + + res_bake <- bake(prepped, mtcars) + res_tidy <- tidy(prepped, 1) + + expect_identical( + sort(names(res_bake)), + sort(names(mtcars)) + ) + + expect_identical( + sort(res_tidy$terms[res_tidy$.removed]), + character(0) + ) + expect_named( + res_tidy, + c("terms", ".removed", "cor_pearson", "cor_spearman", "id") + ) +}) + test_that("allows for one score", { rec <- recipe(mpg ~ ., data = mtcars) |> step_predictor_retain( From b1e9514c57c04c559eb5fd2ebee0fe16f2c31761 Mon Sep 17 00:00:00 2001 From: topepo Date: Thu, 4 Sep 2025 08:16:19 -0400 Subject: [PATCH 57/65] add some content to readme --- DESCRIPTION | 1 + README.Rmd | 54 ++++++++++++++++++++++++++- README.md | 103 +++++++++++++++++++++++++++++++++++++++++++++++++++- 3 files changed, 155 insertions(+), 3 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 3a5472a..2b2e1c2 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -43,6 +43,7 @@ Suggests: mirai, modeldata, parsnip, + ranger, spelling, survival, testthat (>= 3.0.0), diff --git a/README.Rmd b/README.Rmd index 268f6d7..670a7ea 100644 --- a/README.Rmd +++ b/README.Rmd @@ -26,7 +26,14 @@ The important package has a succinct interface for obtaining estimates of predic - Importance can be calculated for either the original columns or at the level of any derived model terms created during feature engineering. - The computations that loop across permutation iterations and predictors columns are easily parallelized. - The results are returned in a tidy format. - + +There are also recipe steps for supervised feature selection: + +- `step_predictors_retain()` can filter the predictors using a single conditional statement (e.g., absolute correlation with the outcome > 0.75, etc). +- `step_predictors_best()` can retain the most important predictors for the outcome using a single scoring function. +- `step_predictors_desirability()` retains the most important predictors for the outcome using multiple scoring functions, blended using desirability functions. + +The latter two steps can be tuned over the proportion of predictors to be retained. ## Installation @@ -42,7 +49,7 @@ pak::pak("tidymodels/important") The main reason for making important is censored regression models. tidymodels released tools for fitting and qualifying models that have censored outcomes. This included some dynamic performance metrics that were evaluated at different time points. This was a substantial change for us, and it would have been even more challenging to add to other packages. -## Example +## Variable importance Example Let's look at an analysis that models [food delivery times](https://aml4td.org/chapters/whole-game.html#sec-delivery-times). The outcome is the time between an order being placed and the delivery (all data are complete - there is no censoring). We model this in terms of the order day/time, the distance to the restaurant, and which items are contained in the order. Exploratory data analysis shows several nonlinear trends in the data and some interactions between these trends. @@ -152,6 +159,49 @@ lm_orig_imp |> autoplot(lm_orig_imp) ``` +## Supervised Feature Selection Example + +Using the same dataset, let's illustrate the most common tool for filtering predictors: using random forest importance scores. + +important can use any of the "scoring functions" from the [filtro](https://filtro.tidymodels.org/) package. You can supply one, and the proportion of the predictors to retain: + +```{r} +#| label: select-top +set.seed(491) +selection_rec <- + recipe(time_to_delivery ~ ., data = delivery_train) |> + step_predictor_best(all_predictors(), score = "imp_rf", prop_terms = 1/4) |> + step_dummy(all_factor_predictors()) |> + step_zv(all_predictors()) |> + step_spline_natural(any_of(c("hour", "distance")), deg_free = 10) |> + step_interact(~ starts_with("hour_"):starts_with("day_")) |> + prep() +selection_rec +``` + +A list of possible scores is contained in the help page for the recipe steps. + +Note that we changed selectors in `step_spline_natural()` to use `any_of()` instead of specific names. Any step downstream of any filtering steps should be generalized so that there is no failure if the columns were removed. Using `any_of()` selects these two columns _if they still remain in the data_. + +Which were removed? + +```{r} +#| label: tidy-filter +selection_res <- + tidy(selection_rec, number = 1) |> + arrange(desc(score)) + +selection_res + +mean(selection_res$removed) +``` + +This example shows the basic usage of the recipe. In practice, we would probably do things differently: + + - This step would be included in a workflow so that it is coupled to a model. + - It would be a good idea to optimize how much selection is done by setting `prop_terms = tune()` in the step and using one of the tuning functions to find a good proportion. + +*Inappropriate* use of these selection steps occurs when it is used before the data are split or outside of a resampling step. ## Code of Conduct diff --git a/README.md b/README.md index 18f7948..b132bbf 100644 --- a/README.md +++ b/README.md @@ -21,6 +21,20 @@ features: predictors columns are easily parallelized. - The results are returned in a tidy format. +There are also recipe steps for supervised feature selection: + +- `step_predictors_retain()` can filter the predictors using a single + conditional statement (e.g., absolute correlation with the outcome \> + 0.75, etc). +- `step_predictors_best()` can retain the most important predictors for + the outcome using a single scoring function. +- `step_predictors_desirability()` retains the most important predictors + for the outcome using multiple scoring functions, blended using + desirability functions. + +The latter two steps can be tuned over the proportion of predictors to +be retained. + ## Installation You can install the development version of important from @@ -41,7 +55,7 @@ were evaluated at different time points. This was a substantial change for us, and it would have been even more challenging to add to other packages. -## Example +## Variable importance Example Let’s look at an analysis that models [food delivery times](https://aml4td.org/chapters/whole-game.html#sec-delivery-times). @@ -183,6 +197,93 @@ autoplot(lm_orig_imp) +## Supervised Feature Selection Example + +Using the same dataset, let’s illustrate the most common tool for +filtering predictors: using random forest importance scores. + +important can use any of the “scoring functions” from the +[filtro](https://filtro.tidymodels.org/) package. You can supply one, +and the proportion of the predictors to retain: + +``` r +set.seed(491) +selection_rec <- + recipe(time_to_delivery ~ ., data = delivery_train) |> + step_predictor_best(all_predictors(), score = "imp_rf", prop_terms = 1/4) |> + step_dummy(all_factor_predictors()) |> + step_zv(all_predictors()) |> + step_spline_natural(any_of(c("hour", "distance")), deg_free = 10) |> + step_interact(~ starts_with("hour_"):starts_with("day_")) |> + prep() +selection_rec +#> +#> ── Recipe ────────────────────────────────────────────────────────────────────── +#> +#> ── Inputs +#> Number of variables by role +#> outcome: 1 +#> predictor: 30 +#> +#> ── Training information +#> Training data contained 6004 data points and no incomplete rows. +#> +#> ── Operations +#> • Feature selection via `imp_rf` on: item_03 item_04, ... | Trained +#> • Dummy variables from: day | Trained +#> • Zero variance filter removed: | Trained +#> • Natural spline expansion: hour distance | Trained +#> • Interactions with: hour_01:day_Tue hour_01:day_Wed, ... | Trained +``` + +A list of possible scores is contained in the help page for the recipe +steps. + +Note that we changed selectors in `step_spline_natural()` to use +`any_of()` instead of specific names. Any step downstream of any +filtering steps should be generalized so that there is no failure if the +columns were removed. Using `any_of()` selects these two columns *if +they still remain in the data*. + +Which were removed? + +``` r +selection_res <- + tidy(selection_rec, number = 1) |> + arrange(desc(score)) + +selection_res +#> # A tibble: 30 × 4 +#> terms removed score id +#> +#> 1 hour FALSE 48.5 predictor_best_rCIMa +#> 2 day FALSE 13.5 predictor_best_rCIMa +#> 3 distance FALSE 13.3 predictor_best_rCIMa +#> 4 item_10 FALSE 1.18 predictor_best_rCIMa +#> 5 item_01 FALSE 1.01 predictor_best_rCIMa +#> 6 item_24 FALSE 0.160 predictor_best_rCIMa +#> 7 item_02 FALSE 0.0676 predictor_best_rCIMa +#> 8 item_26 TRUE 0.0666 predictor_best_rCIMa +#> 9 item_03 TRUE 0.0593 predictor_best_rCIMa +#> 10 item_22 TRUE 0.0565 predictor_best_rCIMa +#> # ℹ 20 more rows + +mean(selection_res$removed) +#> [1] 0.7666667 +``` + +This example shows the basic usage of the recipe. In practice, we would +probably do things differently: + +- This step would be included in a workflow so that it is coupled to a + model. +- It would be a good idea to optimize how much selection is done by + setting `prop_terms = tune()` in the step and using one of the tuning + functions to find a good proportion. + +*Inappropriate* use of these selection steps occurs when it is used +before the data are split or outside of a resampling step. + ## Code of Conduct Please note that the important project is released with a [Contributor From 6e1d1ab61f6360ddb79c4cd854667704639743f0 Mon Sep 17 00:00:00 2001 From: topepo Date: Thu, 4 Sep 2025 08:16:56 -0400 Subject: [PATCH 58/65] . removed -> removed --- R/important-package.R | 2 +- R/recipe_utils.R | 22 +++++++------- R/step-predictor-best.R | 21 ++++++++++---- R/step-predictor-retain.R | 21 ++++++++++---- R/step_predictor_desirability.R | 21 +++++--------- man/step_predictor_best.Rd | 29 ++++++++++++++----- man/step_predictor_desirability.Rd | 15 ++++------ man/step_predictor_retain.Rd | 26 ++++++++++++----- tests/testthat/_snaps/step-predictor-best.md | 20 +++++++++++++ .../_snaps/step-predictor-desirability.md | 22 ++++++++++++++ .../testthat/_snaps/step-predictor-retain.md | 21 ++++++++++++++ tests/testthat/test-step-predictor-best.R | 12 ++++---- .../test-step-predictor-desirability.R | 16 +++++----- tests/testthat/test-step-predictor-retain.R | 16 +++++----- 14 files changed, 180 insertions(+), 84 deletions(-) diff --git a/R/important-package.R b/R/important-package.R index e5a78e2..3f465c9 100644 --- a/R/important-package.R +++ b/R/important-package.R @@ -35,7 +35,7 @@ utils::globalVariables( ".d_overall", "outcome", "score_objs", - ".removed", + "removed", "terms" ) ) diff --git a/R/recipe_utils.R b/R/recipe_utils.R index 21001ae..4b5ff7c 100644 --- a/R/recipe_utils.R +++ b/R/recipe_utils.R @@ -77,14 +77,14 @@ update_prop <- function(num_cols, prop) { tidy_filtro_rec <- function(x, ...) { if (is_trained(x)) { - if (is.null(x$results)) { - res <- tibble::tibble(terms = character(), id = character()) - } else { - res <- - x$results |> - dplyr::select(-outcome, terms = predictor) |> - dplyr::relocate(dplyr::any_of(".removed"), .after = c(terms)) - } + if (is.null(x$results)) { + res <- tibble::tibble(terms = character(), id = character()) + } else { + res <- + x$results |> + dplyr::select(-outcome, terms = predictor) |> + dplyr::relocate(dplyr::any_of("removed"), .after = c(terms)) + } } else { term_names <- sel2char(x$terms) res <- tibble::tibble(terms = term_names) @@ -94,7 +94,7 @@ tidy_filtro_rec <- function(x, ...) { } all_scores_missing <- function(x) { - scores <- dplyr::select(x, -outcome, -predictor) - all_missing <- purrr::map_lgl(scores, ~ all(is.na(.x))) - all(all_missing) + scores <- dplyr::select(x, -outcome, -predictor) + all_missing <- purrr::map_lgl(scores, ~ all(is.na(.x))) + all(all_missing) } diff --git a/R/step-predictor-best.R b/R/step-predictor-best.R index aded810..ef15fb1 100644 --- a/R/step-predictor-best.R +++ b/R/step-predictor-best.R @@ -57,15 +57,26 @@ #' weights. A recipe will then interpret that class to be a case weight (and no #' other role). A full example is below. #' -#' # Tidying +#' ## Tidy method #' -#' When you [`tidy()`][recipes::tidy.recipe] this step, a tibble::tibble is -#' returned with columns `terms` and `id`: +#' For a trained recipe, the `tidy()` method will return a tibble with columns +#' `terms` (the predictor names), `id`, and columns for the estimated scores. +#' The score columns are the raw values, before being filled with "safe values" +#' or transformed. +#' +#' There is an additional local column called `removed` that notes whether the +#' predictor failed the filter and was removed after this step is executed. +#' +#' @return An updated version of `recipe` with the new step added to the +#' sequence of any existing operations. When you +#' [`tidy()`][recipes::tidy.recipe] this step, a tibble::tibble is returned +#' with columns `terms` and `id`: #' #' \describe{ #' \item{terms}{character, the selectors or variables selected to be removed} #' \item{id}{character, id of this step} #' } +#' Once trained, additional columns are included (see Details section). #' @examples #' library(recipes) #' @@ -182,7 +193,7 @@ prep.step_predictor_best <- function(x, training, info = NULL, ...) { outcome = character(0), predictor = character(0), score = double(0), - .removed = logical(0) + removed = logical(0) ), removals = character(0) ) @@ -246,7 +257,7 @@ calculate_predictor_best <- function( removals <- setdiff(score_df$predictor, keepers) raw_res <- score_res@results |> dplyr::select(outcome, predictor, score) - raw_res$.removed <- raw_res$predictor %in% removals + raw_res$removed <- raw_res$predictor %in% removals list( raw = raw_res, diff --git a/R/step-predictor-retain.R b/R/step-predictor-retain.R index c8b6bb3..878b832 100644 --- a/R/step-predictor-retain.R +++ b/R/step-predictor-retain.R @@ -56,15 +56,24 @@ #' #' ## Tidy method #' -#' When you [`tidy()`][recipes::tidy.recipe] this step, a tibble::tibble is -#' returned with columns `terms` and `id`: +#' For a trained recipe, the `tidy()` method will return a tibble with columns +#' `terms` (the predictor names), `id`, and columns for the estimated scores. +#' The score columns are the raw values, before being filled with "safe values" +#' or transformed. +#' +#' There is an additional local column called `removed` that notes whether the +#' predictor failed the filter and was removed after this step is executed. +#' +#' @return An updated version of `recipe` with the new step added to the +#' sequence of any existing operations. When you +#' [`tidy()`][recipes::tidy.recipe] this step, a tibble::tibble is returned +#' with columns `terms` and `id`: #' #' \describe{ #' \item{terms}{character, the selectors or variables selected to be removed} #' \item{id}{character, id of this step} #' } -#' -#' The underlying operation does not allow for case weights. +#' Once trained, additional columns are included (see Details section). #' #' @examples #' library(recipes) @@ -160,7 +169,7 @@ prep.step_predictor_retain <- function(x, training, info = NULL, ...) { raw = tibble::tibble( outcome = character(0), predictor = character(0), - .removed = logical(0) + removed = logical(0) ), removals = character(0) ) @@ -231,7 +240,7 @@ calculate_predictor_retain <- function( removals <- setdiff(score_df$predictor, keepers) raw_res <- filtro::bind_scores(score_res) - raw_res$.removed <- raw_res$predictor %in% removals + raw_res$removed <- raw_res$predictor %in% removals list( raw = raw_res, diff --git a/R/step_predictor_desirability.R b/R/step_predictor_desirability.R index 22f1c1c..08e2598 100644 --- a/R/step_predictor_desirability.R +++ b/R/step_predictor_desirability.R @@ -80,20 +80,15 @@ #' #' For a trained recipe, the `tidy()` method will return a tibble with columns #' `terms` (the predictor names), `id`, columns for the estimated scores, and -#' the desirability results. -#' -#' There are two versions of the score results. The columns prefixed with -#' `"score_1"` have been altered with their transformation (see the Details page -#' for each score) and have had missing values filled with "safe" values to -#' prevent them from being missing. The other set of scores lack the prefix and -#' are the original, raw score values. +#' the desirability results. The score columns are the raw values, before being +#' filled with "safe values" or transformed. #' #' The desirability columns will have the same name as the scores with an #' additional prefix of `.d_`. The overall desirability column is called #' `.d_overall`. #' -#' There is an additional local column called `retain` that notes whether the -#' predictor passed the filter and is retained after this step is executed. +#' There is an additional local column called `removed` that notes whether the +#' predictor failed the filter and was removed after this step is executed. #' #' @return An updated version of `recipe` with the new step added to the #' sequence of any existing operations. When you @@ -148,7 +143,7 @@ #' #' # Use the tidy() method to get the results: #' predictor_scores <- tidy(prepped, number = 1) -#' mean(predictor_scores$.removed) +#' mean(predictor_scores$removed) #' predictor_scores #' #' # -------------------------------------------------------------------------- @@ -349,12 +344,12 @@ prep.step_predictor_desirability <- function(x, training, info = NULL, ...) { dplyr::anti_join(score_df, keep_list[, "predictor"], by = "predictor") |> dplyr::pull(predictor) - score_df$.removed <- score_df$predictor %in% rm_list + score_df$removed <- score_df$predictor %in% rm_list score_df <- score_df |> - dplyr::select(outcome, predictor, .removed, dplyr::starts_with(".d_")) |> + dplyr::select(outcome, predictor, removed, dplyr::starts_with(".d_")) |> dplyr::full_join(raw_scores, by = c("outcome", "predictor")) |> - dplyr::relocate(.removed, .after = "predictor") |> + dplyr::relocate(removed, .after = "predictor") |> dplyr::relocate(dplyr::starts_with(".d_"), .after = dplyr::everything()) step_predictor_desirability_new( diff --git a/man/step_predictor_best.Rd b/man/step_predictor_best.Rd index 858ce9a..58cfa00 100644 --- a/man/step_predictor_best.Rd +++ b/man/step_predictor_best.Rd @@ -58,6 +58,18 @@ may affect the computations for subsequent operations.} \item{id}{A character string that is unique to this step to identify it.} } +\value{ +An updated version of \code{recipe} with the new step added to the +sequence of any existing operations. When you +\code{\link[recipes:tidy.recipe]{tidy()}} this step, a tibble::tibble is returned +with columns \code{terms} and \code{id}: + +\describe{ +\item{terms}{character, the selectors or variables selected to be removed} +\item{id}{character, id of this step} +} +Once trained, additional columns are included (see Details section). +} \description{ \code{step_predictor_best()} creates a \emph{specification} of a recipe step that uses a single scoring function to measure how much each predictor is related to @@ -124,17 +136,18 @@ functions such as \code{\link[hardhat:importance_weights]{hardhat::importance_we weights. A recipe will then interpret that class to be a case weight (and no other role). A full example is below. } -} -\section{Tidying}{ -When you \code{\link[recipes:tidy.recipe]{tidy()}} this step, a tibble::tibble is -returned with columns \code{terms} and \code{id}: -\describe{ -\item{terms}{character, the selectors or variables selected to be removed} -\item{id}{character, id of this step} +\subsection{Tidy method}{ + +For a trained recipe, the \code{tidy()} method will return a tibble with columns +\code{terms} (the predictor names), \code{id}, and columns for the estimated scores. +The score columns are the raw values, before being filled with "safe values" +or transformed. + +There is an additional local column called \code{removed} that notes whether the +predictor failed the filter and was removed after this step is executed. } } - \examples{ library(recipes) diff --git a/man/step_predictor_desirability.Rd b/man/step_predictor_desirability.Rd index 0f66774..7dc37cb 100644 --- a/man/step_predictor_desirability.Rd +++ b/man/step_predictor_desirability.Rd @@ -151,20 +151,15 @@ other role). A full example is below. For a trained recipe, the \code{tidy()} method will return a tibble with columns \code{terms} (the predictor names), \code{id}, columns for the estimated scores, and -the desirability results. - -There are two versions of the score results. The columns prefixed with -\code{"score_1"} have been altered with their transformation (see the Details page -for each score) and have had missing values filled with "safe" values to -prevent them from being missing. The other set of scores lack the prefix and -are the original, raw score values. +the desirability results. The score columns are the raw values, before being +filled with "safe values" or transformed. The desirability columns will have the same name as the scores with an additional prefix of \code{.d_}. The overall desirability column is called \code{.d_overall}. -There is an additional local column called \code{retain} that notes whether the -predictor passed the filter and is retained after this step is executed. +There is an additional local column called \code{removed} that notes whether the +predictor failed the filter and was removed after this step is executed. } } \examples{ @@ -204,7 +199,7 @@ if (rlang::is_installed("modeldata")) { # Use the tidy() method to get the results: predictor_scores <- tidy(prepped, number = 1) - mean(predictor_scores$.removed) + mean(predictor_scores$removed) predictor_scores # -------------------------------------------------------------------------- diff --git a/man/step_predictor_retain.Rd b/man/step_predictor_retain.Rd index 23d6c88..ce22b89 100644 --- a/man/step_predictor_retain.Rd +++ b/man/step_predictor_retain.Rd @@ -50,6 +50,18 @@ may affect the computations for subsequent operations.} \item{id}{A character string that is unique to this step to identify it.} } +\value{ +An updated version of \code{recipe} with the new step added to the +sequence of any existing operations. When you +\code{\link[recipes:tidy.recipe]{tidy()}} this step, a tibble::tibble is returned +with columns \code{terms} and \code{id}: + +\describe{ +\item{terms}{character, the selectors or variables selected to be removed} +\item{id}{character, id of this step} +} +Once trained, additional columns are included (see Details section). +} \description{ \code{step_predictor_retain()} creates a \emph{specification} of a recipe step that uses a logical statement that includes one or more scoring functions to @@ -115,15 +127,13 @@ other role). A full example is below. \subsection{Tidy method}{ -When you \code{\link[recipes:tidy.recipe]{tidy()}} this step, a tibble::tibble is -returned with columns \code{terms} and \code{id}: - -\describe{ -\item{terms}{character, the selectors or variables selected to be removed} -\item{id}{character, id of this step} -} +For a trained recipe, the \code{tidy()} method will return a tibble with columns +\code{terms} (the predictor names), \code{id}, and columns for the estimated scores. +The score columns are the raw values, before being filled with "safe values" +or transformed. -The underlying operation does not allow for case weights. +There is an additional local column called \code{removed} that notes whether the +predictor failed the filter and was removed after this step is executed. } } \examples{ diff --git a/tests/testthat/_snaps/step-predictor-best.md b/tests/testthat/_snaps/step-predictor-best.md index d7fbf6b..e942898 100644 --- a/tests/testthat/_snaps/step-predictor-best.md +++ b/tests/testthat/_snaps/step-predictor-best.md @@ -1,3 +1,23 @@ +# case weights work + + Code + print(weighted) + Message + + -- Recipe ---------------------------------------------------------------------- + + -- Inputs + Number of variables by role + outcome: 1 + predictor: 10 + case_weights: 1 + + -- Training information + Training data contained 32 data points and no incomplete rows. + + -- Operations + * Feature selection via `cor_pearson` on: drat qsec, ... | Trained, weighted + # empty printing Code diff --git a/tests/testthat/_snaps/step-predictor-desirability.md b/tests/testthat/_snaps/step-predictor-desirability.md index c2f96b9..340b840 100644 --- a/tests/testthat/_snaps/step-predictor-desirability.md +++ b/tests/testthat/_snaps/step-predictor-desirability.md @@ -2,6 +2,28 @@ All score computations failed; skipping feature selection. +# case weights work + + Code + print(weighted) + Message + + -- Recipe ---------------------------------------------------------------------- + + -- Inputs + Number of variables by role + outcome: 1 + predictor: 10 + case_weights: 1 + + -- Training information + Training data contained 32 data points and no incomplete rows. + + -- Operations + * Feature selection via desirability functions (`cor_pearson` and + `cor_spearman`) on: disp, hp, drat, qsec, vs, am, gear, carb | Trained, + weighted + # empty printing Code diff --git a/tests/testthat/_snaps/step-predictor-retain.md b/tests/testthat/_snaps/step-predictor-retain.md index 195ac52..ad38311 100644 --- a/tests/testthat/_snaps/step-predictor-retain.md +++ b/tests/testthat/_snaps/step-predictor-retain.md @@ -1,3 +1,24 @@ +# case weights work + + Code + print(weighted) + Message + + -- Recipe ---------------------------------------------------------------------- + + -- Inputs + Number of variables by role + outcome: 1 + predictor: 10 + case_weights: 1 + + -- Training information + Training data contained 32 data points and no incomplete rows. + + -- Operations + * Feature selection using `cor_pearson` and `cor_spearman` removing: drat, + qsec, vs, am, gear, carb | Trained, weighted + # empty printing Code diff --git a/tests/testthat/test-step-predictor-best.R b/tests/testthat/test-step-predictor-best.R index 83c33f3..351fd8d 100644 --- a/tests/testthat/test-step-predictor-best.R +++ b/tests/testthat/test-step-predictor-best.R @@ -25,12 +25,12 @@ test_that("step works", { ) expect_identical( - sort(res_tidy$terms[res_tidy$.removed]), + sort(res_tidy$terms[res_tidy$removed]), sort(setdiff(names(mtcars)[-1], exp)) ) expect_named( res_tidy, - c("terms", ".removed", "score", "id") + c("terms", "removed", "score", "id") ) }) @@ -52,11 +52,11 @@ test_that("EVERYTHING MUST GO", { sort(names(mtcars)[-1]) ) expect_true( - all(res_tidy$.removed) + all(res_tidy$removed) ) expect_named( res_tidy, - c("terms", ".removed", "score", "id") + c("terms", "removed", "score", "id") ) }) @@ -80,12 +80,12 @@ test_that("keep everything", { ) expect_identical( - sort(res_tidy$terms[res_tidy$.removed]), + sort(res_tidy$terms[res_tidy$removed]), character(0) ) expect_named( res_tidy, - c("terms", ".removed", "score", "id") + c("terms", "removed", "score", "id") ) }) diff --git a/tests/testthat/test-step-predictor-desirability.R b/tests/testthat/test-step-predictor-desirability.R index fb1706b..d3d8978 100644 --- a/tests/testthat/test-step-predictor-desirability.R +++ b/tests/testthat/test-step-predictor-desirability.R @@ -35,14 +35,14 @@ test_that("step works", { ) expect_identical( - sort(res_tidy$terms[res_tidy$.removed]), + sort(res_tidy$terms[res_tidy$removed]), sort(setdiff(names(mtcars)[-1], retained$predictor)) ) expect_named( res_tidy, c( "terms", - ".removed", + "removed", "cor_pearson", "cor_spearman", ".d_max_cor_pearson", @@ -78,14 +78,14 @@ test_that("EVERYTHING MUST GO", { ) expect_identical( - sort(res_tidy$terms[res_tidy$.removed]), + sort(res_tidy$terms[res_tidy$removed]), sort(names(mtcars)[-1]) ) expect_named( res_tidy, c( "terms", - ".removed", + "removed", "cor_spearman", ".d_box_cor_spearman", ".d_overall", @@ -119,12 +119,12 @@ test_that("wrong score type", { ) expect_identical( - sort(res_tidy$terms[res_tidy$.removed]), + sort(res_tidy$terms[res_tidy$removed]), character(0) ) expect_named( res_tidy, - c("terms", ".removed", "xtab_pval_fisher", "id") + c("terms", "removed", "xtab_pval_fisher", "id") ) }) @@ -153,14 +153,14 @@ test_that("keep everything", { ) expect_identical( - sort(res_tidy$terms[res_tidy$.removed]), + sort(res_tidy$terms[res_tidy$removed]), character(0) ) expect_named( res_tidy, c( "terms", - ".removed", + "removed", "cor_spearman", ".d_box_cor_spearman", ".d_overall", diff --git a/tests/testthat/test-step-predictor-retain.R b/tests/testthat/test-step-predictor-retain.R index f659265..c3ff9bf 100644 --- a/tests/testthat/test-step-predictor-retain.R +++ b/tests/testthat/test-step-predictor-retain.R @@ -26,12 +26,12 @@ test_that("step works", { sort(exp) ) expect_identical( - sort(res_tidy$terms[res_tidy$.removed]), + sort(res_tidy$terms[res_tidy$removed]), sort(exp) ) expect_named( res_tidy, - c("terms", ".removed", "cor_pearson", "cor_spearman", "id") + c("terms", "removed", "cor_pearson", "cor_spearman", "id") ) }) @@ -51,11 +51,11 @@ test_that("EVERYTHING MUST GO", { sort(names(mtcars)[-1]) ) expect_true( - all(res_tidy$.removed) + all(res_tidy$removed) ) expect_named( res_tidy, - c("terms", ".removed", "cor_pearson", "cor_spearman", "id") + c("terms", "removed", "cor_pearson", "cor_spearman", "id") ) }) @@ -78,12 +78,12 @@ test_that("keep everything", { ) expect_identical( - sort(res_tidy$terms[res_tidy$.removed]), + sort(res_tidy$terms[res_tidy$removed]), character(0) ) expect_named( res_tidy, - c("terms", ".removed", "cor_pearson", "cor_spearman", "id") + c("terms", "removed", "cor_pearson", "cor_spearman", "id") ) }) @@ -111,7 +111,7 @@ test_that("allows for one score", { ) expect_named( res_tidy, - c("terms", ".removed", "cor_pearson", "id") + c("terms", "removed", "cor_pearson", "id") ) }) @@ -186,7 +186,7 @@ test_that("empty selection tidy method works", { tidy(rec, number = 1), tibble::tibble( terms = character(0), - .removed = logical(0), + removed = logical(0), id = character(0) ) ) From b84f48fb83706cf6df0c87bda61eed7850afb6da Mon Sep 17 00:00:00 2001 From: topepo Date: Thu, 4 Sep 2025 08:31:33 -0400 Subject: [PATCH 59/65] try to check classes for different versions of R --- R/step_predictor_desirability.R | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/R/step_predictor_desirability.R b/R/step_predictor_desirability.R index 08e2598..f0c7ecc 100644 --- a/R/step_predictor_desirability.R +++ b/R/step_predictor_desirability.R @@ -207,12 +207,16 @@ step_predictor_desirability <- function( skip = FALSE, id = rand_id("predictor_desirability") ) { - if (!inherits(score, "desirability2::desirability_set")) { - cli::cli_abort( - "Please use the {.fn desirability} function in the {.pkg desirability2} + S7::check_is_S7(score) + if (getRversion() >= "4.3.0") { + if (!inherits(score, "desirability2::desirability_set")) { + cli::cli_abort( + "Please use the {.fn desirability} function in the {.pkg desirability2} package to create an object to pass to {.arg score}." - ) + ) + } } + add_step( recipe, step_predictor_desirability_new( From a7b226fd846daa8c560c606fa835e2d870920efb Mon Sep 17 00:00:00 2001 From: topepo Date: Thu, 4 Sep 2025 09:30:53 -0400 Subject: [PATCH 60/65] version-based skip --- tests/testthat/test-step-predictor-desirability.R | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/testthat/test-step-predictor-desirability.R b/tests/testthat/test-step-predictor-desirability.R index d3d8978..0cf26d6 100644 --- a/tests/testthat/test-step-predictor-desirability.R +++ b/tests/testthat/test-step-predictor-desirability.R @@ -205,6 +205,7 @@ test_that("bake method errors when needed non-standard role columns are missing" }) test_that("empty printing", { + skip_if(getRversion() <= "4.3.0") rec <- recipe(mpg ~ ., mtcars) expect_snapshot(step_predictor_desirability(rec), error = TRUE) # TODO should this error? From e5dd9e2c33e780683400c81f94b79bb9ee60ef36 Mon Sep 17 00:00:00 2001 From: Emil Hvitfeldt Date: Thu, 4 Sep 2025 09:36:17 -0700 Subject: [PATCH 61/65] fix step_predictor_best() printing test --- tests/testthat/_snaps/step-predictor-best.md | 23 +++++++++++++++----- tests/testthat/test-step-predictor-best.R | 6 ++--- 2 files changed, 20 insertions(+), 9 deletions(-) diff --git a/tests/testthat/_snaps/step-predictor-best.md b/tests/testthat/_snaps/step-predictor-best.md index e942898..bea9cb8 100644 --- a/tests/testthat/_snaps/step-predictor-best.md +++ b/tests/testthat/_snaps/step-predictor-best.md @@ -63,19 +63,30 @@ -- Inputs Number of variables by role - predictor: 11 + outcome: 1 + predictor: 10 -- Operations - * Feature selection on: all_predictors() + * Feature selection via `cor_pearson` on: all_predictors() --- Code prep(rec) - Condition - Error in `step_predictor_best()`: - Caused by error in `pull_outcome_column_name()`: - ! One column should have a role of "outcome". + Message + + -- Recipe ---------------------------------------------------------------------- + + -- Inputs + Number of variables by role + outcome: 1 + predictor: 10 + + -- Training information + Training data contained 32 data points and no incomplete rows. + + -- Operations + * Feature selection via `cor_pearson` on: qsec, vs, am, gear, carb | Trained # bad args diff --git a/tests/testthat/test-step-predictor-best.R b/tests/testthat/test-step-predictor-best.R index 351fd8d..d97eb96 100644 --- a/tests/testthat/test-step-predictor-best.R +++ b/tests/testthat/test-step-predictor-best.R @@ -162,11 +162,11 @@ test_that("empty selection tidy method works", { test_that("printing", { set.seed(1) - rec <- recipe(~., data = mtcars) |> - step_predictor_best(all_predictors()) + rec <- recipe(mpg ~ ., data = mtcars) |> + step_predictor_best(all_predictors(), score = "cor_pearson") expect_snapshot(print(rec)) - expect_snapshot(prep(rec), error = TRUE) # Emil: is this intended? + expect_snapshot(prep(rec)) }) test_that("tunable is setup to work with extract_parameter_set_dials", { From 2133e6cb1c66b032b8db8d7e9a7cf400dd3126f2 Mon Sep 17 00:00:00 2001 From: Emil Hvitfeldt Date: Thu, 4 Sep 2025 09:44:01 -0700 Subject: [PATCH 62/65] make sure we error on missing score arg in step_predictor_best() --- R/step-predictor-best.R | 2 +- tests/testthat/_snaps/step-predictor-best.md | 16 +++++++++++--- tests/testthat/test-step-predictor-best.R | 23 ++++++++++++++++---- 3 files changed, 33 insertions(+), 8 deletions(-) diff --git a/R/step-predictor-best.R b/R/step-predictor-best.R index ef15fb1..718d8fe 100644 --- a/R/step-predictor-best.R +++ b/R/step-predictor-best.R @@ -108,7 +108,7 @@ step_predictor_best <- function( recipe, step_predictor_best_new( terms = enquos(...), - score = rlang::enexpr(score), # Or score = score? + score = score, role = role, trained = trained, prop_terms = prop_terms, diff --git a/tests/testthat/_snaps/step-predictor-best.md b/tests/testthat/_snaps/step-predictor-best.md index bea9cb8..58cf779 100644 --- a/tests/testthat/_snaps/step-predictor-best.md +++ b/tests/testthat/_snaps/step-predictor-best.md @@ -18,6 +18,15 @@ -- Operations * Feature selection via `cor_pearson` on: drat qsec, ... | Trained, weighted +# missing score arg + + Code + step_predictor_best(recipe(mpg ~ ., data = mtcars), all_predictors(), + prop_terms = 1 / 2) + Condition + Error in `step_predictor_best()`: + ! argument "score" is missing, with no default + # empty printing Code @@ -32,7 +41,7 @@ predictor: 10 -- Operations - * Feature selection on: + * Feature selection via `cor_pearson` on: --- @@ -51,7 +60,7 @@ Training data contained 32 data points and no incomplete rows. -- Operations - * Feature selection on: | Trained + * Feature selection via `cor_pearson` on: | Trained # printing @@ -91,7 +100,8 @@ # bad args Code - prep(step_predictor_best(recipe(mpg ~ ., mtcars), all_predictors(), prop_terms = 2)) + prep(step_predictor_best(recipe(mpg ~ ., mtcars), all_predictors(), prop_terms = 2, + score = "cor_pearson")) Condition Error in `step_predictor_best()`: Caused by error in `prep()`: diff --git a/tests/testthat/test-step-predictor-best.R b/tests/testthat/test-step-predictor-best.R index d97eb96..b5740c1 100644 --- a/tests/testthat/test-step-predictor-best.R +++ b/tests/testthat/test-step-predictor-best.R @@ -116,6 +116,17 @@ test_that("case weights work", { expect_false(isTRUE(all.equal(both_res$weighted, both_res$unweighted))) }) +test_that("missing score arg", { + expect_snapshot( + error = TRUE, + recipe(mpg ~ ., data = mtcars) |> + step_predictor_best( + all_predictors(), + prop_terms = 1 / 2 + ) + ) +}) + # Infrastructure --------------------------------------------------------------- test_that("bake method errors when needed non-standard role columns are missing", { # Here for completeness @@ -125,7 +136,7 @@ test_that("bake method errors when needed non-standard role columns are missing" test_that("empty printing", { rec <- recipe(mpg ~ ., mtcars) - rec <- step_predictor_best(rec) + rec <- step_predictor_best(rec, score = "cor_pearson") expect_snapshot(rec) @@ -136,7 +147,7 @@ test_that("empty printing", { test_that("empty selection prep/bake is a no-op", { rec1 <- recipe(mpg ~ ., mtcars) - rec2 <- step_predictor_best(rec1) + rec2 <- step_predictor_best(rec1, score = "cor_pearson") rec1 <- prep(rec1, mtcars) rec2 <- prep(rec2, mtcars) @@ -149,7 +160,7 @@ test_that("empty selection prep/bake is a no-op", { test_that("empty selection tidy method works", { rec <- recipe(mpg ~ ., mtcars) - rec <- step_predictor_best(rec) + rec <- step_predictor_best(rec, score = "cor_pearson") expect <- tibble(terms = character(), id = character()) @@ -183,7 +194,11 @@ test_that("tunable is setup to work with extract_parameter_set_dials", { test_that("bad args", { expect_snapshot( recipe(mpg ~ ., mtcars) |> - step_predictor_best(all_predictors(), prop_terms = 2) |> + step_predictor_best( + all_predictors(), + prop_terms = 2, + score = "cor_pearson" + ) |> prep(), error = TRUE ) From 563e874b20f404cff08eb65fdb564558acbd5c48 Mon Sep 17 00:00:00 2001 From: Emil Hvitfeldt Date: Thu, 4 Sep 2025 09:47:18 -0700 Subject: [PATCH 63/65] tweak last infrasturcture test for step_predictor_best --- tests/testthat/test-step-predictor-best.R | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/tests/testthat/test-step-predictor-best.R b/tests/testthat/test-step-predictor-best.R index b5740c1..28efaa7 100644 --- a/tests/testthat/test-step-predictor-best.R +++ b/tests/testthat/test-step-predictor-best.R @@ -205,10 +205,9 @@ test_that("bad args", { }) test_that("0 and 1 rows data work in bake method", { - skip("Emil: unsure if this is an intended error") data <- mtcars - rec <- recipe(~., data) |> - step_predictor_best(all_numeric_predictors()) |> + rec <- recipe(mpg ~ ., data) |> + step_predictor_best(all_numeric_predictors(), score = "cor_pearson") |> prep() expect_identical( From c2edf3ef2e107db48af3e3e67a41b34880809cbb Mon Sep 17 00:00:00 2001 From: Emil Hvitfeldt Date: Thu, 4 Sep 2025 09:52:03 -0700 Subject: [PATCH 64/65] realign file names --- R/{step-predictor-best.R => step_predictor_best.R} | 0 R/{step-predictor-retain.R => step_predictor_retain.R} | 0 man/required_pkgs.important.Rd | 6 +++--- man/step_predictor_best.Rd | 2 +- man/step_predictor_retain.Rd | 2 +- .../{step-predictor-best.md => step_predictor_best.md} | 0 ...ictor-desirability.md => step_predictor_desirability.md} | 0 .../{step-predictor-retain.md => step_predictor_retain.md} | 0 ...est-step-predictor-best.R => test-step_predictor_best.R} | 0 ...or-desirability.R => test-step_predictor_desirability.R} | 0 ...step-predictor-retain.R => test-step_predictor_retain.R} | 0 11 files changed, 5 insertions(+), 5 deletions(-) rename R/{step-predictor-best.R => step_predictor_best.R} (100%) rename R/{step-predictor-retain.R => step_predictor_retain.R} (100%) rename tests/testthat/_snaps/{step-predictor-best.md => step_predictor_best.md} (100%) rename tests/testthat/_snaps/{step-predictor-desirability.md => step_predictor_desirability.md} (100%) rename tests/testthat/_snaps/{step-predictor-retain.md => step_predictor_retain.md} (100%) rename tests/testthat/{test-step-predictor-best.R => test-step_predictor_best.R} (100%) rename tests/testthat/{test-step-predictor-desirability.R => test-step_predictor_desirability.R} (100%) rename tests/testthat/{test-step-predictor-retain.R => test-step_predictor_retain.R} (100%) diff --git a/R/step-predictor-best.R b/R/step_predictor_best.R similarity index 100% rename from R/step-predictor-best.R rename to R/step_predictor_best.R diff --git a/R/step-predictor-retain.R b/R/step_predictor_retain.R similarity index 100% rename from R/step-predictor-retain.R rename to R/step_predictor_retain.R diff --git a/man/required_pkgs.important.Rd b/man/required_pkgs.important.Rd index 63c8644..cf62557 100644 --- a/man/required_pkgs.important.Rd +++ b/man/required_pkgs.important.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/step-predictor-best.R, -% R/step-predictor-retain.R, R/step_predictor_desirability.R +% Please edit documentation in R/step_predictor_best.R, +% R/step_predictor_desirability.R, R/step_predictor_retain.R \name{required_pkgs.step_predictor_best} \alias{required_pkgs.step_predictor_best} -\alias{required_pkgs.step_predictor_desirability} \alias{required_pkgs.important} +\alias{required_pkgs.step_predictor_desirability} \title{S3 methods for tracking which additional packages are needed for steps.} \usage{ \method{required_pkgs}{step_predictor_best}(x, ...) diff --git a/man/step_predictor_best.Rd b/man/step_predictor_best.Rd index 58cfa00..1620e59 100644 --- a/man/step_predictor_best.Rd +++ b/man/step_predictor_best.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/step-predictor-best.R +% Please edit documentation in R/step_predictor_best.R \name{step_predictor_best} \alias{step_predictor_best} \title{Supervised Feature Selection via Choosing the Top Predictors} diff --git a/man/step_predictor_retain.Rd b/man/step_predictor_retain.Rd index ce22b89..1f704b5 100644 --- a/man/step_predictor_retain.Rd +++ b/man/step_predictor_retain.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/step-predictor-retain.R +% Please edit documentation in R/step_predictor_retain.R \name{step_predictor_retain} \alias{step_predictor_retain} \title{Supervised Feature Selection via A Single Filter} diff --git a/tests/testthat/_snaps/step-predictor-best.md b/tests/testthat/_snaps/step_predictor_best.md similarity index 100% rename from tests/testthat/_snaps/step-predictor-best.md rename to tests/testthat/_snaps/step_predictor_best.md diff --git a/tests/testthat/_snaps/step-predictor-desirability.md b/tests/testthat/_snaps/step_predictor_desirability.md similarity index 100% rename from tests/testthat/_snaps/step-predictor-desirability.md rename to tests/testthat/_snaps/step_predictor_desirability.md diff --git a/tests/testthat/_snaps/step-predictor-retain.md b/tests/testthat/_snaps/step_predictor_retain.md similarity index 100% rename from tests/testthat/_snaps/step-predictor-retain.md rename to tests/testthat/_snaps/step_predictor_retain.md diff --git a/tests/testthat/test-step-predictor-best.R b/tests/testthat/test-step_predictor_best.R similarity index 100% rename from tests/testthat/test-step-predictor-best.R rename to tests/testthat/test-step_predictor_best.R diff --git a/tests/testthat/test-step-predictor-desirability.R b/tests/testthat/test-step_predictor_desirability.R similarity index 100% rename from tests/testthat/test-step-predictor-desirability.R rename to tests/testthat/test-step_predictor_desirability.R diff --git a/tests/testthat/test-step-predictor-retain.R b/tests/testthat/test-step_predictor_retain.R similarity index 100% rename from tests/testthat/test-step-predictor-retain.R rename to tests/testthat/test-step_predictor_retain.R From d744f601369ff39c1e00acc72979fa53af9f2906 Mon Sep 17 00:00:00 2001 From: topepo Date: Thu, 4 Sep 2025 13:43:21 -0400 Subject: [PATCH 65/65] small updates --- inst/WORDLIST | 1 + tests/testthat/test-step_predictor_best.R | 7 +++- .../test-step_predictor_desirability.R | 34 ------------------- 3 files changed, 7 insertions(+), 35 deletions(-) diff --git a/inst/WORDLIST b/inst/WORDLIST index 0c320f8..8fc125b 100644 --- a/inst/WORDLIST +++ b/inst/WORDLIST @@ -7,6 +7,7 @@ PBC Suich desirabilities doi +filtro funder importances mirai diff --git a/tests/testthat/test-step_predictor_best.R b/tests/testthat/test-step_predictor_best.R index 28efaa7..f0df075 100644 --- a/tests/testthat/test-step_predictor_best.R +++ b/tests/testthat/test-step_predictor_best.R @@ -117,6 +117,7 @@ test_that("case weights work", { }) test_that("missing score arg", { + skip_if(getRversion() <= "4.3.0") expect_snapshot( error = TRUE, recipe(mpg ~ ., data = mtcars) |> @@ -183,7 +184,11 @@ test_that("printing", { test_that("tunable is setup to work with extract_parameter_set_dials", { skip_if_not_installed("dials", minimum_version = "1.4.1.9000") rec <- recipe(~., data = mtcars) |> - step_predictor_best(all_predictors(), prop_terms = hardhat::tune()) + step_predictor_best( + all_predictors(), + score = "cor_pearson", + prop_terms = hardhat::tune() + ) params <- extract_parameter_set_dials(rec) diff --git a/tests/testthat/test-step_predictor_desirability.R b/tests/testthat/test-step_predictor_desirability.R index 0cf26d6..0a5fd91 100644 --- a/tests/testthat/test-step_predictor_desirability.R +++ b/tests/testthat/test-step_predictor_desirability.R @@ -208,40 +208,6 @@ test_that("empty printing", { skip_if(getRversion() <= "4.3.0") rec <- recipe(mpg ~ ., mtcars) expect_snapshot(step_predictor_desirability(rec), error = TRUE) - # TODO should this error? - # expect_snapshot(rec) - # - # rec <- prep(rec, mtcars) - # - # expect_snapshot(rec) -}) - -test_that("empty selection prep/bake is a no-op", { - rec1 <- recipe(mpg ~ ., mtcars) - # TODO should this error? - # rec2 <- step_predictor_desirability(rec1) - # - # rec1 <- prep(rec1, mtcars) - # rec2 <- prep(rec2, mtcars) - # - # baked1 <- bake(rec1, mtcars) - # baked2 <- bake(rec2, mtcars) - # - # expect_identical(baked1, baked2) -}) - -test_that("empty selection tidy method works", { - rec <- recipe(mpg ~ ., mtcars) - # TODO should this error? - # rec <- step_predictor_desirability(rec) - # - # expect <- tibble(terms = character(), id = character()) - # - # expect_identical(tidy(rec, number = 1), expect) - # - # rec <- prep(rec, mtcars) - # - # expect_identical(tidy(rec, number = 1), expect) }) test_that("printing", {