Conversation

step_predictor_best(), step_predictor_desirability(), step_predictor_retain()
This is so cool that I couldn't wait for this PR to be merged, so I did a little experiment with noise features, but I get weird results. I expected the new step_predictor_best() function to drop some of the noise predictors.

pak::pak(c("tidymodels/important@recipes-steps", "tune", "recipes", "dials", "filtro", "desirability2", "tictoc", "future", "parallelly"))
#> ℹ Loading metadata database
#> ✔ Loading metadata database ... done
#>
#>
#> ✔ All system requirements are already installed.
#>
#> ℹ No downloads are needed
#> ✔ 9 pkgs + 75 deps: kept 79 [6.8s]
library(tidyverse)
library(tidymodels)
library(important)
library(future)
data(deliveries, package = "modeldata")
# Original features
set.seed(991)
delivery_split <-
deliveries |>
initial_validation_split(prop = c(0.6, 0.2), strata = time_to_delivery)
delivery_train <- training(delivery_split)
folds <- vfold_cv(delivery_train)
delivery_rec <-
recipe(time_to_delivery ~ ., data = delivery_train) |>
step_dummy(all_factor_predictors()) |>
step_spline_natural(hour, distance, deg_free = 10) |>
step_interact(~ starts_with("hour_"):starts_with("day_")) |>
print()
#>
#> ── Recipe ──────────────────────────────────────────────────────────────────────
#>
#> ── Inputs
#> Number of variables by role
#> outcome: 1
#> predictor: 30
#>
#> ── Operations
#> • Dummy variables from: all_factor_predictors()
#> • Natural spline expansion: hour distance
#> • Interactions with: starts_with("hour_"):starts_with("day_")
delivery_wflow <- workflow(delivery_rec, linear_reg())
plan(multicore, workers = parallelly::availableCores(omit = 10))
res_original <- delivery_wflow |>
fit_resamples(
folds,
metrics = metric_set(mae, rsq),
control = control_resamples(
save_workflow = FALSE,
save_pred = FALSE
)
)
res_original |> show_best(metric = "mae")
#> # A tibble: 1 × 6
#> .metric .estimator mean n std_err .config
#> <chr> <chr> <dbl> <int> <dbl> <chr>
#> 1 mae standard 1.63 10 0.0217 pre0_mod0_post0
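# Baseline: with the original 30 predictors only, the resampled MAE is about 1.63.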
# Original features and noise features
set.seed(991)
noise_split <-
map(1:30, \(i) rnorm(n = nrow(deliveries))) |>
as_tibble(.name_repair = "minimal") |>
setNames(paste0("noise_", 1:30)) |>
bind_cols(deliveries) |>
initial_validation_split(prop = c(0.6, 0.2), strata = time_to_delivery)
noise_train <- training(noise_split)
noise_folds <- vfold_cv(noise_train)
noise_rec <-
recipe(time_to_delivery ~ ., data = noise_train) |>
step_dummy(all_factor_predictors()) |>
step_spline_natural(hour, distance, deg_free = 10) |>
step_interact(~ starts_with("hour_"):starts_with("day_")) |>
print()
#>
#> ── Recipe ──────────────────────────────────────────────────────────────────────
#>
#> ── Inputs
#> Number of variables by role
#> outcome: 1
#> predictor: 60
#>
#> ── Operations
#> • Dummy variables from: all_factor_predictors()
#> • Natural spline expansion: hour distance
#> • Interactions with: starts_with("hour_"):starts_with("day_")
noise_wflow <- workflow(noise_rec, linear_reg())
tictoc::tic("Tuning")
plan(multicore, workers = parallelly::availableCores(omit = 10))
res_noise <- noise_wflow |>
fit_resamples(
noise_folds,
metrics = metric_set(mae, rsq),
control = control_resamples(
save_workflow = FALSE,
save_pred = FALSE
)
)
tictoc::toc()
#> Tuning: 2.002 sec elapsed
res_noise |> show_best(metric = "mae")
#> # A tibble: 1 × 6
#> .metric .estimator mean n std_err .config
#> <chr> <chr> <dbl> <int> <dbl> <chr>
#> 1 mae standard 1.65 10 0.0191 pre0_mod0_post0
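# Adding 30 pure-noise columns only moves the resampled MAE from about 1.63 to
# 1.65, so the linear model is not very sensitive to the extra noise predictors.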
# Feature selection
tune_rec <-
recipe(time_to_delivery ~ ., data = noise_train) |>
step_dummy(all_factor_predictors()) |>
step_spline_natural(hour, distance, deg_free = 10) |>
step_interact(~ starts_with("hour_"):starts_with("day_")) |>
step_predictor_best(
all_predictors(),
prop_terms = tune(),
score = "cor_pearson"
) |>
print()
#>
#> ── Recipe ──────────────────────────────────────────────────────────────────────
#>
#> ── Inputs
#> Number of variables by role
#> outcome: 1
#> predictor: 60
#>
#> ── Operations
#> • Dummy variables from: all_factor_predictors()
#> • Natural spline expansion: hour distance
#> • Interactions with: starts_with("hour_"):starts_with("day_")
#> • Feature selection via `cor_pearson` on: all_predictors()
tune_wflow <- workflow(tune_rec, linear_reg())
tictoc::tic("Tuning")
plan(multicore, workers = parallelly::availableCores(omit = 10))
tune_res <- tune_wflow |>
tune_grid(
noise_folds,
metrics = metric_set(mae, rsq),
grid = tibble(prop_terms = seq(0.5, 1, length.out = 10)),
control = control_grid(
save_workflow = FALSE,
save_pred = FALSE
)
)
tictoc::toc()
#> Tuning: 12.561 sec elapsed
tune_res |> show_best(metric = "mae")
#> # A tibble: 5 × 7
#> prop_terms .metric .estimator mean n std_err .config
#> <dbl> <chr> <chr> <dbl> <int> <dbl> <chr>
#> 1 1 mae standard 1.65 10 0.0191 pre10_mod0_post0
#> 2 0.944 mae standard 1.66 10 0.0192 pre09_mod0_post0
#> 3 0.889 mae standard 1.67 10 0.0199 pre08_mod0_post0
#> 4 0.833 mae standard 1.67 10 0.0187 pre07_mod0_post0
#> 5 0.778 mae standard 1.69 10 0.0208 pre06_mod0_post0

Created on 2025-09-04 with reprex v2.1.1.9000
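A quick way to check which columns actually survive the filter is to prep() and bake() a copy of the recipe with a fixed prop_terms in place of tune(). This is only a sketch: it reuses the step arguments shown above and assumes a fixed numeric value is accepted.

check_rec <-
  recipe(time_to_delivery ~ ., data = noise_train) |>
  step_dummy(all_factor_predictors()) |>
  step_spline_natural(hour, distance, deg_free = 10) |>
  step_interact(~ starts_with("hour_"):starts_with("day_")) |>
  step_predictor_best(
    all_predictors(),
    prop_terms = 0.5,   # fixed value instead of tune(), just for inspection
    score = "cor_pearson"
  )

# Any noise_* columns left after prepping are ones the score ranked above
# some of the real predictors.
check_rec |>
  prep() |>
  bake(new_data = NULL) |>
  names() |>
  stringr::str_subset("^noise_")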
@jrosell I was about to hit the button to send this to CRAN and saw this. 🙀 Thanks for posting it. I had a few thoughts.

I was surprised to see the error go down as the noise predictors were retained. The amount is not much, reaching the MAE levels that we see with no noise predictors, and the overall difference in the delivery time MAE is about nine seconds. The signal is clear but tiny. This particular data set has a handful of very important predictors and about 35-ish others.

I can verify that the recipe step ranks the predictors appropriately, and the data being held out is not involved in the selection, so data leakage probably isn't the issue. The same data is used to select predictors and model them. That's not optimal (compared to full-on RFE). However, if that were the issue, we would be overfitting the predictor set, and the assessment data would show poor results.

One thing that I would do differently is to apply the filter first in the recipe; that eliminates a lot more terms at once (a reordered version of the linear-model recipe is sketched after the code at the end of this comment). Correlating spline terms with the outcome might not work well since they are designed to model a local area of predictor space, and that might dilute the correlation. I did modify the recipe to see if that was the issue, and I got the same result pattern. So again, there is a surprise, but it's not a big effect.

So what would happen if the model were very sensitive to extra noise columns? I used the same neural network tuned in the "whole game" chapter, with 24 hidden units (same as the book). I ran the filter first, then did the existing feature engineering (code below). Here are the results:
If there are about 6-10 predictors worth keeping, that is a proportion range of 0.1 to 0.17 (6/60 to 10/60 of the 60 columns). The best point is around 0.13, with underfitting before that and the effect of too many predictors after. (The weird spikes are cases where the gradient search went off the rails, causing a numeric overflow in the loss function. I'm still working on fixing that.) This result is consistent with what I would expect, so I feel comfortable that it is working as intended.

One other thing: the desirability2 package lets you make trade-offs between performance and the amount of feature selection via

library(desirability2)

tune_res |>
  show_best_desirability(
    minimize(mae),
    minimize(prop_terms, scale = 2)
  )

so that can help make a good choice.

Other code:

norm_rec <-
recipe(time_to_delivery ~ ., data = noise_train) %>%
step_predictor_best(
all_predictors(),
prop_terms = tune(),
score = "cor_pearson",
) |>
step_dummy(all_factor_predictors()) %>%
step_zv(all_predictors()) %>%
step_normalize(all_predictors())
nnet_spec <-
mlp(
hidden_units = 24,
penalty = 0.01,
learn_rate = 0.1,
epochs = 5000
) %>%
set_mode("regression") %>%
set_engine("brulee", stop_iter = 10, rate_schedule = "cyclic")
nnet_wflow <-
workflow() %>%
add_model(nnet_spec) %>%
add_recipe(norm_rec)
nnet_grid <- tibble(prop_terms = seq(0.05, 1, length.out = 50))
set.seed(388)
nnet_res <-
tune_grid(nnet_wflow,
noise_folds,
grid = nnet_grid,
metrics = metric_set(mae, rsq))
nnet_res |>
collect_metrics(summarize = FALSE) |>
summarize(
mean = mean(.estimate),
median = median(.estimate),
.by = c(prop_terms, .metric)
) |>
pivot_longer(
cols = c(mean, median),
names_to = "estimate",
values_to = "value"
) |>
ggplot(aes(prop_terms, value, col = estimate, group = estimate)) +
geom_point(cex = 1) +
geom_line() +
facet_wrap(~ .metric, ncol = 1, scales = "free_y") +
geom_vline(xintercept = c(0.1, 0.17), lty = 3) +
  theme_bw()
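Applying the "filter first" idea to the linear-model recipe from the reprex might look like the following sketch. It only reorders the steps already shown in the thread; any_of() is used so the spline step is skipped for hour or distance if the filter happens to remove them.

filter_first_rec <-
  recipe(time_to_delivery ~ ., data = noise_train) |>
  step_predictor_best(
    all_predictors(),
    prop_terms = tune(),
    score = "cor_pearson"
  ) |>
  step_dummy(all_factor_predictors()) |>
  step_spline_natural(any_of(c("hour", "distance")), deg_free = 10) |>
  step_interact(~ starts_with("hour_"):starts_with("day_"))

filter_first_res <-
  workflow(filter_first_rec, linear_reg()) |>
  tune_grid(
    noise_folds,
    metrics = metric_set(mae, rsq),
    grid = tibble(prop_terms = seq(0.1, 1, length.out = 10))
  )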
Don't worry. Awesome addition. I see I didn't come up with a good example :)

This PR sets up 3 recipe steps with dummy names for use in feature selection. These have been set up to follow the standards that we use in the rest of the tidymodels recipes packages.
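As a minimal sketch of how the best-predictor step slots into an ordinary recipe, using only the call signature shown in the thread above (a fixed numeric prop_terms is assumed to be accepted in place of tune()):

library(recipes)
library(important)

rec <-
  recipe(time_to_delivery ~ ., data = noise_train) |>
  # keep roughly the top 25% of predictors, scored by Pearson correlation
  # with the outcome
  step_predictor_best(
    all_predictors(),
    prop_terms = 0.25,
    score = "cor_pearson"
  )

# number of columns remaining after the filter (the outcome is always kept)
rec |>
  prep() |>
  bake(new_data = NULL) |>
  ncol()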