Skip to content

add steps#21

Merged
topepo merged 66 commits into main from
recipes-steps
Sep 4, 2025
Merged

add steps#21
topepo merged 66 commits into main from
recipes-steps

Conversation

@EmilHvitfeldt
Copy link
Member

This PR sets up 3 recipe steps with dummy names for use in feature selection. These have been set up to follow the standards that we use in the rest of the tidymodels recipes packages

@franceslinyc
Copy link
Collaborator

step_predictor_best()

step_predictor_desirability()

step_predictor_retain()

@jrosell
Copy link

jrosell commented Sep 4, 2025

This is so cool that I couldn't wait for this PR to be merged and I did a little experiment with noise features, but I get weird results.

I expected to pick some noise predictors with the new step_predictor_best function.

# Install the development branch of `important` (with the new recipe steps)
# plus every package used in this reprex; `#>` lines are the captured output.
pak::pak(c("tidymodels/important@recipes-steps", "tune", "recipes", "dials", "filtro", "desirability2", "tictoc", "future", "parallelly"))
#> ℹ Loading metadata database
#> ✔ Loading metadata database ... done
#> 
#> 
#> ✔ All system requirements are already installed.
#> 
#> ℹ No downloads are needed
#> ✔ 9 pkgs + 75 deps: kept 79 [6.8s]

# Attach the packages used below; `future` supplies the parallel backend.
library(tidyverse)
library(tidymodels)
library(important)
library(future)

# Delivery-time data shipped with the modeldata package.
data(deliveries, package = "modeldata")



# Original features

# Baseline: a 60/20/20 train/validation/test split stratified on the outcome,
# then 10-fold CV on the training set only.
set.seed(991)
delivery_split <- 
  deliveries |> 
  initial_validation_split(prop = c(0.6, 0.2), strata = time_to_delivery)
delivery_train <- training(delivery_split)
folds <- vfold_cv(delivery_train)
# Feature engineering: dummy-code factors, natural splines for hour/distance,
# and hour-by-day interactions.
delivery_rec <- 
  recipe(time_to_delivery ~ ., data = delivery_train) |> 
  step_dummy(all_factor_predictors()) |> 
  step_spline_natural(hour, distance, deg_free = 10) |> 
  step_interact(~ starts_with("hour_"):starts_with("day_")) |>
  print()
#> 
#> ── Recipe ──────────────────────────────────────────────────────────────────────
#> 
#> ── Inputs
#> Number of variables by role
#> outcome:    1
#> predictor: 30
#> 
#> ── Operations
#> • Dummy variables from: all_factor_predictors()
#> • Natural spline expansion: hour distance
#> • Interactions with: starts_with("hour_"):starts_with("day_")
delivery_wflow <- workflow(delivery_rec, linear_reg())
# Run resamples in parallel, leaving 10 cores free for the rest of the machine.
plan(multicore, workers = parallelly::availableCores(omit = 10))
res_original <- delivery_wflow |>
  fit_resamples(
    folds,
    metrics = metric_set(mae, rsq),
    control = control_resamples(
      save_workflow = FALSE,
      save_pred = FALSE
    )
  )
# Reference MAE with only the original predictors (~1.63).
res_original  |> show_best(metric = "mae")
#> # A tibble: 1 × 6
#>   .metric .estimator  mean     n std_err .config        
#>   <chr>   <chr>      <dbl> <int>   <dbl> <chr>          
#> 1 mae     standard    1.63    10  0.0217 pre0_mod0_post0


# Original features and noise features

# Same experiment, but with 30 pure-noise predictors appended to the data
# so the feature-selection step has something it should be able to discard.
set.seed(991)
noise_split <- 
  map(1:30, \(i) rnorm(n = nrow(deliveries))) |> 
  as_tibble(.name_repair = "minimal") |> 
  setNames(paste0("noise_", 1:30)) |> 
  bind_cols(deliveries) |> 
  initial_validation_split(prop = c(0.6, 0.2), strata = time_to_delivery)
noise_train <- training(noise_split)
noise_folds <- vfold_cv(noise_train)
# Identical feature engineering to the baseline recipe (no filtering yet).
noise_rec <- 
  recipe(time_to_delivery ~ ., data = noise_train) |>
  step_dummy(all_factor_predictors()) |>
  step_spline_natural(hour, distance, deg_free = 10) |> 
  step_interact(~ starts_with("hour_"):starts_with("day_")) |>
  print()
#> 
#> ── Recipe ──────────────────────────────────────────────────────────────────────
#> 
#> ── Inputs
#> Number of variables by role
#> outcome:    1
#> predictor: 60
#> 
#> ── Operations
#> • Dummy variables from: all_factor_predictors()
#> • Natural spline expansion: hour distance
#> • Interactions with: starts_with("hour_"):starts_with("day_")
noise_wflow <- workflow(noise_rec, linear_reg())
tictoc::tic("Tuning")
plan(multicore, workers = parallelly::availableCores(omit = 10))
res_noise <- noise_wflow |>
  fit_resamples(
    noise_folds,
    metrics = metric_set(mae, rsq),
    control = control_resamples(
      save_workflow = FALSE,
      save_pred = FALSE
    )
  )
tictoc::toc()
#> Tuning: 2.002 sec elapsed
# With all 30 noise columns kept, MAE degrades only slightly (1.63 -> 1.65).
res_noise  |> show_best(metric = "mae")
#> # A tibble: 1 × 6
#>   .metric .estimator  mean     n std_err .config        
#>   <chr>   <chr>      <dbl> <int>   <dbl> <chr>          
#> 1 mae     standard    1.65    10  0.0191 pre0_mod0_post0


# Feature selection

# Add the new step_predictor_best() filter at the end of the recipe and tune
# the proportion of predictors retained (prop_terms), scored by Pearson
# correlation with the outcome.
tune_rec <- 
  recipe(time_to_delivery ~ ., data = noise_train) |> 
  step_dummy(all_factor_predictors()) |> 
  step_spline_natural(hour, distance, deg_free = 10) |> 
  step_interact(~ starts_with("hour_"):starts_with("day_")) |>
  step_predictor_best(
    all_predictors(), 
    prop_terms = tune(),
    score = "cor_pearson"
  ) |> 
  print()
#> 
#> ── Recipe ──────────────────────────────────────────────────────────────────────
#> 
#> ── Inputs
#> Number of variables by role
#> outcome:    1
#> predictor: 60
#> 
#> ── Operations
#> • Dummy variables from: all_factor_predictors()
#> • Natural spline expansion: hour distance
#> • Interactions with: starts_with("hour_"):starts_with("day_")
#> • Feature selection via `cor_pearson` on: all_predictors()

tune_wflow <- workflow(tune_rec, linear_reg())

tictoc::tic("Tuning")
plan(multicore, workers = parallelly::availableCores(omit = 10))
# Grid of 10 retention proportions between 0.5 and 1.
tune_res <- tune_wflow |>
  tune_grid(
    noise_folds,
    metrics = metric_set(mae, rsq),
    grid = tibble(prop_terms = seq(0.5, 1, length.out = 10)),
    control = control_grid(      
      save_workflow = FALSE,
      save_pred = FALSE
    )
  )
tictoc::toc()
#> Tuning: 12.561 sec elapsed

# Surprising result: keeping everything (prop_terms = 1) is ranked best;
# MAE rises slightly as more predictors are filtered out.
tune_res  |> show_best(metric = "mae")
#> # A tibble: 5 × 7
#>   prop_terms .metric .estimator  mean     n std_err .config         
#>        <dbl> <chr>   <chr>      <dbl> <int>   <dbl> <chr>           
#> 1      1     mae     standard    1.65    10  0.0191 pre10_mod0_post0
#> 2      0.944 mae     standard    1.66    10  0.0192 pre09_mod0_post0
#> 3      0.889 mae     standard    1.67    10  0.0199 pre08_mod0_post0
#> 4      0.833 mae     standard    1.67    10  0.0187 pre07_mod0_post0
#> 5      0.778 mae     standard    1.69    10  0.0208 pre06_mod0_post0

Created on 2025-09-04 with reprex v2.1.1.9000

@topepo topepo merged commit 90822d4 into main Sep 4, 2025
14 checks passed
@topepo topepo deleted the recipes-steps branch September 4, 2025 17:54
@topepo
Copy link
Member

topepo commented Sep 5, 2025

@jrosell I was about to hit the button to send this to CRAN and saw this. 🙀 Thanks for posting this.

I had a few thoughts. I was surprised to see the error go down as the noise predictors were retained. The amount is not much, reaching the MAE levels that we see with no noise predictors, and the overall difference in the delivery time MAE is about nine seconds. The signal is clear but tiny.

This particular data set has a handful of very important predictors and about 35ish item predictors that have a negligible effect on the outcome, especially without a few interactions. Plain old linear regression isn’t robust against extra predictors (unlike glmnet), but it isn’t particularly sensitive to them in “small” amounts. This is especially true if they are not very correlated. However, this should result in a relatively flat pattern in MAE, not an increasing one.

I can verify that the recipe step ranks the predictors appropriately, and the data being held out is not involved in the selection, so data leakage probably isn’t the issue. The same data is used to select predictors and model them. That’s not optimal (compared to full-on RFE). However, if that were the issue, we would be overfitting the predictor set, and the assessment data would show poor results.

One thing that I would do differently is to apply the filter first in the recipe. That eliminates a lot more terms at once. Correlating spline terms to the outcome might not work well since they are designed to model a local area in predictor space. That might dilute the correlation.

I did modify the recipe to see if that was the issue, and I got the same result pattern.

So again, there is surprise, but it’s not a big effect.

So what would happen if the model were very sensitive to extra noise columns? I used the same neural network tuned in the “whole game” chapter and used 24 units (same as the book). I ran the filter first, then did the existing feature engineering (code below).

Here are the results:

image

If there are about 6-10 predictors, that is a proportion range of 0.1 to 0.17. The best point is around 0.13 and shows underfitting before that, and the effect of too many predictors after.

(The weird spikes are cases when the gradient search went off the rails, causing a numeric overflow in the loss function. I’m still working on fixing that.)

This result is consistent with what I would expect, so I feel comfortable that it is working as intended.

One other thing, the desirability2 package can let you make tradeoffs between performance and the amount of feature selection via

# Rank tuning results by a joint desirability: low MAE and (with double
# weight) a low proportion of retained predictors.
library(desirability2)
tune_res |>
  show_best_desirability(
    minimize(mae),
    minimize(prop_terms, scale = 2)
  )

so that can help make a good choice.

Other code:

# Recipe for the noise-augmented data: apply the filter FIRST (on the raw
# columns, before dummy/spline expansion) so whole predictors are removed at
# once, then do the usual preprocessing for a neural network.
# Fixes vs the posted snippet: removed the trailing comma inside
# step_predictor_best() (it passes an empty argument into the dots), used the
# native pipe consistently instead of mixing `%>%` and `|>`, and replaced the
# base-graphics `cex` parameter with ggplot2's `size` aesthetic.
norm_rec <-
  recipe(time_to_delivery ~ ., data = noise_train) |>
  step_predictor_best(
    all_predictors(),
    prop_terms = tune(),
    score = "cor_pearson"
  ) |>
  step_dummy(all_factor_predictors()) |>
  step_zv(all_predictors()) |>
  step_normalize(all_predictors())

# Single-layer neural net matching the "whole game" chapter (24 hidden
# units), fit with the brulee engine and a cyclic learning-rate schedule.
nnet_spec <-
  mlp(
    hidden_units = 24,
    penalty = 0.01,
    learn_rate = 0.1,
    epochs = 5000
  ) |>
  set_mode("regression") |>
  set_engine("brulee", stop_iter = 10, rate_schedule = "cyclic")

nnet_wflow <-
  workflow() |>
  add_model(nnet_spec) |>
  add_recipe(norm_rec)

# 50 candidate retention proportions, from almost-everything-removed to all.
nnet_grid <- tibble(prop_terms = seq(0.05, 1, length.out = 50))

set.seed(388)
nnet_res <-
  tune_grid(
    nnet_wflow,
    noise_folds,
    grid = nnet_grid,
    metrics = metric_set(mae, rsq)
  )

# Plot mean and median of each metric versus prop_terms; the median is more
# robust to the occasional resample where the gradient search diverges.
nnet_res |>
  collect_metrics(summarize = FALSE) |>
  summarize(
    mean = mean(.estimate),
    median = median(.estimate),
    .by = c(prop_terms, .metric)
  ) |>
  pivot_longer(
    cols = c(mean, median),
    names_to = "estimate",
    values_to = "value"
  ) |>
  ggplot(aes(prop_terms, value, col = estimate, group = estimate)) +
  geom_point(size = 1) +
  geom_line() +
  facet_wrap(~ .metric, ncol = 1, scales = "free_y") +
  # Dotted guides at the 6-10 predictor range discussed above.
  geom_vline(xintercept = c(0.1, 0.17), lty = 3) +
  theme_bw()

@jrosell
Copy link

jrosell commented Sep 5, 2025

Don't worry. Awesome addition. I see I didn't come up with a good example :)

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Labels

None yet

Projects

None yet

Development

Successfully merging this pull request may close these issues.

4 participants