diff --git a/.Rbuildignore b/.Rbuildignore index f35d5c6..29fe8c4 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -1,18 +1,18 @@ ^Thumbs\.db$ ^build_site.R$ -^pkgdown$ -^\.DS_Store$ -^\.drake$ -^\.drake_history$ -^\.future$ -^\.git$ -^\.github$ -^\.gitignore$ -^\.lintr$ -^\.Rapp.history$ -^\.RData$ -^\.Rbuildignore$ -^\.Rhistory$ +pkgdown$ +\.DS_Store$ +\.drake$ +\.drake_history$ +\.future$ +\.git$ +\.github$ +\.gitignore$ +\.lintr$ +\.Rapp.history$ +\.RData$ +\.Rbuildignore$ +\.Rhistory$ ^CODE_OF_CONDUCT.md$ ^CONTRIBUTING.md$ ^deploy.sh$ diff --git a/DESCRIPTION b/DESCRIPTION index 86869ae..afabed7 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -12,7 +12,7 @@ Description: Machine learning workflows can be difficult to manage. The slides, notebooks, and Shiny apps in this package teach how to create and maintain machine learning projects with drake-powered automation. -Version: 0.0.2 +Version: 0.0.2.9000 License: GPL-3 URL: https://github.com/wlandau/learndrake BugReports: https://github.com/wlandau/learndrake/issues @@ -36,7 +36,8 @@ SystemRequirements: Depends: R (>= 3.5.0) Imports: - drake (>= 7.4.0.9000), + clustermq (>= 0.8.8), + drake (>= 7.6.1), drakeplanner, future, future.callr, @@ -46,7 +47,6 @@ Imports: lubridate, recipes, rmarkdown, - rprojroot, rsample, shiny, styler, @@ -55,7 +55,6 @@ Imports: visNetwork, yardstick Suggests: - clustermq, prettycode, shinytest, tidyselect (>= 0.2.4), diff --git a/NAMESPACE b/NAMESPACE index 7c8cda3..1b174c4 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -5,6 +5,7 @@ export(save_app) export(save_notebooks) export(save_slides) export(view_slides) +importFrom(clustermq,Q) importFrom(drake,drake_plan) importFrom(drake,make) importFrom(drake,vis_drake_graph) @@ -22,7 +23,6 @@ importFrom(recipes,bake) importFrom(recipes,juice) importFrom(rmarkdown,render) importFrom(rmarkdown,run) -importFrom(rprojroot,find_rstudio_root_file) importFrom(rsample,initial_split) importFrom(rsample,testing) importFrom(rsample,training) diff --git 
a/R/package.R b/R/package.R index 7ddb268..575c436 100644 --- a/R/package.R +++ b/R/package.R @@ -1,4 +1,5 @@ -#' learndrake: materials to learn machine learning workflow management with drake +#' learndrake: materials to learn +#' machine learning workflow management with drake #' @docType package #' @description Machine learning workflows can be difficult to manage. #' A single round of computation can take several hours to complete, @@ -24,6 +25,7 @@ #' launch_app("drakeplanner") #' } #' @references +#' @importFrom clustermq Q #' @importFrom drake drake_plan vis_drake_graph make #' @importFrom drakeplanner drakeplanner #' @importFrom future plan future @@ -34,7 +36,6 @@ #' @importFrom lubridate duration dseconds #' @importFrom recipes bake juice #' @importFrom rmarkdown render run -#' @importFrom rprojroot find_rstudio_root_file #' @importFrom rsample initial_split testing training #' @importFrom shiny shinyApp #' @importFrom styler style_text diff --git a/inst/notebooks/.Rprofile b/inst/notebooks/.Rprofile index 8ae1d91..9ed8589 100644 --- a/inst/notebooks/.Rprofile +++ b/inst/notebooks/.Rprofile @@ -1,5 +1,4 @@ options( - drake_make_menu = FALSE, drake_clean_menu = FALSE, warnPartialMatchArgs = FALSE, crayon.enabled = FALSE, diff --git a/inst/notebooks/1-churn/1-churn.Rmd b/inst/notebooks/1-churn/1-churn.Rmd index 7aadcd0..0d247c9 100644 --- a/inst/notebooks/1-churn/1-churn.Rmd +++ b/inst/notebooks/1-churn/1-churn.Rmd @@ -21,7 +21,7 @@ source("../config/options.R") # About -This notebook introduces the workshop's underlying data analysis case study. The exercises establish the necesary packages, data, *user-defined functions*, and motivating context. +This notebook introduces the workshop's underlying data analysis case study. As you work through the setup and motivating context, pay particular attention to how the work is organized into custom user-defined functions such as `prepare_recipe()`, `define_model()`, and `train_model()`. 
# Customer churn case study @@ -56,6 +56,15 @@ library(tidyverse) library(yardstick) ``` +Check if TensorFlow is installed. If you see "Hello, TensorFlow!", you are all set. Do not worry about other console messages. + +```{r, message = FALSE} +library(tensorflow) +sess <- tf$Session() +hello <- tf$constant("Hello, TensorFlow!") +sess$run(hello) +``` + # Data The dataset has one row per customer. @@ -116,7 +125,7 @@ Let's get our data ready for the machine learning models. 1. Partition the `tenure` variable into 6 bins. 2. Take the log of `TotalCharges` (strengthens the association with `Churn`). -3. One-hot encode all categorical variables. +3. Represent each categorical variable as a collection of 0-1 indicator variables. 4. Center and scale all covariates. We translate this preprocessing workflow into a recipe ([`recipes`](https://tidymodels.github.io/recipes) package). @@ -138,7 +147,7 @@ prepare_recipe <- function(data) { } ``` -Our custom `prepare_recipe()` function creates a new recipe and applies it to a dataset. The return values is a prepped recipe. +Our custom `prepare_recipe()` function creates a new recipe and applies it to a dataset. The return value is a prepped recipe. ```{r} rec <- prepare_recipe(data) ``` @@ -199,7 +208,7 @@ define_model <- function(rec, units1, units2, act1, act2, act3) { } ``` -We write another function to compile, train, and serialize the deep neural net. +We write another function to compile and train the deep neural net. The function returns the fitted model. ```{r} train_model <- function( @@ -241,7 +250,7 @@ train_model <- function( validation_split = 0.3, verbose = 0 ) - serialize_model(model) + model } ``` @@ -252,15 +261,7 @@ model_16 <- train_model(data, rec, units1 = 16, units2 = 16) ``` ```{r} -str(model_16) -``` - -`train_model()` returns a [*serialized*](https://keras.rstudio.com/reference/serialize_model.html) model. 
The model is a string of raw bytes that we can save and then load into a different R session (important for [`drake`](https://github.com/ropensci/drake)). -We have to call [`unserialize_model()`](https://keras.rstudio.com/reference/serialize_model.html) to convert it back into a usable format. - -```{r} -unserialized_model_16 <- unserialize_model(model_16) -print(unserialized_model_16) +model_16 ``` # Performance @@ -268,8 +269,7 @@ print(unserialized_model_16) A [confusion matrix](https://www.dataschool.io/simple-guide-to-confusion-matrix-terminology) summarizes the performance of a binary classifier. Let's write a custom function to create one for a given model. ```{r} -confusion_matrix <- function(data, rec, serialized_model) { - model <- unserialize_model(serialized_model) +confusion_matrix <- function(data, rec, model) { testing_data <- bake(rec, testing(data)) x_test_tbl <- testing_data %>% select(-Churn) %>% diff --git a/inst/notebooks/2-setup/2-setup.Rmd b/inst/notebooks/2-setup/2-setup.Rmd index a851fa4..189dec2 100644 --- a/inst/notebooks/2-setup/2-setup.Rmd +++ b/inst/notebooks/2-setup/2-setup.Rmd @@ -19,25 +19,34 @@ basename(getwd()) # Should be "2-setup" source("../config/options.R") ``` - # About -After working through `1-churn.Rmd`, we have the building blocks of our data analysis project. Now, let's put the pieces together in a `drake`-powered reproducible workflow. +After working through `1-churn.Rmd`, we have the building blocks (i.e. functions) of our data analysis project. Now, let's put the pieces together in a `drake`-powered reproducible workflow. + +You may find these references on `drake` helpful (but hopefully not necessary for this notebook). 
+ +- GitHub page: +- Reference website: +- User manual: +- Example code: +- `drakeplanner` app: +- Existing presentations: +- This workshop: # Dependencies -To set up a [`drake`](http://github.com/ropensci/drake) workflow for the customer churn case study, we first load our packages and functions into the current R session. +To set up a `drake` workflow for the customer churn case study, we first load our packages and functions into the current R session. ```{r message = FALSE} source("R/packages.R") # Load the packages. source("R/functions.R") # Define our custom functions. ``` - Open up `packages.R` and `functions.R` scripts and take a look at how they are organized. +Open up `packages.R` and `functions.R` scripts and take a look at how they are organized. # Plan -Now, it is time to plan the actual workflow. If you were to write an R script, this is what it would look like. +Now, it is time to plan the actual data analysis. If you were to write an R script, this is what the workflow would look like. ```{r, eval = FALSE} # Get the data. @@ -60,23 +69,33 @@ conf <- confusion_matrix(data, rec, model) metrics <- compare_models(conf) ``` -But for [`drake`](http://github.com/ropensci/drake)-powered automation and reproducibility, we use a special data frame called a [`drake` plan](https://ropenscilabs.github.io/drake-manual/plans.html). Fill out the [`drake`](http://github.com/ropensci/drake) plan below. The "targets" (`data`, `rec`, etc.) can be in any order. +But for `drake`-powered automation and reproducibility, we use a special data frame called a "`drake` plan" (). + +Now it is your turn: fill out the `drake` plan below using the code from the previous chunk. The "targets" (`data`, `rec`, etc.) can be in any order. ```{r} plan <- drake_plan( - rec = , # Create the recipe. + # Create the recipe. + rec = , + + # Train the model. + model = target( + , # Write a call to train_model() to the left of the comma. 
+ format = "keras" # Tells drake to save the model as an HDF5 file. + ), - model = , # Train the model. + # Compare to testing data. + conf = , - conf = , # Compare to testing data. + # Compute performance metrics. + metrics = , - metrics = , # Compute performance metrics. - - data = read_csv( # Get the data. Filled in for you already. + # Get the data. Filled in for you already. + data = read_csv( file_in("../data/customer_churn.csv"), col_types = cols() ) %>% - initial_split(prop = 0.3), + initial_split(prop = 0.3) ) ``` @@ -151,9 +170,7 @@ hash <- history %>% filter(target == "model") %>% pull(hash) %>% head(n = 1) -cache <- drake_cache() -cache$get_value(hash) %>% - unserialize_model() +drake_cache()$get_value(hash) ``` # Data recovery (experimental) diff --git a/inst/notebooks/2-setup/R/functions.R b/inst/notebooks/2-setup/R/functions.R index ba907bf..65032fc 100644 --- a/inst/notebooks/2-setup/R/functions.R +++ b/inst/notebooks/2-setup/R/functions.R @@ -77,11 +77,10 @@ train_model <- function( validation_split = 0.3, verbose = 0 ) - serialize_model(model) + model } -confusion_matrix <- function(data, rec, serialized_model) { - model <- unserialize_model(serialized_model) +confusion_matrix <- function(data, rec, model) { testing_data <- bake(rec, testing(data)) x_test_tbl <- testing_data %>% select(-Churn) %>% diff --git a/inst/notebooks/3-flow/R/functions.R b/inst/notebooks/3-flow/R/functions.R index ba907bf..65032fc 100644 --- a/inst/notebooks/3-flow/R/functions.R +++ b/inst/notebooks/3-flow/R/functions.R @@ -77,11 +77,10 @@ train_model <- function( validation_split = 0.3, verbose = 0 ) - serialize_model(model) + model } -confusion_matrix <- function(data, rec, serialized_model) { - model <- unserialize_model(serialized_model) +confusion_matrix <- function(data, rec, model) { testing_data <- bake(rec, testing(data)) x_test_tbl <- testing_data %>% select(-Churn) %>% diff --git a/inst/notebooks/3-flow/R/plan.R b/inst/notebooks/3-flow/R/plan.R index 
c7b3705..c76e159 100644 --- a/inst/notebooks/3-flow/R/plan.R +++ b/inst/notebooks/3-flow/R/plan.R @@ -1,7 +1,10 @@ plan <- drake_plan( metrics = compare_models(conf), rec = prepare_recipe(data), - model = train_model(data, rec), + model = target( + train_model(data, rec), + format = "keras" + ), conf = confusion_matrix(data, rec, model), data = read_csv( file_in("../data/customer_churn.csv"), diff --git a/inst/notebooks/4-plans/4-plans.Rmd b/inst/notebooks/4-plans/4-plans.Rmd index 41da1e1..a135721 100644 --- a/inst/notebooks/4-plans/4-plans.Rmd +++ b/inst/notebooks/4-plans/4-plans.Rmd @@ -44,6 +44,7 @@ plan <- drake_plan( rec = prepare_recipe(data), model = target( train_model(data, rec, act1 = activation), + format = "keras", transform = map(activation = c("relu", "sigmoid", "softmax")) ), conf = target( @@ -107,6 +108,7 @@ drake_plan( rec = prepare_recipe(data), model = target( train_model(data, rec, units1 = units), + format = "keras", transform = map(units = units) ), conf = target( @@ -120,7 +122,7 @@ drake_plan( ) ``` -We want the **values** stored in `units`, not the name of the symbol. To get the correct plan, we use `!!` from [tidy evaluation](https://tidyeval.tidyverse.org/). +We want the **values** stored in `units`, not the name of the symbol "units". To get the correct plan, we use `!!` from [tidy evaluation](https://tidyeval.tidyverse.org/). ```{r, paged.print = FALSE} plan <- drake_plan( @@ -132,6 +134,7 @@ plan <- drake_plan( rec = prepare_recipe(data), model = target( train_model(data, rec, units1 = units), + format = "keras", transform = map(units = !!units) # Use tidy evaluation here. ), conf = target( @@ -166,6 +169,7 @@ plan <- drake_plan( rec = prepare_recipe(data), model = target( train_model(data, rec, units1 = units), + format = "keras", transform = map(units = !!units) # Use tidy evaluation here. 
), conf = target( @@ -200,6 +204,7 @@ drake_plan( rec = prepare_recipe(data), model = target( train_model(data, rec, units1 = units), + format = "keras", transform = map(units = !!units) ), conf = target( diff --git a/inst/notebooks/4-plans/R/functions.R b/inst/notebooks/4-plans/R/functions.R index ba907bf..65032fc 100644 --- a/inst/notebooks/4-plans/R/functions.R +++ b/inst/notebooks/4-plans/R/functions.R @@ -77,11 +77,10 @@ train_model <- function( validation_split = 0.3, verbose = 0 ) - serialize_model(model) + model } -confusion_matrix <- function(data, rec, serialized_model) { - model <- unserialize_model(serialized_model) +confusion_matrix <- function(data, rec, model) { testing_data <- bake(rec, testing(data)) x_test_tbl <- testing_data %>% select(-Churn) %>% diff --git a/inst/notebooks/5-files/.gitignore b/inst/notebooks/5-files/.gitignore new file mode 100644 index 0000000..22a66b1 --- /dev/null +++ b/inst/notebooks/5-files/.gitignore @@ -0,0 +1 @@ +model.h5 diff --git a/inst/notebooks/5-files/5-files.Rmd b/inst/notebooks/5-files/5-files.Rmd index 844c856..0bafc39 100644 --- a/inst/notebooks/5-files/5-files.Rmd +++ b/inst/notebooks/5-files/5-files.Rmd @@ -19,54 +19,9 @@ basename(getwd()) # Should be "5-files" source("../config/options.R") ``` -# About +# Reproducible file management -This part of the workshop aims to demonstrate how to reproducibly track input and output files with `drake`. - -# Motivation: possible storage slowness - -When `drake` runs your model, it [serializes](https://en.wikipedia.org/wiki/Serialization) the output twice: once in `keras::serialize_model()` and again in [`drake`'s storage system](https://github.com/richfitz/storr). This may slow down workflows with quick models and enormous datasets. - -Let's find out if our own workflow suffers such slowness. First, we run the workflow. - -```{r} -source("R/packages.R") -source("R/functions.R") -source("R/plan.R") -make(plan) -``` - -Then, we look at build times. 
- -```{r} -build_times() -``` - -`drake` records the time it took to fully process the model. - -```{r} -build <- build_times(model, type = "build")$elapsed -build -``` - -And the time it took just to run the command - -```{r} -command <- build_times(model, type = "command")$elapsed -command -``` - -The relative difference is the overhead incurred by `drake`. - -```{r} -sprintf("%.2f%%", 100 * (build - command) / build) -``` - -In the customer churn case study, the overhead is not so bad. However, you should perform this runtime check in your own deep learning projects. If the overhead is too high, you should avoid `drake`'s one-size-fits-all storage system and save your models to custom files. - -# How to track files reproducibly. - -For `drake` plans, there are three functions to reproducibly track files. +Some targets depend on external files. When you edit a file, you want `make()` to run all the affected targets. To tell `drake` to pay attention to files, there are special functions to identify files in your plan. Function | Purpose | Works with directories? | Works inside custom functions? ---|---|---|--- @@ -74,23 +29,26 @@ Function | Purpose | Works with directories? | Works inside custom functions? `file_out()` | Track **output** files | Yes | No `knitr_in()` | Track `knitr` report files | No | No -For now, let's focus on `file_in()` and` file_out()`. We already have an example of `file_in()` to load our data. +For now, let's focus on `file_in()` and` file_out()`. We have already seen `file_in()` in our previous plans. -```{r} +```{r, message = FALSE} +source("R/packages.R") +source("R/functions.R") +source("R/plan.R") plan %>% filter(target == "data") %>% pull(command) %>% `[[`(1) ``` -`file_in()` tells `make()` to fingerprint and track `customer_churn.csv`. As we saw in the `3-iterate` exercises, `drake` updates the downstream targets when the data file changes. +`file_in()` tells `make()` to fingerprint and track `customer_churn.csv`. 
As we saw in the `3-flow` exercises, `drake` updates the downstream targets when the data file changes. ```{r} config <- drake_config(plan) -vis_drake_graph(config, from = file_store("../data/customer_churn.csv")) +vis_drake_graph(config, from = file_store("../data/customer_churn.csv")) ``` -`file_out()` works similarly, but for output files. Output files from one target can serve as input files for other targets. +`file_out()` works similarly, but for output files. Output files from upstream targets can serve as input files to downstream targets. ```{r} example_plan <- drake_plan( @@ -107,12 +65,11 @@ config <- drake_config(example_plan) vis_drake_graph(config) ``` - # Exercise: back to customer churn -Let's save our model to a custom file. We will use [`save_model_hdf5()`](https://keras.rstudio.com/reference/save_model_hdf5.html) and `load_model_hdf5()` from [`keras`](https://keras.rstudio.com). +Instead of letting `drake` save our model automatically, let's manually save it to a custom file. That way, the return value of the target can be something else: say, the progression of the model fit over each epoch (). To manually save the model, we use the [`save_model_hdf5()`](https://keras.rstudio.com/reference/save_model_hdf5.html) and `load_model_hdf5()` from `keras`. (This is what `drake` uses when you insert `format = "keras"` in the plan.) -Follow the directions in the comments of `R/plan.R` and `R/functions.R`. If you get stuck and need to reset your files, start over with a fresh copy of the notebooks and supporting files ([`learndrake::save_notebooks()`](https://github.com/wlandau/learndrake)). You can peek at [this chapter of the manual](https://ropenscilabs.github.io/drake-manual/churn.html#increasing-efficiency) for hints. +Your turn: follow the directions in the comments of `R/plan.R` and `R/functions.R` to save the model to a reproducibly-tracked HDF5 file. 
If you get stuck and need to reset your files, start over with a fresh copy of the notebooks and supporting files ([`learndrake::save_notebooks()`](https://github.com/wlandau/learndrake)). You can peek at [this chapter of the manual](https://ropenscilabs.github.io/drake-manual/churn.html#increasing-efficiency) for hints. When you are done, check your dependency graph. @@ -121,22 +78,22 @@ source("R/packages.R") source("R/functions.R") source("R/plan.R") config <- drake_config(plan) -vis_drake_graph(config) +vis_drake_graph(config, hover = TRUE) ``` -Our graph should look something like this. +Your graph should look something like this. ```{r} readRDS("img/graph.rds") ``` -Now let's fit our model and run the downstream analyses. +Now let's run the workflow. ```{r} make(plan) ``` -Sanity check: are all our targets up to date now? +Sanity check: are all the targets up to date now? ```{r} outdated(config) @@ -146,15 +103,14 @@ outdated(config) make(plan) ``` -Our file-based approach frees us up to see the progression of the model run. +As promised, the file-based approach frees us up to see the progression of the model run. ```{r} loadd(progression) plot(progression) ``` - -Our model is in a file. +The model is in the file. ```{r} load_model_hdf5("model.h5") @@ -172,7 +128,7 @@ Which targets are outdated now? outdated(config) ``` -Which targets get rebuilt when we call `make(plan)`? +Which targets get rebuilt when you call `make(plan)`? ```{r} make(plan) diff --git a/inst/notebooks/5-files/R/functions.R b/inst/notebooks/5-files/R/functions.R index 27746c7..407e718 100644 --- a/inst/notebooks/5-files/R/functions.R +++ b/inst/notebooks/5-files/R/functions.R @@ -72,14 +72,13 @@ train_model <- function( validation_split = 0.3, verbose = 0 ) - serialize_model(model) # Replace with save_model_hdf5(model, model_file). + model # Replace with save_model_hdf5(model, model_file). # Return `progression`. } -# Rename the `serialized_model` argument to `model_file`. 
-confusion_matrix <- function(data, rec, serialized_model) { - # Replace with `model <- load_model_hdf5(model_file)`: - model <- unserialize_model(serialized_model) +# Rename the `model` argument. Call it `model_file`. +confusion_matrix <- function(data, rec, model) { + # Write `model <- load_model_hdf5(model_file)` testing_data <- bake(rec, testing(data)) x_test_tbl <- testing_data %>% select(-Churn) %>% diff --git a/inst/notebooks/5-files/R/plan.R b/inst/notebooks/5-files/R/plan.R index a33dbd7..d8f478d 100644 --- a/inst/notebooks/5-files/R/plan.R +++ b/inst/notebooks/5-files/R/plan.R @@ -5,9 +5,13 @@ plan <- drake_plan( ) %>% initial_split(prop = 0.3), rec = prepare_recipe(data), - # Set `model_file = file_out("model.h5")` in the call to train_model(). - # Name this target `progression` instead of `model`. - model = train_model(data, rec), + # Name this target `progression` instead of `model`: + model = target( + # Set `model_file = file_out("model.h5")` in the call to train_model(). + train_model(data, rec), + # The target value is no longer a model, so remove `format = "keras"`. 
+ format = "keras" + ), # Replace `model` with `file_in("model.h5")`: conf = confusion_matrix(data, rec, model), metrics = compare_models(conf) diff --git a/inst/notebooks/5-files/img/graph.rds b/inst/notebooks/5-files/img/graph.rds index 989e3a6..88836e4 100644 Binary files a/inst/notebooks/5-files/img/graph.rds and b/inst/notebooks/5-files/img/graph.rds differ diff --git a/inst/notebooks/6-reports/.gitignore b/inst/notebooks/6-reports/.gitignore new file mode 100644 index 0000000..66980c1 --- /dev/null +++ b/inst/notebooks/6-reports/.gitignore @@ -0,0 +1 @@ +churn-results.html diff --git a/inst/notebooks/6-reports/R/functions.R b/inst/notebooks/6-reports/R/functions.R index ba907bf..65032fc 100644 --- a/inst/notebooks/6-reports/R/functions.R +++ b/inst/notebooks/6-reports/R/functions.R @@ -77,11 +77,10 @@ train_model <- function( validation_split = 0.3, verbose = 0 ) - serialize_model(model) + model } -confusion_matrix <- function(data, rec, serialized_model) { - model <- unserialize_model(serialized_model) +confusion_matrix <- function(data, rec, model) { testing_data <- bake(rec, testing(data)) x_test_tbl <- testing_data %>% select(-Churn) %>% diff --git a/inst/notebooks/6-reports/R/plan.R b/inst/notebooks/6-reports/R/plan.R index 7563d27..9aec0b2 100644 --- a/inst/notebooks/6-reports/R/plan.R +++ b/inst/notebooks/6-reports/R/plan.R @@ -7,6 +7,7 @@ plan <- drake_plan( rec = prepare_recipe(data), model = target( train_model(data, rec, units1 = u1), + format = "keras", transform = map(u1 = c(16, 32, 64)) ), conf = target( diff --git a/inst/notebooks/7-hpc/.gitignore b/inst/notebooks/7-hpc/.gitignore new file mode 100644 index 0000000..66980c1 --- /dev/null +++ b/inst/notebooks/7-hpc/.gitignore @@ -0,0 +1 @@ +churn-results.html diff --git a/inst/notebooks/7-hpc/7-hpc.Rmd b/inst/notebooks/7-hpc/7-hpc.Rmd index 698664b..f45e1fb 100644 --- a/inst/notebooks/7-hpc/7-hpc.Rmd +++ b/inst/notebooks/7-hpc/7-hpc.Rmd @@ -48,7 +48,7 @@ source("R/plan.R") ``` ```{r} -make(plan) 
+make(plan) # Sometimes multicore parallelism and keras don't work together. # Only attempt if you have clustermq and ZeroMQ installed: # make(plan, parallelism = "clustermq", jobs = 2) ``` @@ -93,7 +93,7 @@ To run the pipeline with transient workers, use `parallelism = "future"` and set ```{r} clean() # Invalidate all the targets so we can run from scratch. -make(plan, parallelism = "future", jobs = 2) +make(plan, parallelism = "future", jobs = 2, caching = "worker") ``` # Tips diff --git a/inst/notebooks/7-hpc/R/functions.R b/inst/notebooks/7-hpc/R/functions.R index 7e37171..a7c9224 100644 --- a/inst/notebooks/7-hpc/R/functions.R +++ b/inst/notebooks/7-hpc/R/functions.R @@ -68,11 +68,10 @@ train_model <- function( validation_split = validation_split, verbose = 0 ) - serialize_model(model) + model } -confusion_matrix <- function(data, rec, serialized_model) { - model <- unserialize_model(serialized_model) +confusion_matrix <- function(data, rec, model) { testing_data <- bake(rec, testing(data)) x_test_tbl <- testing_data %>% select(-Churn) %>% diff --git a/inst/notebooks/7-hpc/R/plan.R b/inst/notebooks/7-hpc/R/plan.R index 5873c2a..ff52507 100644 --- a/inst/notebooks/7-hpc/R/plan.R +++ b/inst/notebooks/7-hpc/R/plan.R @@ -7,6 +7,7 @@ plan <- drake_plan( rec = prepare_recipe(data), model = target( train_model(data, rec, batch_size = batch_size_, epochs = 32), + format = "keras", transform = map(batch_size_ = c(16, 32)) ), conf = target( diff --git a/inst/slides/R/functions.R b/inst/slides/R/functions.R index 1ff30c9..a4615bd 100644 --- a/inst/slides/R/functions.R +++ b/inst/slides/R/functions.R @@ -77,11 +77,10 @@ train_model <- function( validation_split = 0.3, verbose = 0 ) - serialize_model(model) + model } -confusion_matrix <- function(data, rec, serialized_model) { - model <- unserialize_model(serialized_model) +confusion_matrix <- function(data, rec, model) { testing_data <- bake(rec, testing(data)) x_test_tbl <- testing_data %>% select(-Churn) %>% diff --git 
a/inst/slides/R/plan.R b/inst/slides/R/plan.R index e764554..4982bbe 100644 --- a/inst/slides/R/plan.R +++ b/inst/slides/R/plan.R @@ -6,6 +6,7 @@ plan <- drake_plan( rec = prepare_recipe(data), model = target( train_model(data, rec, act1 = act), + format = "keras", transform = map(act = !!activations) ), conf = target( diff --git a/inst/slides/R/plan2.R b/inst/slides/R/plan2.R index 241b5fd..8b85708 100644 --- a/inst/slides/R/plan2.R +++ b/inst/slides/R/plan2.R @@ -6,6 +6,7 @@ plan <- drake_plan( rec = prepare_recipe(data), model = target( train_model(data, rec, act1 = act), + format = "keras", transform = map(act = !!activations) ), conf = target( diff --git a/inst/slides/README.md b/inst/slides/README.md index a0c99fe..0a7ede5 100644 --- a/inst/slides/README.md +++ b/inst/slides/README.md @@ -1,3 +1,3 @@ -# Intro: machine learning workflow management with drake +# Reproducible workflows at scale with drake -This repo holds the introductory slide deck for a workshop on machine learning and [`drake`](https://github.com/ropensci/drake) at https://github.com/wlandau/drake-ws-src. +Ambitious workflows in R, such as machine learning analyses, can be difficult to manage. A single round of computation can take several hours to complete, and routine updates to the code and data tend to invalidate hard-earned results. You can enhance the maintainability, hygiene, speed, scale, and reproducibility of such projects with the drake R package. drake resolves the dependency structure of your analysis pipeline, skips tasks that are already up to date, executes the rest with optional distributed computing, and manages data storage for you. This talk demonstrates how to create and maintain a realistic machine learning project using drake-powered automation. 
diff --git a/inst/slides/index.Rmd b/inst/slides/index.Rmd index 6840c47..fe8a48e 100755 --- a/inst/slides/index.Rmd +++ b/inst/slides/index.Rmd @@ -1,5 +1,5 @@ --- -title: "Machine learning workflow management in R" +title: "Reproducible workflows at scale with drake" author: Will Landau output: xaringan::moon_reader: @@ -84,6 +84,24 @@ knitr::opts_chunk$set( ) ``` +## Large data science projects + +1. Long runtimes. +2. Many steps. +3. Steps are interconnected. + +- Deep learning +- Classical machine learning. +- Bayesian computation, e.g. Markov chain Monte Carlo. +- Spatial data analysis. +- Clinical trial modeling and simulation. +- Subgroup identification. +- Graph-based multiple comparison procedures. +- Genomics pipelines. +- PK/PD modeling (e.g. [`mrgsolve`](https://github.com/metrumresearchgroup/mrgsolve)) +- ... + +--- ## Workflows have interconnected steps.
@@ -131,21 +149,6 @@ knitr::opts_chunk$set( --- -## When do we face these issues? - -
-- Long computation! - - Clinical trial modeling and simulation - - Subgroup identification - - Bayesian data analysis - - Graph-based multiple comparison procedures - - Bayesian networks in genomics - - PK/PD modeling (e.g. [`mrgsolve`](https://github.com/metrumresearchgroup/mrgsolve)) - - **Deep learning** - - ... - ---- - ## Solution: pipeline tools
@@ -233,7 +236,7 @@ train_model <- function(data, rec, units1, units2, act1, act2, act3) { # ... } -confusion_matrix <- function(data, rec, serialized_model) { +confusion_matrix <- function(data, rec, model) { # ... } @@ -255,6 +258,7 @@ plan <- drake_plan( rec = prepare_recipe(data), model = target( train_model(data, rec, act1 = act), + format = "keras", transform = map(act = !!activations) ), conf = target( @@ -318,6 +322,7 @@ plan <- drake_plan( rec = prepare_recipe(data), model = target( train_model(data, rec, act1 = act), + format = "keras", transform = map(act = !!activations) #<< ), conf = target( @@ -380,6 +385,33 @@ make(plan) --- +## Specialized data formats + +- Increased speed and reduced memory consumption. + +```{r, eval = FALSE} +library(drake) +n <- 1e8 # Each target is 1.6 GB in memory. +plan <- drake_plan( + data_fst = target( + data.frame(x = runif(n), y = runif(n)), + format = "fst" #<< + ), + data_old = data.frame(x = runif(n), y = runif(n)) +) +make(plan) +#> target data_fst +#> target data_old +build_times(type = "build") +#> # A tibble: 2 x 4 +#> target elapsed user system +#> +#> 1 data_fst 13.93s 37.562s 7.954s #<< +#> 2 data_old 184s (~3.07 minutes) 177s (~2.95 minutes) 4.157s #<< +``` + +--- + ## History and provenance ```{r history} @@ -388,7 +420,7 @@ drake_history() --- -## Reproducible data recovery (experimental) +## Reproducible data recovery ```{r recovery} clean() # Oops! diff --git a/inst/slides/index.html b/inst/slides/index.html index c55ad74..3c0a065 100644 --- a/inst/slides/index.html +++ b/inst/slides/index.html @@ -1,7 +1,7 @@ - Machine learning workflow management in R + Reproducible workflows at scale with drake @@ -11,7 +11,7 @@