Merged
43 commits
b44be77
Change title for community call
wlandau-lilly Jul 22, 2019
4457e9f
Autodeploy slides
wlandau-lilly Jul 24, 2019
df890d2
Fix slide deployment
wlandau-lilly Jul 24, 2019
5db0df6
Integrate formats into slides
wlandau-lilly Aug 8, 2019
ad5ef71
Tweak abstract
wlandau-lilly Aug 8, 2019
67e0e63
Minor tweak
wlandau-lilly Aug 11, 2019
e54d793
Move a slide
wlandau Aug 11, 2019
4123c67
Merge branch 'tamu' into generalize-slides
wlandau Aug 11, 2019
63e4a4e
Edits
wlandau Aug 11, 2019
e11dba5
Merge branch 'master' into 10
wlandau Aug 18, 2019
b726e73
Update first notebook
wlandau Aug 18, 2019
a9ce259
Update the second notebook
wlandau Aug 18, 2019
ff6a304
More updates to the second notebook
wlandau Aug 18, 2019
dff83fa
Improve a comment
wlandau Aug 18, 2019
a2d8455
Update 3-flow
wlandau Aug 18, 2019
a729c60
Update 3-flow
wlandau Aug 18, 2019
c40d5c2
Fix lints
wlandau Aug 18, 2019
035183b
Update the fourth notebook
wlandau Aug 18, 2019
886b80f
Update 4-plans with the keras format
wlandau Aug 18, 2019
88583da
Update the files chapter
wlandau Aug 18, 2019
1929826
Update nb
wlandau Aug 18, 2019
1b3fd76
Mute a chunk
wlandau Aug 18, 2019
a321a9c
Update the files chapter
wlandau Aug 18, 2019
c21d468
Use the Keras format in 6-reports
wlandau Aug 18, 2019
b5e6045
Update functions
wlandau Aug 18, 2019
8f6044f
Update 7-hpc to keras format
wlandau Aug 18, 2019
451baa1
Update notebook
wlandau Aug 18, 2019
a25f3d9
Add .gitignore
wlandau Aug 18, 2019
3349f17
Update hpc chapter with formats
wlandau Aug 18, 2019
311def0
Bump version and fix #10
wlandau Aug 18, 2019
4895b99
Bump min drake version
wlandau Aug 18, 2019
6f493ca
Edit .Rbuildignore and fix lints
wlandau Aug 18, 2019
ef2d9a8
Add a workaround
wlandau Aug 18, 2019
afd28bc
More hpc: clustermq and another workaround
wlandau-lilly Aug 19, 2019
be114fd
Merge branch 'master' into 10
wlandau Aug 19, 2019
767d630
After #989 in drake, hpc workarounds are moot
wlandau-lilly Aug 20, 2019
137cf04
Minor edits
wlandau-lilly Aug 20, 2019
6de9d3b
Rm rprojroot
wlandau-lilly Aug 20, 2019
08dc0ba
Rm rprojroot
wlandau-lilly Aug 20, 2019
53f817c
Mute some messages
wlandau-lilly Aug 20, 2019
b4258e2
Merge branch 'master' into 10
wlandau Aug 20, 2019
803d397
Bump version
wlandau-lilly Aug 23, 2019
1c7c809
Merge branch 'master' into 10
wlandau Aug 23, 2019
26 changes: 13 additions & 13 deletions .Rbuildignore
Original file line number Diff line number Diff line change
@@ -1,18 +1,18 @@
^Thumbs\.db$
^build_site.R$
^pkgdown$
^\.DS_Store$
^\.drake$
^\.drake_history$
^\.future$
^\.git$
^\.github$
^\.gitignore$
^\.lintr$
^\.Rapp.history$
^\.RData$
^\.Rbuildignore$
^\.Rhistory$
pkgdown$
\.DS_Store$
\.drake$
\.drake_history$
\.future$
\.git$
\.github$
\.gitignore$
\.lintr$
\.Rapp.history$
\.RData$
\.Rbuildignore$
\.Rhistory$
^CODE_OF_CONDUCT.md$
^CONTRIBUTING.md$
^deploy.sh$
7 changes: 3 additions & 4 deletions DESCRIPTION
@@ -12,7 +12,7 @@ Description: Machine learning workflows can be difficult to manage.
The slides, notebooks, and Shiny apps in this package
teach how to create and maintain machine
learning projects with drake-powered automation.
Version: 0.0.2
Version: 0.0.2.9000
License: GPL-3
URL: https://github.com/wlandau/learndrake
BugReports: https://github.com/wlandau/learndrake/issues
@@ -36,7 +36,8 @@ SystemRequirements:
Depends:
R (>= 3.5.0)
Imports:
drake (>= 7.4.0.9000),
clustermq (>= 0.8.8),
drake (>= 7.6.1),
drakeplanner,
future,
future.callr,
@@ -46,7 +47,6 @@ Imports:
lubridate,
recipes,
rmarkdown,
rprojroot,
rsample,
shiny,
styler,
@@ -55,7 +55,6 @@
visNetwork,
yardstick
Suggests:
clustermq,
prettycode,
shinytest,
tidyselect (>= 0.2.4),
2 changes: 1 addition & 1 deletion NAMESPACE
@@ -5,6 +5,7 @@ export(save_app)
export(save_notebooks)
export(save_slides)
export(view_slides)
importFrom(clustermq,Q)
importFrom(drake,drake_plan)
importFrom(drake,make)
importFrom(drake,vis_drake_graph)
@@ -22,7 +23,6 @@ importFrom(recipes,bake)
importFrom(recipes,juice)
importFrom(rmarkdown,render)
importFrom(rmarkdown,run)
importFrom(rprojroot,find_rstudio_root_file)
importFrom(rsample,initial_split)
importFrom(rsample,testing)
importFrom(rsample,training)
5 changes: 3 additions & 2 deletions R/package.R
@@ -1,4 +1,5 @@
#' learndrake: materials to learn machine learning workflow management with drake
#' learndrake: materials to learn
#' machine learning workflow management with drake
#' @docType package
#' @description Machine learning workflows can be difficult to manage.
#' A single round of computation can take several hours to complete,
@@ -24,6 +25,7 @@
#' launch_app("drakeplanner")
#' }
#' @references <https://github.com/wlandau/learndrake>
#' @importFrom clustermq Q
#' @importFrom drake drake_plan vis_drake_graph make
#' @importFrom drakeplanner drakeplanner
#' @importFrom future plan future
@@ -34,7 +36,6 @@
#' @importFrom lubridate duration dseconds
#' @importFrom recipes bake juice
#' @importFrom rmarkdown render run
#' @importFrom rprojroot find_rstudio_root_file
#' @importFrom rsample initial_split testing training
#' @importFrom shiny shinyApp
#' @importFrom styler style_text
1 change: 0 additions & 1 deletion inst/notebooks/.Rprofile
@@ -1,5 +1,4 @@
options(
drake_make_menu = FALSE,
drake_clean_menu = FALSE,
warnPartialMatchArgs = FALSE,
crayon.enabled = FALSE,
32 changes: 16 additions & 16 deletions inst/notebooks/1-churn/1-churn.Rmd
@@ -21,7 +21,7 @@ source("../config/options.R")

# About

This notebook introduces the workshop's underlying data analysis case study. The exercises establish the necesary packages, data, *user-defined functions*, and motivating context.
This notebook introduces the workshop's underlying data analysis case study. As you work through the setup and motivating context, pay particular attention to how the work is organized into custom user-defined functions such as `prepare_recipe()`, `define_model()`, and `train_model()`.

# Customer churn case study

@@ -56,6 +56,15 @@ library(tidyverse)
library(yardstick)
```

Check that TensorFlow is installed. If you see "Hello, TensorFlow!", you are all set. Do not worry about other console messages.

```{r, message = FALSE}
library(tensorflow)
sess <- tf$Session()
hello <- tf$constant("Hello, TensorFlow!")
sess$run(hello)
```

# Data

The dataset has one row per customer.
@@ -116,7 +125,7 @@ Let's get our data ready for the machine learning models.

1. Partition the `tenure` variable into 6 bins.
2. Take the log of `TotalCharges` (strengthens the association with `Churn`).
3. One-hot encode all categorical variables.
3. Represent each categorical variable as a collection of 0-1 indicator variables.
4. Center and scale all covariates.
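As a quick aside on step 3 (a sketch for intuition, not part of the notebook's code), base R's `model.matrix()` shows what those 0-1 indicator columns look like for a hypothetical categorical variable:

```r
# Hypothetical example: 0-1 indicator (one-hot) columns for a factor.
x <- factor(c("DSL", "Fiber optic", "No"))
model.matrix(~ x - 1)  # one 0-1 column per level; "- 1" drops the intercept
```

Each row has exactly one 1, marking the level that observation belongs to.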

We translate this preprocessing workflow into a recipe ([`recipes`](https://tidymodels.github.io/recipes) package).
@@ -138,7 +147,7 @@ prepare_recipe <- function(data) {
}
```

Our custom `prepare_recipe()` function creates a new recipe and applies it to a dataset. The return values is a prepped recipe.
Our custom `prepare_recipe()` function creates a new recipe and applies it to a dataset. The return value is a prepped recipe.

```{r}
rec <- prepare_recipe(data)
@@ -199,7 +208,7 @@ define_model <- function(rec, units1, units2, act1, act2, act3) {
}
```

We write another function to compile, train, and serialize the deep neural net.
We write another function to compile and train the deep neural net. The function returns the fitted model.

```{r}
train_model <- function(
@@ -241,7 +250,7 @@
validation_split = 0.3,
verbose = 0
)
serialize_model(model)
model
}
```

@@ -252,24 +261,15 @@ model_16 <- train_model(data, rec, units1 = 16, units2 = 16)
```

```{r}
str(model_16)
```

`train_model()` returns a [*serialized*](https://keras.rstudio.com/reference/serialize_model.html) model. The model is a string of raw bytes that we can save and then load into a different R session (important for [`drake`](https://github.com/ropensci/drake)).
We have to call [`unserialize_model()`](https://keras.rstudio.com/reference/serialize_model.html) to convert it back into a usable format.

```{r}
unserialized_model_16 <- unserialize_model(model_16)
print(unserialized_model_16)
model_16
```

# Performance

A [confusion matrix](https://www.dataschool.io/simple-guide-to-confusion-matrix-terminology) summarizes the performance of a binary classifier. Let's write a custom function to create one for a given model.
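Before the function itself, here is the idea in miniature (toy labels, not the churn data): a confusion matrix is just a cross-tabulation of predicted classes against observed classes.

```r
# Toy predicted/observed labels, for illustration only.
truth    <- factor(c("yes", "no", "yes", "no", "yes"))
estimate <- factor(c("yes", "no", "no",  "no", "yes"))
table(estimate, truth)  # rows = predictions, columns = observed classes
```

The diagonal counts correct predictions; off-diagonal cells are the false positives and false negatives.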

```{r}
confusion_matrix <- function(data, rec, serialized_model) {
model <- unserialize_model(serialized_model)
confusion_matrix <- function(data, rec, model) {
testing_data <- bake(rec, testing(data))
x_test_tbl <- testing_data %>%
select(-Churn) %>%
49 changes: 33 additions & 16 deletions inst/notebooks/2-setup/2-setup.Rmd
@@ -19,25 +19,34 @@ basename(getwd()) # Should be "2-setup"
source("../config/options.R")
```


# About

After working through `1-churn.Rmd`, we have the building blocks of our data analysis project. Now, let's put the pieces together in a `drake`-powered reproducible workflow.
After working through `1-churn.Rmd`, we have the building blocks (i.e. functions) of our data analysis project. Now, let's put the pieces together in a `drake`-powered reproducible workflow.

You may find these references on `drake` helpful (but hopefully not necessary for this notebook).

- GitHub page: <https://github.com/ropensci/drake>
- Reference website: <https://ropensci.github.io/drake/>
- User manual: <https://ropenscilabs.github.io/drake-manual/>
- Example code: <https://github.com/wlandau/drake-examples>
- `drakeplanner` app: <https://github.com/wlandau/drakeplanner>
- Existing presentations: <https://ropenscilabs.github.io/drake-manual/index.html#presentations>
- This workshop: <https://github.com/wlandau/learndrake>

# Dependencies

To set up a [`drake`](http://github.com/ropensci/drake) workflow for the customer churn case study, we first load our packages and functions into the current R session.
To set up a `drake` workflow for the customer churn case study, we first load our packages and functions into the current R session.

```{r message = FALSE}
source("R/packages.R") # Load the packages.
source("R/functions.R") # Define our custom functions.
```

Open up the `packages.R` and `functions.R` scripts and take a look at how they are organized.

# Plan

Now, it is time to plan the actual workflow. If you were to write an R script, this is what it would look like.
Now, it is time to plan the actual data analysis. If you were to write an R script, this is what the workflow would look like.

```{r, eval = FALSE}
# Get the data.
@@ -60,23 +69,33 @@ conf <- confusion_matrix(data, rec, model)
metrics <- compare_models(conf)
```

But for [`drake`](http://github.com/ropensci/drake)-powered automation and reproducibility, we use a special data frame called a [`drake` plan](https://ropenscilabs.github.io/drake-manual/plans.html). Fill out the [`drake`](http://github.com/ropensci/drake) plan below. The "targets" (`data`, `rec`, etc.) can be in any order.
But for `drake`-powered automation and reproducibility, we use a special data frame called a "`drake` plan" (<https://ropenscilabs.github.io/drake-manual/plans.html>).

Now it is your turn: fill out the `drake` plan below using the code from the previous chunk. The "targets" (`data`, `rec`, etc.) can be in any order.

```{r}
plan <- drake_plan(
rec = , # Create the recipe.
# Create the recipe.
rec = ,

# Train the model.
model = target(
, # Write a call to train_model() to the left of the comma.
format = "keras" # Tells drake to save the model as an HDF5 file.
),

model = , # Train the model.
# Compare to testing data.
conf = ,

conf = , # Compare to testing data.
# Compute performance metrics.
metrics = ,

metrics = , # Compute performance metrics.

data = read_csv( # Get the data. Filled in for you already.
# Get the data. Filled in for you already.
data = read_csv(
file_in("../data/customer_churn.csv"),
col_types = cols()
) %>%
initial_split(prop = 0.3),
initial_split(prop = 0.3)
)
```
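If you get stuck, one possible completed plan is sketched below (spoiler: it mirrors `inst/notebooks/3-flow/R/plan.R` elsewhere in this PR). `drake_plan()` only quotes the commands, so building the plan works even before `train_model()` and friends are defined; running it requires the packages and functions from `R/packages.R` and `R/functions.R`.

```r
# One possible solution sketch (spoiler). drake_plan() quotes commands,
# so only drake is needed to build the plan object itself.
library(drake)

plan <- drake_plan(
  rec = prepare_recipe(data),
  model = target(
    train_model(data, rec),
    format = "keras"  # save the fitted model as an HDF5 file
  ),
  conf = confusion_matrix(data, rec, model),
  metrics = compare_models(conf),
  data = read_csv(
    file_in("../data/customer_churn.csv"),
    col_types = cols()
  ) %>%
    initial_split(prop = 0.3)
)
```

Remember, target order in the plan does not matter: `drake` works out the dependency graph itself.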

@@ -151,9 +170,7 @@ hash <- history %>%
filter(target == "model") %>%
pull(hash) %>%
head(n = 1)
cache <- drake_cache()
cache$get_value(hash) %>%
unserialize_model()
drake_cache()$get_value(hash)
```

# Data recovery (experimental)
5 changes: 2 additions & 3 deletions inst/notebooks/2-setup/R/functions.R
@@ -77,11 +77,10 @@ train_model <- function(
validation_split = 0.3,
verbose = 0
)
serialize_model(model)
model
}

confusion_matrix <- function(data, rec, serialized_model) {
model <- unserialize_model(serialized_model)
confusion_matrix <- function(data, rec, model) {
testing_data <- bake(rec, testing(data))
x_test_tbl <- testing_data %>%
select(-Churn) %>%
5 changes: 2 additions & 3 deletions inst/notebooks/3-flow/R/functions.R
@@ -77,11 +77,10 @@ train_model <- function(
validation_split = 0.3,
verbose = 0
)
serialize_model(model)
model
}

confusion_matrix <- function(data, rec, serialized_model) {
model <- unserialize_model(serialized_model)
confusion_matrix <- function(data, rec, model) {
testing_data <- bake(rec, testing(data))
x_test_tbl <- testing_data %>%
select(-Churn) %>%
5 changes: 4 additions & 1 deletion inst/notebooks/3-flow/R/plan.R
@@ -1,7 +1,10 @@
plan <- drake_plan(
metrics = compare_models(conf),
rec = prepare_recipe(data),
model = train_model(data, rec),
model = target(
train_model(data, rec),
format = "keras"
),
conf = confusion_matrix(data, rec, model),
data = read_csv(
file_in("../data/customer_churn.csv"),
7 changes: 6 additions & 1 deletion inst/notebooks/4-plans/4-plans.Rmd
@@ -44,6 +44,7 @@ plan <- drake_plan(
rec = prepare_recipe(data),
model = target(
train_model(data, rec, act1 = activation),
format = "keras",
transform = map(activation = c("relu", "sigmoid", "softmax"))
),
conf = target(
@@ -107,6 +108,7 @@ drake_plan(
rec = prepare_recipe(data),
model = target(
train_model(data, rec, units1 = units),
format = "keras",
transform = map(units = units)
),
conf = target(
@@ -120,7 +122,7 @@
)
```

We want the **values** stored in `units`, not the name of the symbol. To get the correct plan, we use `!!` from [tidy evaluation](https://tidyeval.tidyverse.org/).
We want the **values** stored in `units`, not the name of the symbol "units". To get the correct plan, we use `!!` from [tidy evaluation](https://tidyeval.tidyverse.org/).
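The same values-versus-symbol distinction can be seen outside of `drake` with `rlang` (a sketch for intuition; `train_model()` here is just a quoted call, nothing is evaluated):

```r
library(rlang)
units <- c(16, 32)
quo(train_model(units))    # captures the *symbol* `units`
quo(train_model(!!units))  # !! splices in the *values* c(16, 32)
```

`drake` applies the same `!!` semantics inside `transform = map(...)`, which is why the unquoted version below generates one target per value.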

```{r, paged.print = FALSE}
plan <- drake_plan(
@@ -132,6 +134,7 @@
rec = prepare_recipe(data),
model = target(
train_model(data, rec, units1 = units),
format = "keras",
transform = map(units = !!units) # Use tidy evaluation here.
),
conf = target(
@@ -166,6 +169,7 @@ plan <- drake_plan(
rec = prepare_recipe(data),
model = target(
train_model(data, rec, units1 = units),
format = "keras",
transform = map(units = !!units) # Use tidy evaluation here.
),
conf = target(
@@ -200,6 +204,7 @@ drake_plan(
rec = prepare_recipe(data),
model = target(
train_model(data, rec, units1 = units),
format = "keras",
transform = map(units = !!units)
),
conf = target(
5 changes: 2 additions & 3 deletions inst/notebooks/4-plans/R/functions.R
@@ -77,11 +77,10 @@ train_model <- function(
validation_split = 0.3,
verbose = 0
)
serialize_model(model)
model
}

confusion_matrix <- function(data, rec, serialized_model) {
model <- unserialize_model(serialized_model)
confusion_matrix <- function(data, rec, model) {
testing_data <- bake(rec, testing(data))
x_test_tbl <- testing_data %>%
select(-Churn) %>%
1 change: 1 addition & 0 deletions inst/notebooks/5-files/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
model.h5