From d6a7f19fe1a576566c1db60745c7215b98c9f26b Mon Sep 17 00:00:00 2001 From: wlandau Date: Tue, 3 Dec 2019 04:16:06 -0500 Subject: [PATCH 1/5] Implement use_scratch for RDS storrs --- NEWS.md | 1 + R/driver_rds.R | 28 +++++++++++++++++++--------- R/hash.R | 30 +++++++++++++++++++----------- tests/testthat/test-driver-rds.R | 8 ++++++++ 4 files changed, 47 insertions(+), 20 deletions(-) diff --git a/NEWS.md b/NEWS.md index b5b0ab6..72e9acb 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,6 +1,7 @@ ## storr 1.2.2 (2018-??-??) * Speed up the `$get_hash()` method of RDS drivers using C code and traits (#96, #98, @wlandau). +* Allow users to bypass scratch in RDS `storr`s (#116, @wlandau). ## storr 1.2.1 (2018-10-18) diff --git a/R/driver_rds.R b/R/driver_rds.R index 5a30f59..f5c61ac 100644 --- a/R/driver_rds.R +++ b/R/driver_rds.R @@ -74,6 +74,11 @@ ##' \code{\link{digest}}. If not given, then we will default to ##' "md5". ##' +##' @param use_scratch Logical, whether to create data and key files +##' in a scratch directory before moving them to their final destinations. +##' Set to \code{TRUE} to ensure atomic reads and writes, or set to +##' \code{FALSE} to increase speed on some file systems. +##' ##' @param default_namespace Default namespace (see ##' \code{\link{storr}}). ##' @export @@ -110,16 +115,17 @@ ##' st2$destroy() storr_rds <- function(path, compress = NULL, mangle_key = NULL, mangle_key_pad = NULL, hash_algorithm = NULL, - default_namespace = "objects") { - storr(driver_rds(path, compress, mangle_key, mangle_key_pad, hash_algorithm), + default_namespace = "objects", use_scratch = TRUE) { + storr(driver_rds(path, compress, mangle_key, mangle_key_pad, hash_algorithm, use_scratch), default_namespace) } ##' @export ##' @rdname storr_rds driver_rds <- function(path, compress = NULL, mangle_key = NULL, - mangle_key_pad = NULL, hash_algorithm = NULL) { - R6_driver_rds$new(path, compress, mangle_key, mangle_key_pad, hash_algorithm) + mangle_key_pad = NULL, hash_algorithm = NULL, + use_scratch = TRUE) { + R6_driver_rds$new(path, compress, mangle_key, mangle_key_pad, hash_algorithm, use_scratch) } R6_driver_rds <- R6::R6Class( @@ -129,6 +135,7 @@ R6_driver_rds <- R6::R6Class( ## This needs sorting before anyone writes their own driver! path = NULL, path_scratch = NULL, + use_scratch = NULL, compress = NULL, mangle_key = NULL, mangle_key_pad = NULL, @@ -137,7 +144,7 @@ R6_driver_rds <- R6::R6Class( traits = list(accept = "raw", throw_missing = TRUE), initialize = function(path, compress, mangle_key, mangle_key_pad, - hash_algorithm) { + hash_algorithm, use_scratch) { is_new <- !file.exists(file.path(path, "config")) dir_create(path) dir_create(file.path(path, "data")) @@ -145,8 +152,11 @@ R6_driver_rds <- R6::R6Class( dir_create(file.path(path, "config")) self$path <- normalizePath(path, mustWork = TRUE) - self$path_scratch <- file.path(self$path, "scratch") - dir_create(self$path_scratch) + if (use_scratch) { + self$path_scratch <- file.path(self$path, "scratch") + dir_create(self$path_scratch) + } + self$use_scratch <- use_scratch ## This is a bit of complicated dancing around to mantain ## backward compatibility while allowing better defaults in @@ -207,7 +217,7 @@ R6_driver_rds <- R6::R6Class( set_hash = function(key, namespace, hash) { dir_create(self$name_key("", namespace)) write_lines(hash, self$name_key(key, namespace), - scratch_dir = self$path_scratch) + scratch_dir = self$path_scratch, use_scratch = self$use_scratch) }, get_object = function(hash) { @@ -219,7 +229,7 @@ R6_driver_rds <- R6::R6Class( ## already and avoids seralising twice. assert_raw(value) write_serialized_rds(value, self$name_hash(hash), self$compress, - self$path_scratch) + self$path_scratch, use_scratch = self$use_scratch) }, exists_hash = function(key, namespace) { diff --git a/R/hash.R b/R/hash.R index 80fb64e..d68b8a9 100644 --- a/R/hash.R +++ b/R/hash.R @@ -73,9 +73,10 @@ serialize_to_raw <- function(x, ascii, xdr) { ## an object which cannot be retrieved later on, causing havoc ## upstream. write_serialized_rds <- function(value, filename, compress, - scratch_dir = NULL, long = 2^31 - 2) { + scratch_dir = NULL, long = 2^31 - 2, + use_scratch = TRUE) { withCallingHandlers( - try_write_serialized_rds(value, filename, compress, scratch_dir, long), + try_write_serialized_rds(value, filename, compress, scratch_dir, long, use_scratch), error = function(e) unlink(filename)) } @@ -84,9 +85,10 @@ write_serialized_rds <- function(value, filename, compress, ## close the connection on exit from try_write_serialized_rds and ## delete the file *after* that. try_write_serialized_rds <- function(value, filename, compress, - scratch_dir = NULL, long = 2^31 - 2) { - tmp <- tempfile(tmpdir = scratch_dir %||% tempdir()) + scratch_dir = NULL, long = 2^31 - 2, + use_scratch = TRUE) { + tmp <- ifelse(use_scratch, tempfile(tmpdir = scratch_dir %||% tempdir()), filename) con <- (if (compress) gzfile else file)(tmp, "wb") needs_close <- TRUE on.exit(if (needs_close) close(con), add = TRUE) @@ -99,16 +101,20 @@ try_write_serialized_rds <- function(value, filename, compress, } close(con) needs_close <- FALSE - file.rename(tmp, filename) + if (use_scratch) { + file.rename(tmp, filename) + } } ## Same pattern for write_lines. The difference is that this will ## delete the key on a failed write (otherwise there's a copy ## involved) -write_lines <- function(text, filename, ..., scratch_dir = NULL) { +write_lines <- function(text, filename, ..., + scratch_dir = NULL, use_scratch = TRUE) { withCallingHandlers( - try_write_lines(text, filename, ..., scratch_dir = scratch_dir), + try_write_lines(text, filename, ..., scratch_dir = scratch_dir, + use_scratch = use_scratch), error = function(e) unlink(filename)) } @@ -116,10 +122,12 @@ write_lines <- function(text, filename, ..., scratch_dir = NULL) { ## This implements write-then-move for writeLines, which gives us ## atomic writes and rewrites. If 'scratch' is on the same filesystem ## as dirname(filename), then the os's rename is atomic -try_write_lines <- function(text, filename, ..., scratch_dir) { - tmp <- tempfile(tmpdir = scratch_dir %||% tempdir()) +try_write_lines <- function(text, filename, ..., scratch_dir, use_scratch) { + tmp <- ifelse(use_scratch, tempfile(tmpdir = scratch_dir %||% tempdir()), filename) writeLines(text, tmp, ...) ## Not 100% necessary and strictly makes this nonatomic - unlink(filename) - file.rename(tmp, filename) + if (use_scratch) { + unlink(filename) + file.rename(tmp, filename) + } } diff --git a/tests/testthat/test-driver-rds.R b/tests/testthat/test-driver-rds.R index eaf6063..a108578 100644 --- a/tests/testthat/test-driver-rds.R +++ b/tests/testthat/test-driver-rds.R @@ -381,3 +381,11 @@ test_that("avoid race condition when writing in parallel", { ok <- vlapply(1:10, function(i) racy_write()) expect_true(all(ok)) }) + +test_that("use_scratch = FALSE (#116)", { + st <- storr_rds(tempfile(), use_scratch = FALSE) + st$set("a", "a") + expect_equal(st$get("a"), "a") + expect_null(st$driver$path_scratch) + expect_false(st$driver$use_scratch) +}) From 7459b4850d47d13445e8206345a6029b4e5db832 Mon Sep 17 00:00:00 2001 From: wlandau-lilly Date: Tue, 3 Dec 2019 11:36:53 -0500 Subject: [PATCH 2/5] Recreate docs --- DESCRIPTION | 2 +- man/storr.Rd | 2 +- man/storr_dbi.Rd | 23 ++++++++++++++++++----- man/storr_environment.Rd | 7 +++++-- man/storr_rds.Rd | 29 +++++++++++++++++++++++------ 5 files changed, 48 insertions(+), 15 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 306def7..410a0f3 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -32,5 +32,5 @@ Suggests: rbenchmark, testthat (>= 1.0.0) VignetteBuilder: knitr -RoxygenNote: 6.1.1 +RoxygenNote: 7.0.0 Encoding: UTF-8 diff --git a/man/storr.Rd b/man/storr.Rd index 4d92b76..116857a 100644 --- a/man/storr.Rd +++ b/man/storr.Rd @@ -11,7 +11,7 @@ storr(driver, default_namespace = "objects") \item{default_namespace}{Default namespace to store objects in. By default \code{"objects"} is used, but this might be useful to -have two diffent \code{storr} objects pointing at the same +have two different \code{storr} objects pointing at the same underlying storage, but storing things in different namespaces.} } \description{ diff --git a/man/storr_dbi.Rd b/man/storr_dbi.Rd index efba7b3..39ffe9a 100644 --- a/man/storr_dbi.Rd +++ b/man/storr_dbi.Rd @@ -5,11 +5,24 @@ \alias{driver_dbi} \title{DBI storr driver} \usage{ -storr_dbi(tbl_data, tbl_keys, con, args = NULL, binary = NULL, - hash_algorithm = NULL, default_namespace = "objects") - -driver_dbi(tbl_data, tbl_keys, con, args = NULL, binary = NULL, - hash_algorithm = NULL) +storr_dbi( + tbl_data, + tbl_keys, + con, + args = NULL, + binary = NULL, + hash_algorithm = NULL, + default_namespace = "objects" +) + +driver_dbi( + tbl_data, + tbl_keys, + con, + args = NULL, + binary = NULL, + hash_algorithm = NULL +) } \arguments{ \item{tbl_data}{Name for the table that maps hashes to values} diff --git a/man/storr_environment.Rd b/man/storr_environment.Rd index 101c346..7881073 100644 --- a/man/storr_environment.Rd +++ b/man/storr_environment.Rd @@ -5,8 +5,11 @@ \alias{driver_environment} \title{Environment object cache driver} \usage{ -storr_environment(envir = NULL, hash_algorithm = NULL, - default_namespace = "objects") +storr_environment( + envir = NULL, + hash_algorithm = NULL, + default_namespace = "objects" +) driver_environment(envir = NULL, hash_algorithm = NULL) } diff --git a/man/storr_rds.Rd b/man/storr_rds.Rd index a5c4e23..aa355b4 100644 --- a/man/storr_rds.Rd +++ b/man/storr_rds.Rd @@ -5,12 +5,24 @@ \alias{driver_rds} \title{rds object cache driver} \usage{ -storr_rds(path, compress = NULL, mangle_key = NULL, - mangle_key_pad = NULL, hash_algorithm = NULL, - default_namespace = "objects") - -driver_rds(path, compress = NULL, mangle_key = NULL, - mangle_key_pad = NULL, hash_algorithm = NULL) +storr_rds( + path, + compress = NULL, + mangle_key = NULL, + mangle_key_pad = NULL, + hash_algorithm = NULL, + default_namespace = "objects", + use_scratch = TRUE +) + +driver_rds( + path, + compress = NULL, + mangle_key = NULL, + mangle_key_pad = NULL, + hash_algorithm = NULL, + use_scratch = TRUE +) } \arguments{ \item{path}{Path for the store. \code{tempdir()} is a good choice @@ -39,6 +51,11 @@ values are "md5", "sha1", and others supported by \item{default_namespace}{Default namespace (see \code{\link{storr}}).} + +\item{use_scratch}{Logical, whether to create data and key files +in a scratch directory before moving them to their final destinations. +Set to \code{TRUE} to ensure atomic reads and writes, or set to +\code{FALSE} to increase speed on some file systems.} } \description{ Object cache driver that saves objects using R's native From 67d8c742e4df94f7525146e72039f3b99a5f754b Mon Sep 17 00:00:00 2001 From: wlandau-lilly Date: Thu, 5 Dec 2019 11:20:59 -0500 Subject: [PATCH 3/5] Always create scratch dir --- R/driver_rds.R | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/R/driver_rds.R b/R/driver_rds.R index f5c61ac..b5eb9d1 100644 --- a/R/driver_rds.R +++ b/R/driver_rds.R @@ -151,11 +151,8 @@ R6_driver_rds <- R6::R6Class( dir_create(file.path(path, "keys")) dir_create(file.path(path, "config")) self$path <- normalizePath(path, mustWork = TRUE) - - if (use_scratch) { - self$path_scratch <- file.path(self$path, "scratch") - dir_create(self$path_scratch) - } + self$path_scratch <- file.path(self$path, "scratch") + dir_create(self$path_scratch) self$use_scratch <- use_scratch ## This is a bit of complicated dancing around to mantain From c3f9efd3f550853dbb645598092be7566d766388 Mon Sep 17 00:00:00 2001 From: wlandau-lilly Date: Thu, 5 Dec 2019 11:30:03 -0500 Subject: [PATCH 4/5] Update a new test --- tests/testthat/test-driver-rds.R | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/testthat/test-driver-rds.R b/tests/testthat/test-driver-rds.R index a108578..e3a9af0 100644 --- a/tests/testthat/test-driver-rds.R +++ b/tests/testthat/test-driver-rds.R @@ -386,6 +386,5 @@ test_that("use_scratch = FALSE (#116)", { st <- storr_rds(tempfile(), use_scratch = FALSE) st$set("a", "a") expect_equal(st$get("a"), "a") - expect_null(st$driver$path_scratch) expect_false(st$driver$use_scratch) }) From c250d67bd413e5280ffef25f01c6528fe8d2409e Mon Sep 17 00:00:00 2001 From: wlandau-lilly Date: Fri, 6 Dec 2019 20:34:20 -0500 Subject: [PATCH 5/5] Always use scratch for data --- NEWS.md | 2 +- R/driver_rds.R | 26 +++++++++++++------------- R/hash.R | 24 ++++++++++-------------- man/storr_rds.Rd | 10 +++++----- tests/testthat/test-driver-rds.R | 6 +++--- 5 files changed, 32 insertions(+), 36 deletions(-) diff --git a/NEWS.md b/NEWS.md index 72e9acb..72e8614 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,7 +1,7 @@ ## storr 1.2.2 (2018-??-??) * Speed up the `$get_hash()` method of RDS drivers using C code and traits (#96, #98, @wlandau). -* Allow users to bypass scratch in RDS `storr`s (#116, @wlandau). +* Add a new `use_scratch_keys` argument to `driver_rds()` and `storr_rds()`. Allows users to bypass the `scratch` directory for keys. Speeds up processing on some file systems (#116, @wlandau). ## storr 1.2.1 (2018-10-18) diff --git a/R/driver_rds.R b/R/driver_rds.R index b5eb9d1..7a14f5e 100644 --- a/R/driver_rds.R +++ b/R/driver_rds.R @@ -74,10 +74,10 @@ ##' \code{\link{digest}}. If not given, then we will default to ##' "md5". ##' -##' @param use_scratch Logical, whether to create data and key files +##' @param use_scratch_keys Logical, whether to create key files ##' in a scratch directory before moving them to their final destinations. -##' Set to \code{TRUE} to ensure atomic reads and writes, or set to -##' \code{FALSE} to increase speed on some file systems. +##' Set to \code{TRUE} to ensure atomic reads and writes for keys, or set to +##' \code{FALSE} to increase speed on some platforms (namely Windows). ##' ##' @param default_namespace Default namespace (see ##' \code{\link{storr}}). @@ -115,8 +115,8 @@ ##' st2$destroy() storr_rds <- function(path, compress = NULL, mangle_key = NULL, mangle_key_pad = NULL, hash_algorithm = NULL, - default_namespace = "objects", use_scratch = TRUE) { - storr(driver_rds(path, compress, mangle_key, mangle_key_pad, hash_algorithm, use_scratch), + default_namespace = "objects", use_scratch_keys = TRUE) { + storr(driver_rds(path, compress, mangle_key, mangle_key_pad, hash_algorithm, use_scratch_keys), default_namespace) } @@ -124,8 +124,8 @@ storr_rds <- function(path, compress = NULL, mangle_key = NULL, ##' @rdname storr_rds driver_rds <- function(path, compress = NULL, mangle_key = NULL, mangle_key_pad = NULL, hash_algorithm = NULL, - use_scratch = TRUE) { - R6_driver_rds$new(path, compress, mangle_key, mangle_key_pad, hash_algorithm, use_scratch) + use_scratch_keys = TRUE) { + R6_driver_rds$new(path, compress, mangle_key, mangle_key_pad, hash_algorithm, use_scratch_keys) } R6_driver_rds <- R6::R6Class( @@ -135,7 +135,7 @@ R6_driver_rds <- R6::R6Class( ## This needs sorting before anyone writes their own driver! path = NULL, path_scratch = NULL, - use_scratch = NULL, + use_scratch_keys = NULL, compress = NULL, mangle_key = NULL, mangle_key_pad = NULL, @@ -144,7 +144,7 @@ R6_driver_rds <- R6::R6Class( traits = list(accept = "raw", throw_missing = TRUE), initialize = function(path, compress, mangle_key, mangle_key_pad, - hash_algorithm, use_scratch) { + hash_algorithm, use_scratch_keys) { is_new <- !file.exists(file.path(path, "config")) dir_create(path) dir_create(file.path(path, "data")) @@ -153,7 +153,7 @@ R6_driver_rds <- R6::R6Class( self$path <- normalizePath(path, mustWork = TRUE) self$path_scratch <- file.path(self$path, "scratch") dir_create(self$path_scratch) - self$use_scratch <- use_scratch + self$use_scratch_keys <- use_scratch_keys ## This is a bit of complicated dancing around to mantain ## backward compatibility while allowing better defaults in @@ -194,7 +194,7 @@ R6_driver_rds <- R6::R6Class( } self$hash_algorithm <- driver_rds_config(path, "hash_algorithm", hash_algorithm, "md5", TRUE) - + self$hash_length <- nchar( digest::digest(as.raw(0x00), self$hash_algorithm, serialize = FALSE)) }, @@ -214,7 +214,7 @@ R6_driver_rds <- R6::R6Class( set_hash = function(key, namespace, hash) { dir_create(self$name_key("", namespace)) write_lines(hash, self$name_key(key, namespace), - scratch_dir = self$path_scratch, use_scratch = self$use_scratch) + scratch_dir = self$path_scratch, use_scratch_keys = self$use_scratch_keys) }, get_object = function(hash) { @@ -226,7 +226,7 @@ R6_driver_rds <- R6::R6Class( ## already and avoids seralising twice. assert_raw(value) write_serialized_rds(value, self$name_hash(hash), self$compress, - self$path_scratch, use_scratch = self$use_scratch) + self$path_scratch) }, exists_hash = function(key, namespace) { diff --git a/R/hash.R b/R/hash.R index d68b8a9..a3bd5f0 100644 --- a/R/hash.R +++ b/R/hash.R @@ -73,10 +73,9 @@ serialize_to_raw <- function(x, ascii, xdr) { ## an object which cannot be retrieved later on, causing havoc ## upstream. write_serialized_rds <- function(value, filename, compress, - scratch_dir = NULL, long = 2^31 - 2, - use_scratch = TRUE) { + scratch_dir = NULL, long = 2^31 - 2) { withCallingHandlers( - try_write_serialized_rds(value, filename, compress, scratch_dir, long, use_scratch), + try_write_serialized_rds(value, filename, compress, scratch_dir, long), error = function(e) unlink(filename)) } @@ -85,10 +84,9 @@ write_serialized_rds <- function(value, filename, compress, ## close the connection on exit from try_write_serialized_rds and ## delete the file *after* that. try_write_serialized_rds <- function(value, filename, compress, - scratch_dir = NULL, long = 2^31 - 2, - use_scratch = TRUE) { + scratch_dir = NULL, long = 2^31 - 2) { - tmp <- ifelse(use_scratch, tempfile(tmpdir = scratch_dir %||% tempdir()), filename) + tmp <- tempfile(tmpdir = scratch_dir %||% tempdir()) con <- (if (compress) gzfile else file)(tmp, "wb") needs_close <- TRUE on.exit(if (needs_close) close(con), add = TRUE) @@ -101,9 +99,7 @@ try_write_serialized_rds <- function(value, filename, compress, } close(con) needs_close <- FALSE - if (use_scratch) { - file.rename(tmp, filename) - } + file.rename(tmp, filename) } @@ -111,10 +107,10 @@ try_write_serialized_rds <- function(value, filename, compress, ## delete the key on a failed write (otherwise there's a copy ## involved) write_lines <- function(text, filename, ..., - scratch_dir = NULL, use_scratch = TRUE) { + scratch_dir = NULL, use_scratch_keys = TRUE) { withCallingHandlers( try_write_lines(text, filename, ..., scratch_dir = scratch_dir, - use_scratch = use_scratch), + use_scratch_keys = use_scratch_keys), error = function(e) unlink(filename)) } @@ -122,11 +118,11 @@ write_lines <- function(text, filename, ..., ## This implements write-then-move for writeLines, which gives us ## atomic writes and rewrites. If 'scratch' is on the same filesystem ## as dirname(filename), then the os's rename is atomic -try_write_lines <- function(text, filename, ..., scratch_dir, use_scratch) { - tmp <- ifelse(use_scratch, tempfile(tmpdir = scratch_dir %||% tempdir()), filename) +try_write_lines <- function(text, filename, ..., scratch_dir, use_scratch_keys) { + tmp <- ifelse(use_scratch_keys, tempfile(tmpdir = scratch_dir %||% tempdir()), filename) writeLines(text, tmp, ...) ## Not 100% necessary and strictly makes this nonatomic - if (use_scratch) { + if (use_scratch_keys) { unlink(filename) file.rename(tmp, filename) } diff --git a/man/storr_rds.Rd b/man/storr_rds.Rd index aa355b4..7e4ca56 100644 --- a/man/storr_rds.Rd +++ b/man/storr_rds.Rd @@ -12,7 +12,7 @@ storr_rds( mangle_key_pad = NULL, hash_algorithm = NULL, default_namespace = "objects", - use_scratch = TRUE + use_scratch_keys = TRUE ) driver_rds( @@ -21,7 +21,7 @@ driver_rds( mangle_key = NULL, mangle_key_pad = NULL, hash_algorithm = NULL, - use_scratch = TRUE + use_scratch_keys = TRUE ) } \arguments{ @@ -52,10 +52,10 @@ values are "md5", "sha1", and others supported by \item{default_namespace}{Default namespace (see \code{\link{storr}}).} -\item{use_scratch}{Logical, whether to create data and key files +\item{use_scratch_keys}{Logical, whether to create key files in a scratch directory before moving them to their final destinations. -Set to \code{TRUE} to ensure atomic reads and writes, or set to -\code{FALSE} to increase speed on some file systems.} +Set to \code{TRUE} to ensure atomic reads and writes for keys, or set to +\code{FALSE} to increase speed on some platforms (namely Windows).} } \description{ Object cache driver that saves objects using R's native diff --git a/tests/testthat/test-driver-rds.R b/tests/testthat/test-driver-rds.R index e3a9af0..517f413 100644 --- a/tests/testthat/test-driver-rds.R +++ b/tests/testthat/test-driver-rds.R @@ -382,9 +382,9 @@ test_that("avoid race condition when writing in parallel", { expect_true(all(ok)) }) -test_that("use_scratch = FALSE (#116)", { - st <- storr_rds(tempfile(), use_scratch = FALSE) +test_that("use_scratch_keys = FALSE (#116)", { + st <- storr_rds(tempfile(), use_scratch_keys = FALSE) st$set("a", "a") expect_equal(st$get("a"), "a") - expect_false(st$driver$use_scratch) + expect_false(st$driver$use_scratch_keys) })