diff --git a/DESCRIPTION b/DESCRIPTION index 306def7..bd87e3b 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -25,6 +25,7 @@ Suggests: DBI (>= 0.6), RSQLite (>= 1.1-2), RPostgres, + fst, knitr, mockr, parallel, diff --git a/NEWS.md b/NEWS.md index b5b0ab6..694a49d 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,6 +1,7 @@ ## storr 1.2.2 (2018-??-??) * Speed up the `$get_hash()` method of RDS drivers using C code and traits (#96, #98, @wlandau). +* Add support for `LZ4` and `ZSTD` compression (via `fst`) for RDS drivers (#110, #111, @wlandau, @MarcusKlik). ## storr 1.2.1 (2018-10-18) diff --git a/R/driver_rds.R b/R/driver_rds.R index 5a30f59..e176598 100644 --- a/R/driver_rds.R +++ b/R/driver_rds.R @@ -55,7 +55,28 @@ ##' might be nice for persistent application data. ##' ##' @param compress Compress the generated file? This saves a small -##' amount of space for a reasonable amount of time. +##' amount of space for a reasonable amount of time. Possible values: +##' \describe{ +##' \item{"none"}{No compression.} +##' \item{"gzip"}{gzip compression with \code{base::gzfile()} (default).} +##' \item{"lz4"}{lz4 compression via \code{fst::compress_fst()}. Very fast, but lower quality than zstd.} +##' \item{"zstd"}{zstd compression via \code{fst::compress_fst()}. Higher quality but slower than lz4.} +##' } +##' \code{"lz4"} is generally much faster than the other compression methods. +##' \code{"lz4"} and \code{"zstd"} require the \code{fst} package, +##' and they both use more memory (RAM) than the other choices. +##' +##' To preserve compatibility with earlier \code{storr}s, +##' \code{compress} can also be logical: +##' \code{TRUE} for \code{"gzfile"} and \code{FALSE} for \code{"none"}. +##' However, these values are not interchangeable for existings \code{storr}s. +##' For example, if you create a \code{storr} with \code{compress = TRUE}, +##' you must continue to use \code{compress = TRUE} +##' and not \code{compress = "gzfile"} when you recover it later. +##' +##' @param compression Numeric compression factor for \code{fst::compress_fst()}. +##' Between 0 and 100: 0 for lowest compression, 100 for maximum compression. +##' Only applies to \code{compress = "lz4"} and \code{compress = "zstd"}. ##' ##' @param mangle_key Mangle keys? If TRUE, then the key is encoded ##' using base64 before saving to the filesystem. See Details. @@ -76,6 +97,7 @@ ##' ##' @param default_namespace Default namespace (see ##' \code{\link{storr}}). +##' ##' @export ##' @examples ##' @@ -108,18 +130,18 @@ ##' # Clean up the two storrs: ##' st$destroy() ##' st2$destroy() -storr_rds <- function(path, compress = NULL, mangle_key = NULL, +storr_rds <- function(path, compress = NULL, compression = NULL, mangle_key = NULL, mangle_key_pad = NULL, hash_algorithm = NULL, default_namespace = "objects") { - storr(driver_rds(path, compress, mangle_key, mangle_key_pad, hash_algorithm), + storr(driver_rds(path, compress, compression, mangle_key, mangle_key_pad, hash_algorithm), default_namespace) } ##' @export ##' @rdname storr_rds -driver_rds <- function(path, compress = NULL, mangle_key = NULL, +driver_rds <- function(path, compress = NULL, compression = NULL, mangle_key = NULL, mangle_key_pad = NULL, hash_algorithm = NULL) { - R6_driver_rds$new(path, compress, mangle_key, mangle_key_pad, hash_algorithm) + R6_driver_rds$new(path, compress, compression, mangle_key, mangle_key_pad, hash_algorithm) } R6_driver_rds <- R6::R6Class( @@ -130,13 +152,14 @@ R6_driver_rds <- R6::R6Class( path = NULL, path_scratch = NULL, compress = NULL, + compression = NULL, mangle_key = NULL, mangle_key_pad = NULL, hash_algorithm = NULL, hash_length = NULL, traits = list(accept = "raw", throw_missing = TRUE), - initialize = function(path, compress, mangle_key, mangle_key_pad, + initialize = function(path, compress, compression, mangle_key, mangle_key_pad, hash_algorithm) { is_new <- !file.exists(file.path(path, "config")) dir_create(path) @@ -154,9 +177,10 @@ R6_driver_rds <- R6::R6Class( ## future versions of driver_rds can use to patch, warn or ## change behaviour with older versions of the storr. if (!is_new && !file.exists(driver_rds_config_file(path, "version"))) { - write_if_missing("1.0.1", driver_rds_config_file(path, "version")) + write_if_missing("1.2.2", driver_rds_config_file(path, "version")) write_if_missing("TRUE", driver_rds_config_file(path, "mangle_key_pad")) - write_if_missing("TRUE", driver_rds_config_file(path, "compress")) + write_if_missing("gzip", driver_rds_config_file(path, "compress")) + write_if_missing("0", driver_rds_config_file(path, "compression")) write_if_missing("md5", driver_rds_config_file(path, "hash_algorithm")) } ## Then write out the version number: @@ -177,10 +201,15 @@ R6_driver_rds <- R6::R6Class( FALSE, TRUE) if (!is.null(compress)) { - assert_scalar_logical(compress) + compress <- as.character(compress) } + self$compress <- driver_rds_config(path, "compress", compress, - TRUE, FALSE) + "TRUE", TRUE) + self$compress <- parse_rds_compress(self$compress) + + self$compression <- driver_rds_config(path, "compression", compression, + 0, FALSE) if (!is.null(hash_algorithm)) { assert_scalar_character(hash_algorithm) @@ -211,7 +240,12 @@ R6_driver_rds <- R6::R6Class( }, get_object = function(hash) { - read_rds(self$name_hash(hash)) + out <- read_rds(self$name_hash(hash)) + if (self$compress %in% c("lz4", "zstd")) { + out <- fst::decompress_fst(out) + out <- unserialize(out) + } + out }, set_object = function(hash, value) { @@ -219,7 +253,7 @@ R6_driver_rds <- R6::R6Class( ## already and avoids seralising twice. assert_raw(value) write_serialized_rds(value, self$name_hash(hash), self$compress, - self$path_scratch) + self$compression, self$path_scratch) }, exists_hash = function(key, namespace) { @@ -452,3 +486,16 @@ See 'Corrupt keys' within ?storr_rds for how to proceed" -> fmt message(sprintf(fmt, length(files), namespace, path, files)) corrupt_notices[[path]] <- now } + +parse_rds_compress <- function(compress) { + if (identical(compress, "TRUE")) { + "gzip" + } else if (identical(compress, "FALSE")) { + "none" + } else if (compress %in% c("lz4", "zstd")) { + loadNamespace("fst") + compress + } else { + compress + } +} diff --git a/R/hash.R b/R/hash.R index 80fb64e..3435b31 100644 --- a/R/hash.R +++ b/R/hash.R @@ -72,10 +72,10 @@ serialize_to_raw <- function(x, ascii, xdr) { ## been successful, otherwise the container will claim existence for ## an object which cannot be retrieved later on, causing havoc ## upstream. -write_serialized_rds <- function(value, filename, compress, +write_serialized_rds <- function(value, filename, compress, compression, scratch_dir = NULL, long = 2^31 - 2) { withCallingHandlers( - try_write_serialized_rds(value, filename, compress, scratch_dir, long), + try_write_serialized_rds(value, filename, compress, compression, scratch_dir, long), error = function(e) unlink(filename)) } @@ -83,11 +83,24 @@ write_serialized_rds <- function(value, filename, compress, ## The split here helps keep the order really consistent; we will ## close the connection on exit from try_write_serialized_rds and ## delete the file *after* that. -try_write_serialized_rds <- function(value, filename, compress, +try_write_serialized_rds <- function(value, filename, compress, compression, scratch_dir = NULL, long = 2^31 - 2) { tmp <- tempfile(tmpdir = scratch_dir %||% tempdir()) + if (compress %in% c("lz4", "zstd")) { + write_tmp_fst(value, tmp, compress, compression) + } else { + write_tmp_default(value, tmp, compress, scratch_dir, long) + } + file.rename(tmp, filename) +} - con <- (if (compress) gzfile else file)(tmp, "wb") +write_tmp_fst <- function(value, filename, compress, compression) { + saveRDS(fst::compress_fst(value, toupper(compress), compression), + filename, compress = FALSE) +} + +write_tmp_default <- function(value, filename, compress, scratch_dir, long) { + con <- (if (identical(compress, "gzip")) gzfile else file)(filename, "wb") needs_close <- TRUE on.exit(if (needs_close) close(con), add = TRUE) len <- length(value) @@ -99,10 +112,8 @@ try_write_serialized_rds <- function(value, filename, compress, } close(con) needs_close <- FALSE - file.rename(tmp, filename) } - ## Same pattern for write_lines. The difference is that this will ## delete the key on a failed write (otherwise there's a copy ## involved) diff --git a/R/utils.R b/R/utils.R index 256f157..a9afaeb 100644 --- a/R/utils.R +++ b/R/utils.R @@ -116,7 +116,6 @@ assert_probably_storr_driver <- function(x, name = deparse(substitute(x))) { invisible(x) } - match_value <- function(x, choices, name = deparse(substitute(x))) { assert_scalar_character(x, name) i <- match(x, choices) diff --git a/man/storr.Rd b/man/storr.Rd index 4d92b76..116857a 100644 --- a/man/storr.Rd +++ b/man/storr.Rd @@ -11,7 +11,7 @@ storr(driver, default_namespace = "objects") \item{default_namespace}{Default namespace to store objects in. By default \code{"objects"} is used, but this might be useful to -have two diffent \code{storr} objects pointing at the same +have two different \code{storr} objects pointing at the same underlying storage, but storing things in different namespaces.} } \description{ diff --git a/man/storr_rds.Rd b/man/storr_rds.Rd index a5c4e23..a5751a6 100644 --- a/man/storr_rds.Rd +++ b/man/storr_rds.Rd @@ -5,12 +5,12 @@ \alias{driver_rds} \title{rds object cache driver} \usage{ -storr_rds(path, compress = NULL, mangle_key = NULL, - mangle_key_pad = NULL, hash_algorithm = NULL, +storr_rds(path, compress = NULL, compression = NULL, + mangle_key = NULL, mangle_key_pad = NULL, hash_algorithm = NULL, default_namespace = "objects") -driver_rds(path, compress = NULL, mangle_key = NULL, - mangle_key_pad = NULL, hash_algorithm = NULL) +driver_rds(path, compress = NULL, compression = NULL, + mangle_key = NULL, mangle_key_pad = NULL, hash_algorithm = NULL) } \arguments{ \item{path}{Path for the store. \code{tempdir()} is a good choice @@ -18,7 +18,28 @@ for ephemeral storage, The \code{rappdirs} package (on CRAN) might be nice for persistent application data.} \item{compress}{Compress the generated file? This saves a small -amount of space for a reasonable amount of time.} + amount of space for a reasonable amount of time. Possible values: + \describe{ + \item{"none"}{No compression.} + \item{"gzip"}{gzip compression with \code{base::gzfile()} (default).} + \item{"lz4"}{lz4 compression via \code{fst::compress_fst()}. Very fast, but lower quality than zstd.} + \item{"zstd"}{zstd compression via \code{fst::compress_fst()}. Higher quality but slower than lz4.} + } + \code{"lz4"} is generally much faster than the other compression methods. + \code{"lz4"} and \code{"zstd"} require the \code{fst} package, + and they both use more memory (RAM) than the other choices. + + To preserve compatibility with earlier \code{storr}s, + \code{compress} can also be logical: + \code{TRUE} for \code{"gzfile"} and \code{FALSE} for \code{"none"}. + However, these values are not interchangeable for existings \code{storr}s. + For example, if you create a \code{storr} with \code{compress = TRUE}, + you must continue to use \code{compress = TRUE} + and not \code{compress = "gzfile"} when you recover it later.} + +\item{compression}{Numeric compression factor for \code{fst::compress_fst()}. +Between 0 and 100: 0 for lowest compression, 100 for maximum compression. +Only applies to \code{compress = "lz4"} and \code{compress = "zstd"}.} \item{mangle_key}{Mangle keys? If TRUE, then the key is encoded using base64 before saving to the filesystem. See Details.} diff --git a/tests/testthat/test-driver-rds.R b/tests/testthat/test-driver-rds.R index eaf6063..268aa44 100644 --- a/tests/testthat/test-driver-rds.R +++ b/tests/testthat/test-driver-rds.R @@ -143,25 +143,40 @@ test_that("large vector support", { }) test_that("compression support", { + skip_if_not_installed("fst") + ## some data that will likely compress very well: data <- rep(1:10, each = 500) - st1 <- storr_rds(tempfile(), TRUE) - st2 <- storr_rds(tempfile(), FALSE) + st1 <- storr_rds(tempfile(), "gzip") + st2 <- storr_rds(tempfile(), "none") + st3 <- storr_rds(tempfile(), "lz4") + st4 <- storr_rds(tempfile(), "zstd") + on.exit({ st1$destroy() st2$destroy() + st3$destroy() + st4$destroy() }) - h1 <- st1$set("data", data) - h2 <- st2$set("data", data) + h1 <- st1$set("data", data, use_cache = FALSE) + h2 <- st2$set("data", data, use_cache = FALSE) + h3 <- st3$set("data", data, use_cache = FALSE) + h4 <- st4$set("data", data, use_cache = FALSE) - expect_identical(h1, h2) + expect_identical(h1, h2, h3, h4) expect_gt(file.size(st2$driver$name_hash(h2)), file.size(st1$driver$name_hash(h1))) - - expect_identical(st1$get("data"), data) - expect_identical(st2$get("data"), data) + expect_gt(file.size(st2$driver$name_hash(h2)), + file.size(st3$driver$name_hash(h3))) + expect_gt(file.size(st2$driver$name_hash(h2)), + file.size(st3$driver$name_hash(h4))) + + expect_identical(st1$get("data", use_cache = FALSE), data) + expect_identical(st2$get("data", use_cache = FALSE), data) + expect_identical(st3$get("data", use_cache = FALSE), data) + expect_identical(st4$get("data", use_cache = FALSE), data) }) test_that("backward compatibility", { @@ -181,6 +196,31 @@ test_that("backward compatibility", { class = "ConfigError") }) +test_that("backward compatibility: compression", { + ## In version 1.2.1 and earlier, the compress argument was a logical scalar. + path <- copy_to_tmp("v1.2.1_compress_TRUE") + st <- storr_rds(path) + expect_equal(st$list(), "key") + expect_equal(st$get("key"), "value") + expect_silent(st <- storr_rds(path, compress = TRUE)) + for (compress in list("none", "gzip", "lz4", "zstd", FALSE)) { + expect_error(st <- storr_rds(path, compress = compress), + "Incompatible value for compress", + class = "ConfigError") + } + + path <- copy_to_tmp("v1.2.1_compress_FALSE") + st <- storr_rds(path) + expect_equal(st$list(), "key") + expect_equal(st$get("key"), "value") + expect_silent(st <- storr_rds(path, compress = FALSE)) + for (compress in list("none", "gzip", "lz4", "zstd", TRUE)) { + expect_error(st <- storr_rds(path, compress = compress), + "Incompatible value for compress", + class = "ConfigError") + } +}) + test_that("mangledness padding backward compatibility", { ## In version 1.0.1 and earlier, mangling was always padded path <- copy_to_tmp("v1.0.1_mangled") diff --git a/tests/testthat/v1.2.1_compress_FALSE/config/compress b/tests/testthat/v1.2.1_compress_FALSE/config/compress new file mode 100644 index 0000000..f6d449c --- /dev/null +++ b/tests/testthat/v1.2.1_compress_FALSE/config/compress @@ -0,0 +1 @@ +FALSE diff --git a/tests/testthat/v1.2.1_compress_FALSE/config/hash_algorithm b/tests/testthat/v1.2.1_compress_FALSE/config/hash_algorithm new file mode 100644 index 0000000..9d39e6b --- /dev/null +++ b/tests/testthat/v1.2.1_compress_FALSE/config/hash_algorithm @@ -0,0 +1 @@ +md5 diff --git a/tests/testthat/v1.2.1_compress_FALSE/config/mangle_key b/tests/testthat/v1.2.1_compress_FALSE/config/mangle_key new file mode 100644 index 0000000..f6d449c --- /dev/null +++ b/tests/testthat/v1.2.1_compress_FALSE/config/mangle_key @@ -0,0 +1 @@ +FALSE diff --git a/tests/testthat/v1.2.1_compress_FALSE/config/mangle_key_pad b/tests/testthat/v1.2.1_compress_FALSE/config/mangle_key_pad new file mode 100644 index 0000000..f6d449c --- /dev/null +++ b/tests/testthat/v1.2.1_compress_FALSE/config/mangle_key_pad @@ -0,0 +1 @@ +FALSE diff --git a/tests/testthat/v1.2.1_compress_FALSE/config/version b/tests/testthat/v1.2.1_compress_FALSE/config/version new file mode 100644 index 0000000..6085e94 --- /dev/null +++ b/tests/testthat/v1.2.1_compress_FALSE/config/version @@ -0,0 +1 @@ +1.2.1 diff --git a/tests/testthat/v1.2.1_compress_FALSE/data/6abd4188389195dfdfa9a37665e4cca6.rds b/tests/testthat/v1.2.1_compress_FALSE/data/6abd4188389195dfdfa9a37665e4cca6.rds new file mode 100644 index 0000000..8f47868 Binary files /dev/null and b/tests/testthat/v1.2.1_compress_FALSE/data/6abd4188389195dfdfa9a37665e4cca6.rds differ diff --git a/tests/testthat/v1.2.1_compress_FALSE/keys/objects/key b/tests/testthat/v1.2.1_compress_FALSE/keys/objects/key new file mode 100644 index 0000000..9b41691 --- /dev/null +++ b/tests/testthat/v1.2.1_compress_FALSE/keys/objects/key @@ -0,0 +1 @@ +6abd4188389195dfdfa9a37665e4cca6 diff --git a/tests/testthat/v1.2.1_compress_TRUE/config/compress b/tests/testthat/v1.2.1_compress_TRUE/config/compress new file mode 100644 index 0000000..ef2f513 --- /dev/null +++ b/tests/testthat/v1.2.1_compress_TRUE/config/compress @@ -0,0 +1 @@ +TRUE diff --git a/tests/testthat/v1.2.1_compress_TRUE/config/hash_algorithm b/tests/testthat/v1.2.1_compress_TRUE/config/hash_algorithm new file mode 100644 index 0000000..9d39e6b --- /dev/null +++ b/tests/testthat/v1.2.1_compress_TRUE/config/hash_algorithm @@ -0,0 +1 @@ +md5 diff --git a/tests/testthat/v1.2.1_compress_TRUE/config/mangle_key b/tests/testthat/v1.2.1_compress_TRUE/config/mangle_key new file mode 100644 index 0000000..f6d449c --- /dev/null +++ b/tests/testthat/v1.2.1_compress_TRUE/config/mangle_key @@ -0,0 +1 @@ +FALSE diff --git a/tests/testthat/v1.2.1_compress_TRUE/config/mangle_key_pad b/tests/testthat/v1.2.1_compress_TRUE/config/mangle_key_pad new file mode 100644 index 0000000..f6d449c --- /dev/null +++ b/tests/testthat/v1.2.1_compress_TRUE/config/mangle_key_pad @@ -0,0 +1 @@ +FALSE diff --git a/tests/testthat/v1.2.1_compress_TRUE/config/version b/tests/testthat/v1.2.1_compress_TRUE/config/version new file mode 100644 index 0000000..6085e94 --- /dev/null +++ b/tests/testthat/v1.2.1_compress_TRUE/config/version @@ -0,0 +1 @@ +1.2.1 diff --git a/tests/testthat/v1.2.1_compress_TRUE/data/6abd4188389195dfdfa9a37665e4cca6.rds b/tests/testthat/v1.2.1_compress_TRUE/data/6abd4188389195dfdfa9a37665e4cca6.rds new file mode 100644 index 0000000..9e05a97 Binary files /dev/null and b/tests/testthat/v1.2.1_compress_TRUE/data/6abd4188389195dfdfa9a37665e4cca6.rds differ diff --git a/tests/testthat/v1.2.1_compress_TRUE/keys/objects/key b/tests/testthat/v1.2.1_compress_TRUE/keys/objects/key new file mode 100644 index 0000000..9b41691 --- /dev/null +++ b/tests/testthat/v1.2.1_compress_TRUE/keys/objects/key @@ -0,0 +1 @@ +6abd4188389195dfdfa9a37665e4cca6