Conversation
|
Hi @wlandau, if I understand correctly, you will always be serializing the complete Because you do not need random access to the stored vector, the meta-data stored in the fst file is a bit of an overkill. You might also test a setup where you compress the
perhaps you can add this option to the benchmark and see what happens! best |
|
Thanks for the useful advice, @MarcusKlik! Looks like we might be able to use
library(fst)
library(magrittr)
library(microbenchmark)
library(pryr)
#> Registered S3 method overwritten by 'pryr':
#> method from
#> print.bytes Rcpp
library(storr)
data <- runif(2.5e8)
object_size(data)
#> 2 GB
serialize(data, NULL, ascii = FALSE) %>%
compress_fst() %>%
object_size()
#> 1.28 GB
s1 <- storr_fst(tempfile())
s2 <- storr_fst(tempfile(), compress = TRUE)
s3 <- storr_rds(tempfile())
f <- function(data) {
raw <- serialize(data, NULL, ascii = FALSE)
cmp <- compress_fst(raw)
saveRDS(cmp, tempfile(), compress = FALSE)
}
microbenchmark(
fst = s1$set("x", data),
fst_cmp = s2$set("x", data),
rds = s3$set("x", data),
f = f(data),
times = 1
)
#> Unit: seconds
#> expr min lq mean median uq max
#> fst 12.043528 12.043528 12.043528 12.043528 12.043528 12.043528
#> fst_cmp 12.056381 12.056381 12.056381 12.056381 12.056381 12.056381
#> rds 206.944071 206.944071 206.944071 206.944071 206.944071 206.944071
#> f 8.960977 8.960977 8.960977 8.960977 8.960977 8.960977
#> neval
#> 1
#> 1
#> 1
#> 1
Created on 2019-06-18 by the reprex package (v0.3.0) |
|
@MarcusKlik, I think #109 (comment) is the best answer to https://stackoverflow.com/questions/56614592/faster-way-to-slice-a-raw-vector. If you post, I will accept and grant you the reputation points. |
|
After fstpackage/fst@c8bfde9, 519c38d, and d06e6f7,
library(digest)
library(fst)
library(pryr)
#> Registered S3 method overwritten by 'pryr':
#> method from
#> print.bytes Rcpp
library(storr)
x <- runif(3e8)
object_size(x)
#> 2.4 GB
s <- storr_fst(tempfile())
s$set(key = "x", value = x, use_cache = FALSE)
y <- s$get("x", use_cache = FALSE)
digest(x) == digest(y)
#> [1] TRUE
Created on 2019-06-18 by the reprex package (v0.3.0) |
|
After looking at fstpackage/fst@c8bfde9, I merged #109 ( So do we go with #109 or #111? Maybe both? #109 is a bit faster for large data, but the implementation is more complicated, and it requires development
+2 GB data benchmarks:
library(digest)
library(fst)
library(microbenchmark)
library(pryr)
#> Registered S3 method overwritten by 'pryr':
#> method from
#> print.bytes Rcpp
library(storr)
x <- runif(3e8)
object_size(x)
#> 2.4 GB
fst_none <- storr_fst(tempfile(), compress = "none")
fst_cmpr <- storr_fst(tempfile(), compress = "fst")
rds_none <- storr_rds(tempfile(), compress = "none")
rds_cmpr <- storr_rds(tempfile(), compress = "fst")
rds_default <- storr_rds(tempfile(), compress = "gzfile")
microbenchmark::microbenchmark(
fst_none = fst_none$set(key = "x", value = x, use_cache = FALSE),
fst_cmpr = fst_cmpr$set(key = "x", value = x, use_cache = FALSE),
rds_none = rds_none$set(key = "x", value = x, use_cache = FALSE),
rds_cmpr = rds_cmpr$set(key = "x", value = x, use_cache = FALSE),
rds_default = rds_default$set(key = "x", value = x, use_cache = FALSE),
times = 1
)
#> Repacking large object
#> Repacking large object
#> Unit: seconds
#> expr min lq mean median uq
#> fst_none 9.943442 9.943442 9.943442 9.943442 9.943442
#> fst_cmpr 16.299916 16.299916 16.299916 16.299916 16.299916
#> rds_none 13.164031 13.164031 13.164031 13.164031 13.164031
#> rds_cmpr 16.212105 16.212105 16.212105 16.212105 16.212105
#> rds_default 263.741792 263.741792 263.741792 263.741792 263.741792
#> max neval
#> 9.943442 1
#> 16.299916 1
#> 13.164031 1
#> 16.212105 1
#> 263.741792 1
microbenchmark::microbenchmark(
fst_none = fst_none$get(key = "x", use_cache = FALSE),
fst_cmpr = fst_cmpr$get(key = "x", use_cache = FALSE),
rds_none = rds_none$get(key = "x", use_cache = FALSE),
rds_cmpr = rds_cmpr$get(key = "x", use_cache = FALSE),
rds_default = rds_default$get(key = "x", use_cache = FALSE),
times = 1
)
#> Unit: seconds
#> expr min lq mean median uq max
#> fst_none 3.997610 3.997610 3.997610 3.997610 3.997610 3.997610
#> fst_cmpr 9.683951 9.683951 9.683951 9.683951 9.683951 9.683951
#> rds_none 2.891874 2.891874 2.891874 2.891874 2.891874 2.891874
#> rds_cmpr 4.645483 4.645483 4.645483 4.645483 4.645483 4.645483
#> rds_default 12.585657 12.585657 12.585657 12.585657 12.585657 12.585657
#> neval
#> 1
#> 1
#> 1
#> 1
#> 1
Created on 2019-06-18 by the reprex package (v0.3.0)
Small data benchmarks:
library(digest)
library(fst)
library(microbenchmark)
library(pryr)
#> Registered S3 method overwritten by 'pryr':
#> method from
#> print.bytes Rcpp
library(storr)
fst_none <- storr_fst(tempfile(), compress = "none")
fst_cmpr <- storr_fst(tempfile(), compress = "fst")
rds_none <- storr_rds(tempfile(), compress = "none")
rds_cmpr <- storr_rds(tempfile(), compress = "fst")
rds_default <- storr_rds(tempfile(), compress = "gzfile")
microbenchmark::microbenchmark(
fst_none = fst_none$set(key = "x", value = runif(1), use_cache = FALSE),
fst_cmpr = fst_cmpr$set(key = "x", value = runif(1), use_cache = FALSE),
rds_none = rds_none$set(key = "x", value = runif(1), use_cache = FALSE),
rds_cmpr = rds_cmpr$set(key = "x", value = runif(1), use_cache = FALSE),
rds_default = rds_default$set(key = "x", value = runif(1), use_cache = FALSE)
)
#> Unit: microseconds
#> expr min lq mean median uq max neval
#> fst_none 264.506 273.3300 291.2796 287.7015 304.0750 488.368 100
#> fst_cmpr 284.696 292.3405 333.9626 312.5445 326.6185 1240.974 100
#> rds_none 231.126 240.5605 256.2129 252.6815 266.9450 438.347 100
#> rds_cmpr 249.724 257.7000 301.7618 267.2055 285.0685 2301.713 100
#> rds_default 239.317 253.3230 295.7192 300.4380 319.8265 815.004 100
microbenchmark::microbenchmark(
fst_none = fst_none$get(key = "x", use_cache = FALSE),
fst_cmpr = fst_cmpr$get(key = "x", use_cache = FALSE),
rds_none = rds_none$get(key = "x", use_cache = FALSE),
rds_cmpr = rds_cmpr$get(key = "x", use_cache = FALSE),
rds_default = rds_default$get(key = "x", use_cache = FALSE)
)
#> Unit: microseconds
#> expr min lq mean median uq max neval
#> fst_none 119.854 127.4390 484.00473 131.7765 144.9625 34477.090 100
#> fst_cmpr 125.987 131.3730 144.91864 135.6095 143.7470 237.292 100
#> rds_none 76.280 81.6835 115.04263 85.8315 93.7595 2406.733 100
#> rds_cmpr 91.058 95.7800 111.86227 99.9195 106.4455 513.389 100
#> rds_default 76.906 81.3260 90.50064 86.5245 93.3555 156.133 100
Created on 2019-06-18 by the reprex package (v0.3.0) |
This PR is a sketch of the `fst` driver I suggested in #108. It is completely based on the RDS driver. Instead of saving the serialized raw vector with `writeBin()`, we wrap it in a data frame and save it with `write_fst()`. Optional compression is powered by `compress_fst()`. If this works out, I plan to add functionality to set the serialization method based on the class/type of the object, re #77 (comment) (thinking of Keras models).

A quick glance at speed is encouraging (with 4 OpenMP threads on a mid-tier Kubuntu machine). cc @MarcusKlik.
Created on 2019-06-17 by the reprex package (v0.3.0)
Issues and questions
`R6_driver_fst` currently inherits from `R6_driver_rds`. Should it? A lot of code needs to be copied anyway because of hard-coded references to RDS, and I am not sure how I feel about the current one-foot-in-and-one-foot-out approach.