From 8aaf3c9f96773a43f4ed65fca70952dc2a725d38 Mon Sep 17 00:00:00 2001 From: Matt Jordan Date: Thu, 7 Mar 2024 14:34:16 -0800 Subject: [PATCH 1/5] Added i) directory support, ii) FP rate args, iii) No-Save option --- Cargo.lock | 499 ++++++++++++++++++++++++++++++++++------------------ Cargo.toml | 7 +- src/main.rs | 129 ++++++++++++-- 3 files changed, 450 insertions(+), 185 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 0e67c77..51ac852 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -10,53 +10,101 @@ checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe" [[package]] name = "ahash" -version = "0.8.3" +version = "0.8.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2c99f64d1e06488f620f932677e24bc6e2897582980441ae90a671415bd7ec2f" +checksum = "e89da841a80418a9b391ebaea17f5c112ffaaa96f621d2c285b5174da76b9011" dependencies = [ "cfg-if", "getrandom", "once_cell", "version_check", + "zerocopy", ] +[[package]] +name = "anstream" +version = "0.6.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d96bd03f33fe50a863e394ee9718a706f988b9079b20c3784fb726e7678b62fb" +dependencies = [ + "anstyle", + "anstyle-parse", + "anstyle-query", + "anstyle-wincon", + "colorchoice", + "utf8parse", +] + +[[package]] +name = "anstyle" +version = "1.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8901269c6307e8d93993578286ac0edf7f195079ffff5ebdeea6a59ffb7e36bc" + +[[package]] +name = "anstyle-parse" +version = "0.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c75ac65da39e5fe5ab759307499ddad880d724eed2f6ce5b5e8a26f4f387928c" +dependencies = [ + "utf8parse", +] + +[[package]] +name = "anstyle-query" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e28923312444cdd728e4738b3f9c9cac739500909bb3d3c94b43551b16517648" +dependencies = [ + "windows-sys", +] + +[[package]] +name = 
"anstyle-wincon" +version = "3.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1cd54b81ec8d6180e24654d0b371ad22fc3dd083b6ff8ba325b72e00c87660a7" +dependencies = [ + "anstyle", + "windows-sys", +] + +[[package]] +name = "anyhow" +version = "1.0.80" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5ad32ce52e4161730f7098c077cd2ed6229b5804ccf99e5366be1ab72a98b4e1" + [[package]] name = "bff" version = "0.1.0" dependencies = [ "ahash", + "anyhow", "byteorder", "clap", "flate2", + "glob", + "human_bytes", + "indicatif", "rand", "serde_json", + "sysinfo", "threadpool", "unicode-segmentation", ] -[[package]] -name = "bitflags" -version = "1.3.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" - -[[package]] -name = "bitflags" -version = "2.0.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "487f1e0fcbe47deb8b0574e646def1c903389d95241dd1bbcc6ce4a715dfc0c1" - [[package]] name = "byteorder" -version = "1.4.3" +version = "1.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "14c189c53d098945499cdfa7ecc63567cf3886b3332b312a5b4585d8d3a6a610" +checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b" [[package]] name = "cc" -version = "1.0.79" +version = "1.0.90" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "50d30906286121d95be3d479533b458f87493b30a4b5f79a607db8f5d11aa91f" +checksum = "8cd6604a82acf3039f1144f54b8eb34e91ffba622051189e71b781822d5ee1f5" [[package]] name = "cfg-if" @@ -66,27 +114,33 @@ checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" [[package]] name = "clap" -version = "4.1.11" +version = "4.5.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "42dfd32784433290c51d92c438bb72ea5063797fc3cc9a21a8c4346bebbb2098" +checksum = 
"b230ab84b0ffdf890d5a10abdbc8b83ae1c4918275daea1ab8801f71536b2651" dependencies = [ - "bitflags 2.0.2", + "clap_builder", "clap_derive", +] + +[[package]] +name = "clap_builder" +version = "4.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ae129e2e766ae0ec03484e609954119f123cc1fe650337e155d03b022f24f7b4" +dependencies = [ + "anstream", + "anstyle", "clap_lex", - "is-terminal", - "once_cell", "strsim", - "termcolor", ] [[package]] name = "clap_derive" -version = "4.1.9" +version = "4.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fddf67631444a3a3e3e5ac51c36a5e01335302de677bd78759eaa90ab1f46644" +checksum = "307bc0538d5f0f83b8248db3087aa92fe504e4691294d0c96c0eabc33f47ba47" dependencies = [ "heck", - "proc-macro-error", "proc-macro2", "quote", "syn", @@ -94,57 +148,95 @@ dependencies = [ [[package]] name = "clap_lex" -version = "0.3.3" +version = "0.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "033f6b7a4acb1f358c742aaca805c939ee73b4c6209ae4318ec7aca81c42e646" -dependencies = [ - "os_str_bytes", -] +checksum = "98cc8fbded0c607b7ba9dd60cd98df59af97e84d24e49c8557331cfc26d301ce" [[package]] name = "cmake" -version = "0.1.49" +version = "0.1.50" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "db34956e100b30725f2eb215f90d4871051239535632f84fea3bc92722c66b7c" +checksum = "a31c789563b815f77f4250caee12365734369f942439b7defd71e18a48197130" dependencies = [ "cc", ] +[[package]] +name = "colorchoice" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "acbf1af155f9b9ef647e42cdc158db4b64a1b61f743629225fde6f3e0be2a7c7" + +[[package]] +name = "console" +version = "0.15.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0e1f83fc076bd6dd27517eacdf25fef6c4dfe5f1d7448bafaaf3a26f13b5e4eb" +dependencies = [ + "encode_unicode", + "lazy_static", + "libc", + "unicode-width", + 
"windows-sys", +] + +[[package]] +name = "core-foundation-sys" +version = "0.8.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "06ea2b9bc92be3c2baa9334a323ebca2d6f074ff852cd1d7b11064035cd3868f" + [[package]] name = "crc32fast" -version = "1.3.2" +version = "1.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b540bd8bc810d3885c6ea91e2018302f68baba2129ab3e88f32389ee9370880d" +checksum = "b3855a8a784b474f333699ef2bbca9db2c4a1f6d9088a90a2d25b1eb53111eaa" dependencies = [ "cfg-if", ] [[package]] -name = "errno" -version = "0.2.8" +name = "crossbeam-deque" +version = "0.8.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f639046355ee4f37944e44f60642c6f3a7efa3cf6b78c78a0d989a8ce6c396a1" +checksum = "613f8cc01fe9cf1a3eb3d7f488fd2fa8388403e97039e2f73692932e291a770d" dependencies = [ - "errno-dragonfly", - "libc", - "winapi", + "crossbeam-epoch", + "crossbeam-utils", ] [[package]] -name = "errno-dragonfly" -version = "0.1.2" +name = "crossbeam-epoch" +version = "0.9.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "aa68f1b12764fab894d2755d2518754e71b4fd80ecfb822714a1206c2aab39bf" +checksum = "5b82ac4a3c2ca9c3460964f020e1402edd5753411d7737aa39c3714ad1b5420e" dependencies = [ - "cc", - "libc", + "crossbeam-utils", ] +[[package]] +name = "crossbeam-utils" +version = "0.8.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "248e3bacc7dc6baa3b21e405ee045c3047101a49145e7e9eca583ab4c2ca5345" + +[[package]] +name = "either" +version = "1.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "11157ac094ffbdde99aa67b23417ebdd801842852b500e395a45a9c0aac03e4a" + +[[package]] +name = "encode_unicode" +version = "0.3.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a357d28ed41a50f9c765dbfe56cbc04a64e53e5fc58ba79fbc34c10ef3df831f" + [[package]] name = "flate2" -version = 
"1.0.25" +version = "1.0.28" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a8a2db397cb1c8772f31494cb8917e48cd1e64f0fa7efac59fbd741a0a8ce841" +checksum = "46303f565772937ffe1d394a4fac6f411c6013172fadde9dcdb1e147a086940e" dependencies = [ "crc32fast", "libz-ng-sys", @@ -153,15 +245,21 @@ dependencies = [ [[package]] name = "getrandom" -version = "0.2.8" +version = "0.2.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c05aeb6a22b8f62540c194aac980f2115af067bfe15a0734d7277a768d396b31" +checksum = "190092ea657667030ac6a35e305e62fc4dd69fd98ac98631e5d3a2b1575a12b5" dependencies = [ "cfg-if", "libc", "wasi", ] +[[package]] +name = "glob" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d2fabcfbdc87f4758337ca535fb41a6d701b65693ce38287d856d1674551ec9b" + [[package]] name = "heck" version = "0.4.1" @@ -170,100 +268,111 @@ checksum = "95505c38b4572b2d910cecb0281560f54b440a19336cbbcb27bf6ce6adc6f5a8" [[package]] name = "hermit-abi" -version = "0.2.6" +version = "0.3.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ee512640fe35acbfb4bb779db6f0d80704c2cacfa2e39b601ef3e3f47d1ae4c7" -dependencies = [ - "libc", -] +checksum = "d231dfb89cfffdbc30e7fc41579ed6066ad03abda9e567ccafae602b97ec5024" [[package]] -name = "hermit-abi" -version = "0.3.1" +name = "human_bytes" +version = "0.4.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fed44880c466736ef9a5c5b5facefb5ed0785676d0c02d612db14e54f0d84286" +checksum = "91f255a4535024abf7640cb288260811fc14794f62b063652ed349f9a6c2348e" [[package]] -name = "io-lifetimes" -version = "1.0.9" +name = "indicatif" +version = "0.17.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "09270fd4fa1111bc614ed2246c7ef56239a3063d5be0d1ec3b589c505d400aeb" +checksum = "763a5a8f45087d6bcea4222e7b72c291a054edf80e4ef6efd2a4979878c7bea3" dependencies = [ - 
"hermit-abi 0.3.1", - "libc", - "windows-sys", + "console", + "instant", + "number_prefix", + "portable-atomic", + "unicode-width", ] [[package]] -name = "is-terminal" -version = "0.4.5" +name = "instant" +version = "0.1.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8687c819457e979cc940d09cb16e42a1bf70aa6b60a549de6d3a62a0ee90c69e" +checksum = "7a5bbe824c507c5da5956355e86a746d82e0e1464f65d862cc5e71da70e94b2c" dependencies = [ - "hermit-abi 0.3.1", - "io-lifetimes", - "rustix", - "windows-sys", + "cfg-if", ] [[package]] name = "itoa" -version = "1.0.6" +version = "1.0.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b1a46d1a171d865aa5f83f92695765caa047a9b4cbae2cbf37dbd613a793fd4c" + +[[package]] +name = "lazy_static" +version = "1.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "453ad9f582a441959e5f0d088b02ce04cfe8d51a8eaf077f12ac6d3e94164ca6" +checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646" [[package]] name = "libc" -version = "0.2.140" +version = "0.2.153" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "99227334921fae1a979cf0bfdfcc6b3e5ce376ef57e16fb6fb3ea2ed6095f80c" +checksum = "9c198f91728a82281a64e1f4f9eeb25d82cb32a5de251c6bd1b5154d63a8e7bd" [[package]] name = "libz-ng-sys" -version = "1.1.8" +version = "1.1.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4399ae96a9966bf581e726de86969f803a81b7ce795fcd5480e640589457e0f2" +checksum = "c6409efc61b12687963e602df8ecf70e8ddacf95bc6576bcf16e3ac6328083c5" dependencies = [ "cmake", "libc", ] [[package]] -name = "linux-raw-sys" -version = "0.1.4" +name = "miniz_oxide" +version = "0.7.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f051f77a7c8e6957c0696eac88f26b0117e54f52d3fc682ab19397a8812846a4" +checksum = "9d811f3e15f28568be3407c8e7fdb6514c1cda3cb30683f15b6a1a1dc4ea14a7" +dependencies = [ + 
"adler", +] [[package]] -name = "miniz_oxide" -version = "0.6.2" +name = "ntapi" +version = "0.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b275950c28b37e794e8c55d88aeb5e139d0ce23fdbbeda68f8d7174abdf9e8fa" +checksum = "e8a3895c6391c39d7fe7ebc444a87eb2991b2a0bc718fdabd071eec617fc68e4" dependencies = [ - "adler", + "winapi", ] [[package]] name = "num_cpus" -version = "1.15.0" +version = "1.16.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0fac9e2da13b5eb447a6ce3d392f23a29d8694bff781bf03a16cd9ac8697593b" +checksum = "4161fcb6d602d4d2081af7c3a45852d875a03dd337a6bfdd6e06407b61342a43" dependencies = [ - "hermit-abi 0.2.6", + "hermit-abi", "libc", ] +[[package]] +name = "number_prefix" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "830b246a0e5f20af87141b25c173cd1b609bd7779a4617d6ec582abaf90870f3" + [[package]] name = "once_cell" -version = "1.17.1" +version = "1.19.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b7e5500299e16ebb147ae15a00a942af264cf3688f47923b8fc2cd5858f23ad3" +checksum = "3fdb12b2476b595f9358c5161aa467c2438859caa136dec86c26fdd2efe17b92" [[package]] -name = "os_str_bytes" -version = "6.5.0" +name = "portable-atomic" +version = "1.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ceedf44fb00f2d1984b0bc98102627ce622e083e49a5bacdb3e514fa4238e267" +checksum = "7170ef9988bc169ba16dd36a7fa041e5c4cbeb6a35b76d4c03daded371eae7c0" [[package]] name = "ppv-lite86" @@ -271,44 +380,20 @@ version = "0.2.17" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5b40af805b3121feab8a3c29f04d8ad262fa8e0561883e7653e024ae4479e6de" -[[package]] -name = "proc-macro-error" -version = "1.0.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "da25490ff9892aab3fcf7c36f08cfb902dd3e71ca0f9f9517bea02a73a5ce38c" -dependencies = [ - 
"proc-macro-error-attr", - "proc-macro2", - "quote", - "syn", - "version_check", -] - -[[package]] -name = "proc-macro-error-attr" -version = "1.0.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a1be40180e52ecc98ad80b184934baf3d0d29f979574e439af5a55274b35f869" -dependencies = [ - "proc-macro2", - "quote", - "version_check", -] - [[package]] name = "proc-macro2" -version = "1.0.53" +version = "1.0.78" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ba466839c78239c09faf015484e5cc04860f88242cff4d03eb038f04b4699b73" +checksum = "e2422ad645d89c99f8f3e6b88a9fdeca7fabeac836b1002371c4367c8f984aae" dependencies = [ "unicode-ident", ] [[package]] name = "quote" -version = "1.0.26" +version = "1.0.35" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4424af4bf778aae2051a77b60283332f386554255d722233d09fbfc7e30da2fc" +checksum = "291ec9ab5efd934aaf503a6466c5d5251535d108ee747472c3977cc5acc868ef" dependencies = [ "proc-macro2", ] @@ -344,36 +429,56 @@ dependencies = [ ] [[package]] -name = "rustix" -version = "0.36.11" +name = "rayon" +version = "1.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "db4165c9963ab29e422d6c26fbc1d37f15bace6b2810221f9d925023480fcf0e" +checksum = "e4963ed1bc86e4f3ee217022bd855b297cef07fb9eac5dfa1f788b220b49b3bd" dependencies = [ - "bitflags 1.3.2", - "errno", - "io-lifetimes", - "libc", - "linux-raw-sys", - "windows-sys", + "either", + "rayon-core", +] + +[[package]] +name = "rayon-core" +version = "1.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1465873a3dfdaa8ae7cb14b4383657caab0b3e8a0aa9ae8e04b044854c8dfce2" +dependencies = [ + "crossbeam-deque", + "crossbeam-utils", ] [[package]] name = "ryu" -version = "1.0.13" +version = "1.0.17" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f91339c0467de62360649f8d3e185ca8de4224ff281f66000de5eb2a77a79041" +checksum = 
"e86697c916019a8588c99b5fac3cead74ec0b4b819707a682fd4d23fa0ce1ba1" [[package]] name = "serde" -version = "1.0.158" +version = "1.0.197" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3fb1c873e1b9b056a4dc4c0c198b24c3ffa059243875552b2bd0933b1aee4ce2" +dependencies = [ + "serde_derive", +] + +[[package]] +name = "serde_derive" +version = "1.0.197" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "771d4d9c4163ee138805e12c710dd365e4f44be8be0503cb1bb9eb989425d9c9" +checksum = "7eb0b34b42edc17f6b7cac84a52a1c5f0e1bb2227e997ca9011ea3dd34e8610b" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] [[package]] name = "serde_json" -version = "1.0.94" +version = "1.0.114" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1c533a59c9d8a93a09c6ab31f0fd5e5f4dd1b8fc9434804029839884765d04ea" +checksum = "c5f09b1bd632ef549eaa9f60a1f8de742bdbc698e6cee2095fc84dde5f549ae0" dependencies = [ "itoa", "ryu", @@ -382,15 +487,15 @@ dependencies = [ [[package]] name = "strsim" -version = "0.10.0" +version = "0.11.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "73473c0e59e6d5812c5dfe2a064a6444949f089e20eec9a2e5506596494e4623" +checksum = "5ee073c9e4cd00e28217186dbe12796d692868f432bf2e97ee73bed0c56dfa01" [[package]] name = "syn" -version = "1.0.109" +version = "2.0.52" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "72b64191b275b66ffe2469e8af2c1cfe3bafa67b529ead792a6d0160888b4237" +checksum = "b699d15b36d1f02c3e7c69f8ffef53de37aefae075d8488d4ba1a7788d574a07" dependencies = [ "proc-macro2", "quote", @@ -398,12 +503,18 @@ dependencies = [ ] [[package]] -name = "termcolor" -version = "1.2.0" +name = "sysinfo" +version = "0.30.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "be55cf8942feac5c765c2c993422806843c9a9a45d4d5c407ad6dd2ea95eb9b6" +checksum = 
"0c385888ef380a852a16209afc8cfad22795dd8873d69c9a14d2e2088f118d18" dependencies = [ - "winapi-util", + "cfg-if", + "core-foundation-sys", + "libc", + "ntapi", + "once_cell", + "rayon", + "windows", ] [[package]] @@ -417,15 +528,27 @@ dependencies = [ [[package]] name = "unicode-ident" -version = "1.0.8" +version = "1.0.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e5464a87b239f13a63a501f2701565754bae92d243d4bb7eb12f6d57d2269bf4" +checksum = "3354b9ac3fae1ff6755cb6db53683adb661634f67557942dea4facebec0fee4b" [[package]] name = "unicode-segmentation" -version = "1.10.1" +version = "1.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d4c87d22b6e3f4a18d4d40ef354e97c90fcb14dd91d7dc0aa9d8a1172ebf7202" + +[[package]] +name = "unicode-width" +version = "0.1.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e51733f11c9c4f72aa0c160008246859e340b00807569a0da0e7a1079b27ba85" + +[[package]] +name = "utf8parse" +version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1dd624098567895118886609431a7c3b8f516e41d30e0643f03d94592a147e36" +checksum = "711b9620af191e0cdc7468a8d14e709c3dcdb115b36f838e601583af800a370a" [[package]] name = "version_check" @@ -456,34 +579,44 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" [[package]] -name = "winapi-util" -version = "0.1.5" +name = "winapi-x86_64-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" + +[[package]] +name = "windows" +version = "0.52.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "70ec6ce85bb158151cae5e5c87f95a8e97d2c0c4b001223f33a334e3ce5de178" +checksum = "e48a53791691ab099e5e2ad123536d0fff50652600abaf43bbf952894110d0be" 
dependencies = [ - "winapi", + "windows-core", + "windows-targets", ] [[package]] -name = "winapi-x86_64-pc-windows-gnu" -version = "0.4.0" +name = "windows-core" +version = "0.52.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" +checksum = "33ab640c8d7e35bf8ba19b884ba838ceb4fba93a4e8c65a9059d08afcfc683d9" +dependencies = [ + "windows-targets", +] [[package]] name = "windows-sys" -version = "0.45.0" +version = "0.52.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "75283be5efb2831d37ea142365f009c02ec203cd29a3ebecbc093d52315b66d0" +checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d" dependencies = [ "windows-targets", ] [[package]] name = "windows-targets" -version = "0.42.2" +version = "0.52.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8e5180c00cd44c9b1c88adb3693291f1cd93605ded80c250a75d472756b4d071" +checksum = "7dd37b7e5ab9018759f893a1952c9420d060016fc19a472b4bb20d1bdd694d1b" dependencies = [ "windows_aarch64_gnullvm", "windows_aarch64_msvc", @@ -496,42 +629,62 @@ dependencies = [ [[package]] name = "windows_aarch64_gnullvm" -version = "0.42.2" +version = "0.52.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "597a5118570b68bc08d8d59125332c54f1ba9d9adeedeef5b99b02ba2b0698f8" +checksum = "bcf46cf4c365c6f2d1cc93ce535f2c8b244591df96ceee75d8e83deb70a9cac9" [[package]] name = "windows_aarch64_msvc" -version = "0.42.2" +version = "0.52.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e08e8864a60f06ef0d0ff4ba04124db8b0fb3be5776a5cd47641e942e58c4d43" +checksum = "da9f259dd3bcf6990b55bffd094c4f7235817ba4ceebde8e6d11cd0c5633b675" [[package]] name = "windows_i686_gnu" -version = "0.42.2" +version = "0.52.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"c61d927d8da41da96a81f029489353e68739737d3beca43145c8afec9a31a84f" +checksum = "b474d8268f99e0995f25b9f095bc7434632601028cf86590aea5c8a5cb7801d3" [[package]] name = "windows_i686_msvc" -version = "0.42.2" +version = "0.52.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "44d840b6ec649f480a41c8d80f9c65108b92d89345dd94027bfe06ac444d1060" +checksum = "1515e9a29e5bed743cb4415a9ecf5dfca648ce85ee42e15873c3cd8610ff8e02" [[package]] name = "windows_x86_64_gnu" -version = "0.42.2" +version = "0.52.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8de912b8b8feb55c064867cf047dda097f92d51efad5b491dfb98f6bbb70cb36" +checksum = "5eee091590e89cc02ad514ffe3ead9eb6b660aedca2183455434b93546371a03" [[package]] name = "windows_x86_64_gnullvm" -version = "0.42.2" +version = "0.52.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "26d41b46a36d453748aedef1486d5c7a85db22e56aff34643984ea85514e94a3" +checksum = "77ca79f2451b49fa9e2af39f0747fe999fcda4f5e241b2898624dca97a1f2177" [[package]] name = "windows_x86_64_msvc" -version = "0.42.2" +version = "0.52.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32b752e52a2da0ddfbdbcc6fceadfeede4c939ed16d13e648833a61dfb611ed8" + +[[package]] +name = "zerocopy" +version = "0.7.32" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9aec5da331524158c6d1a4ac0ab1541149c0b9505fde06423b02f5ef0106b9f0" +checksum = "74d4d3961e53fa4c9a25a8637fc2bfaf2595b3d3ae34875568a5cf64787716be" +dependencies = [ + "zerocopy-derive", +] + +[[package]] +name = "zerocopy-derive" +version = "0.7.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9ce1b18ccd8e73a9321186f97e46f9f04b778851177567b1975109d26a08d2a6" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] diff --git a/Cargo.toml b/Cargo.toml index ad43e7d..8a475cb 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -6,6 +6,7 @@ edition = 
"2021" # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html [dependencies] +sysinfo="0.30.7" clap = { version = "4.1.11", features = ["derive"] } flate2 = { version = "1.0", features = ["zlib-ng"], default-features = false } serde_json = "1.0" @@ -13,4 +14,8 @@ unicode-segmentation = "1.7" rand = "0.8.4" ahash = { version = "0.8.1", features = ["runtime-rng"] } byteorder = "1" -threadpool = "1.8.1" \ No newline at end of file +threadpool = "1.8.1" +human_bytes = "0.4.3" +indicatif = "0.17.8" +glob = "0.3.1" +anyhow = "1.0.80" diff --git a/src/main.rs b/src/main.rs index 1fad7c7..b1cac9d 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,9 +1,13 @@ use ahash::RandomState; +use anyhow::{anyhow, Result}; use byteorder::{LittleEndian, NativeEndian, ReadBytesExt, WriteBytesExt}; use clap::Parser; use flate2::read::MultiGzDecoder; use flate2::write::GzEncoder; use flate2::Compression; +use glob::glob; +use human_bytes::human_bytes; +use indicatif::{ProgressBar,ProgressStyle}; use rand::Rng; use serde_json::Value; use std::clone::Clone; @@ -15,11 +19,16 @@ use std::io::{BufRead, BufReader, BufWriter, Write}; use std::mem::size_of; use std::path::PathBuf; use std::sync::atomic::{AtomicU32, Ordering}; -use std::sync::Arc; +use std::sync::{Arc, Mutex}; +use std::time::{Instant}; use std::thread::available_parallelism; +use sysinfo::{ + System, +}; use threadpool::ThreadPool; use unicode_segmentation::UnicodeSegmentation; + #[derive(Parser, Debug)] struct Args { #[arg(long)] @@ -27,9 +36,17 @@ struct Args { /// The size of the bloom filter in bytes. If the filter already exists, this parameter is /// ignored. - #[arg(long)] + /// If ==0 this _requires_ that fp_rate is > 0 + #[arg(long, default_value_t=0)] bloom_filter_size: usize, + /// The desired per-ngram false positive rate. 
If bloom_filter_size is not specified, this MUST + /// be specified, and the filter size will be computed using this FP rate and optimal number of + /// hashers. Maxes out at 90% of system RAM + #[arg(long, default_value_t=0.01)] + fp_rate: f64, + + /// The number of expected ngrams. This is used to calculate the optimal number of hashers. /// If the filter already exists, this parameter is ignored. #[arg(long)] @@ -59,6 +76,12 @@ struct Args { #[arg(long, default_value_t = false)] no_update_bloom_filter: bool, + /// Whether or not to save the bloom filter at the end. Defaults to false (i.e., saves the bloom filter) + /// If this is True, the bloom filter will NOT be saved, regardless of what no_update_bloom_filter suggests + #[arg(long, default_value_t = false)] + no_save_bloom_filter: bool, + + /// If this is true, we keep the input intact, but we add an annotation to each document that /// explains which spans from the text would have been deleted. #[arg(long, default_value_t = false)] @@ -315,6 +338,62 @@ impl BloomFilter { } } + + +fn compute_bloom_size(fp_rate: f64, expected_ngram_count: usize) -> usize { + /* Uses binary search to find optimal size of bloom filter using optimal number of hashers + and provided ngram counts + */ + // compute 90% of system ram + let mut sys = System::new_all(); + sys.refresh_all(); + + + let mut lo = 1 as usize; + let mut hi = ((sys.total_memory() as f64) * 0.9) as usize; + + // Save some time by checking endpoint first + if BloomFilter::prob_of_false_positive(hi, expected_ngram_count, + BloomFilter::optimal_number_of_hashers(hi, expected_ngram_count)) > fp_rate { + return hi; + } + + // Then do binary search to find optimal size + while lo < hi-1 { // -1 here because binsearch powers of 2 scare me + let mid = lo + (hi - lo) / 2; + let num_hashers = BloomFilter::optimal_number_of_hashers(mid, expected_ngram_count); + let computed_fp = BloomFilter::prob_of_false_positive(mid, expected_ngram_count, num_hashers) ; + if 
computed_fp > fp_rate { + // FP rate too high, need to go bigger + lo = mid + 1; + } else { + // FP rate too low, can make bloom filter smaller + hi = mid -1; + } + } + hi +} + + +fn expand_dirs(paths: &[PathBuf]) -> Result<Vec<PathBuf>> { + let mut files = vec![]; + for path in paths { + if path.is_dir() { + let path_str = path + .to_str() + .ok_or_else(|| anyhow!("invalid path '{}'", path.to_string_lossy()))?; + for entry in glob(&format!("{}/**/*.json*.gz", path_str))? { + files.push(entry?.to_path_buf()); + } + } else { + files.push(path.clone()); + } + } + + Ok(files) +} + + #[allow(clippy::too_many_arguments)] // TODO : abstract parameters into a struct fn process_file( input_file: &PathBuf, @@ -328,6 +407,7 @@ fn process_file( annotate_attribute_only: bool, whole_document: bool, whole_paragraphs: bool, + pbar: &Arc<Mutex<ProgressBar>>, ) -> Result<(), io::Error> { let input_file = OpenOptions::new() .read(true) @@ -444,33 +524,41 @@ fn process_file( serde_json::to_writer(&mut writer, &data)?; writer.write_all(b"\n")?; } - + pbar.lock().unwrap().inc(1); Ok(()) } fn main() { let args = Args::parse(); + let inputs = expand_dirs(&args.inputs).unwrap(); + println!("Parsed {:?} input files...", inputs.len()); let threads = if args.threads == 0 { available_parallelism().unwrap().get() } else { args.threads }; + + let now = Instant::now(); + let mut bloom_filter_size = args.bloom_filter_size; let bloom_filter = if args.bloom_filter_file.exists() { println!("Loading bloom filter from {:?}...", args.bloom_filter_file); BloomFilter::from_file(&args.bloom_filter_file).unwrap() } else { println!("Creating new bloom filter..."); + if args.bloom_filter_size == 0 { + bloom_filter_size = compute_bloom_size(args.fp_rate, args.expected_ngram_count); + } let num_hashers = BloomFilter::optimal_number_of_hashers( - args.bloom_filter_size, + bloom_filter_size, args.expected_ngram_count, ); - BloomFilter::new(args.bloom_filter_size, num_hashers) + BloomFilter::new(bloom_filter_size, num_hashers) }; let 
bloom_filter = Arc::new(bloom_filter); println!( - "Bloom filter loaded. ({} hashers)", - bloom_filter.hash_builders.len() + "\t...Bloom filter loaded. ({} hashers) ({} seconds)", + bloom_filter.hash_builders.len(), now.elapsed().as_secs() ); let p = bloom_filter.my_prob_of_false_positive(args.expected_ngram_count); @@ -487,6 +575,8 @@ fn main() { } let suggested_size = BloomFilter::suggest_size_in_bytes(args.expected_ngram_count); + println!("Suggested size is {} | Actual size is {} ", + human_bytes(suggested_size as f64), human_bytes(bloom_filter.size_in_bytes() as f64)); if suggested_size * 2 < bloom_filter.size_in_bytes() { println!( "WARNING: Your bloom filter is more than twice as large as suggested for {} elements. \ @@ -494,15 +584,31 @@ fn main() { args.expected_ngram_count ); } + // Build Progress bar (do some hacky arc/mutex wrapping) + let num_files = inputs.len() as u64; + + let pbar = ProgressBar::new(num_files) + .with_style( + ProgressStyle::with_template( + "Files {human_pos}/{human_len} [{elapsed_precise}/{duration_precise}] [{wide_bar:.cyan/blue}]", + ).unwrap() + ); + pbar.inc(0); + //let pbar = ProgressBar::new(num_files); + let now = Instant::now(); + //pbar.set_style(ProgressStyle::with_template( + // "[{elapsed_precise}] {wide_bar:0.cyan/blue} [{pos:>7}/{len:7} {eta}]").unwrap()); + let pbar = Arc::new(Mutex::new(pbar)); let threadpool = ThreadPool::new(threads); - for input in args.inputs { + for input in inputs { let mut output = args.output_directory.clone(); output.push(input.file_name().unwrap()); let bloom_filter = bloom_filter.clone(); + let pbar = pbar.clone(); threadpool.execute(move || { - println!("Processing {input:?}..."); + //println!("Processing {input:?}..."); process_file( &input, &output, @@ -515,13 +621,14 @@ fn main() { args.annotate_attribute_only, args.whole_document, args.whole_paragraphs, + &pbar ) .unwrap(); }); } threadpool.join(); - - if !args.no_update_bloom_filter { + println!("Completed deduplication in {} 
seconds", now.elapsed().as_secs()); + if (!args.no_update_bloom_filter) && (!args.no_save_bloom_filter) { println!("Writing bloom filter to {:?}...", args.bloom_filter_file); bloom_filter.write_to_file(&args.bloom_filter_file).unwrap(); println!("Bloom filter written."); From 10b192aa7f04af2302511c921dffa4abdece7db6 Mon Sep 17 00:00:00 2001 From: Matt Jordan Date: Thu, 7 Mar 2024 15:44:44 -0800 Subject: [PATCH 2/5] Made changes requested in PR --- src/main.rs | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/main.rs b/src/main.rs index b1cac9d..932c53c 100644 --- a/src/main.rs +++ b/src/main.rs @@ -355,11 +355,14 @@ fn compute_bloom_size(fp_rate: f64, expected_ngram_count: usize) -> usize { // Save some time by checking endpoint first if BloomFilter::prob_of_false_positive(hi, expected_ngram_count, BloomFilter::optimal_number_of_hashers(hi, expected_ngram_count)) > fp_rate { + println!( + "WARNING: To achieve desired false-positive rate, you'd need >90% of system RAM. 
Defaulting to 90% \ + system RAM."); return hi; } // Then do binary search to find optimal size - while lo < hi-1 { // -1 here because binsearch powers of 2 scare me + while lo < hi-1 { let mid = lo + (hi - lo) / 2; let num_hashers = BloomFilter::optimal_number_of_hashers(mid, expected_ngram_count); let computed_fp = BloomFilter::prob_of_false_positive(mid, expected_ngram_count, num_hashers) ; @@ -593,13 +596,11 @@ fn main() { "Files {human_pos}/{human_len} [{elapsed_precise}/{duration_precise}] [{wide_bar:.cyan/blue}]", ).unwrap() ); - pbar.inc(0); - //let pbar = ProgressBar::new(num_files); - let now = Instant::now(); - //pbar.set_style(ProgressStyle::with_template( - // "[{elapsed_precise}] {wide_bar:0.cyan/blue} [{pos:>7}/{len:7} {eta}]").unwrap()); + pbar.inc(0); // initalizes pbar let pbar = Arc::new(Mutex::new(pbar)); + + let now = Instant::now(); let threadpool = ThreadPool::new(threads); for input in inputs { let mut output = args.output_directory.clone(); @@ -608,7 +609,6 @@ fn main() { let pbar = pbar.clone(); threadpool.execute(move || { - //println!("Processing {input:?}..."); process_file( &input, &output, From a6b63fd0e6a30429bd71a54c685e2bcc2a1ddd4b Mon Sep 17 00:00:00 2001 From: Matt Jordan Date: Thu, 7 Mar 2024 16:27:11 -0800 Subject: [PATCH 3/5] Added --no-progress flag --- src/main.rs | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/src/main.rs b/src/main.rs index 932c53c..f2fce7f 100644 --- a/src/main.rs +++ b/src/main.rs @@ -105,6 +105,11 @@ struct Args { #[arg(long, default_value_t = false)] whole_paragraphs: bool, + /// If you don't want to include the progress bar, set this to true. + /// Will print out filenames as they get processed if this is true + #[arg(long, default_value_t = false)] + no_progress: bool, + /// The number of threads to use for processing. /// If this is 0, the number of threads is automatically determined. 
#[arg(long, short = 't', default_value_t = 0)] @@ -411,6 +416,7 @@ fn process_file( whole_document: bool, whole_paragraphs: bool, pbar: &Arc>, + no_progress: bool, ) -> Result<(), io::Error> { let input_file = OpenOptions::new() .read(true) @@ -527,7 +533,9 @@ fn process_file( serde_json::to_writer(&mut writer, &data)?; writer.write_all(b"\n")?; } - pbar.lock().unwrap().inc(1); + if !no_progress { + pbar.lock().unwrap().inc(1); + } Ok(()) } @@ -596,8 +604,10 @@ fn main() { "Files {human_pos}/{human_len} [{elapsed_precise}/{duration_precise}] [{wide_bar:.cyan/blue}]", ).unwrap() ); - pbar.inc(0); // initalizes pbar let pbar = Arc::new(Mutex::new(pbar)); + if !args.no_progress { + pbar.lock().unwrap().inc(0); // initalizes pbar + } let now = Instant::now(); @@ -609,6 +619,10 @@ fn main() { let pbar = pbar.clone(); threadpool.execute(move || { + if args.no_progress { + println!("Processing {input:?}..."); + + } process_file( &input, &output, @@ -621,7 +635,8 @@ fn main() { args.annotate_attribute_only, args.whole_document, args.whole_paragraphs, - &pbar + &pbar, + args.no_progress, ) .unwrap(); }); From a7b7ccf5f1e8add9b1af307a09dd5647c0c38442 Mon Sep 17 00:00:00 2001 From: Matt Jordan Date: Wed, 13 Mar 2024 12:16:42 -0700 Subject: [PATCH 4/5] Added options for no-progress (cleaner signature) | Updated README.md to describe some new features --- README.md | 18 +++++++++++++++++- src/main.rs | 25 ++++++++++++++++--------- 2 files changed, 33 insertions(+), 10 deletions(-) diff --git a/README.md b/README.md index 2e500b7..f4be835 100644 --- a/README.md +++ b/README.md @@ -44,12 +44,28 @@ target/release/bff \ --bloom-filter-size 274877906944 \ --expected-ngram-count 1000000000 \ --output-directory deduped/ \ - *.json.gz + *.json.gz # Can also pass a directory containing .json.gz files here ``` Each input file will run in its own thread, and the filter will be shared between them. In the end, as before the filter will be written to disk. 
+### Automatically choosing filter size +To automatically compute the size of the filter, you can instead specify a false-positive rate. `bff` will create a bloom filter to attain that false positive rate, up to 90% of the system RAM. Note that false positive rate is per token-ngram, so the chance of a whole paragraph/document being marked as a false-positive will actually be quite less than the specified `fp-rate`: + +```bash +target/release/bff \ + --bloom-filter-file filter.bff \ + --fp-rate 0.01 \ + --expected-ngram-count 1000000000 \ + --output-directory deduped/ \ + input_dir/ +``` + +Each input file will run in its own thread, and the filter will be shared between them. +In the end, as before the filter will be written to disk. + + ### Pre-load the filter You can stick ngrams into the filter ahead of time, for example if you want to decontaminate your dataset: diff --git a/src/main.rs b/src/main.rs index f2fce7f..16c2b7e 100644 --- a/src/main.rs +++ b/src/main.rs @@ -415,8 +415,7 @@ fn process_file( annotate_attribute_only: bool, whole_document: bool, whole_paragraphs: bool, - pbar: &Arc>, - no_progress: bool, + pbar_option: &Option>>, ) -> Result<(), io::Error> { let input_file = OpenOptions::new() .read(true) @@ -533,9 +532,11 @@ fn process_file( serde_json::to_writer(&mut writer, &data)?; writer.write_all(b"\n")?; } - if !no_progress { - pbar.lock().unwrap().inc(1); + match pbar_option { + Some(pbar) => pbar.lock().unwrap().inc(1), + None => (), } + Ok(()) } @@ -605,23 +606,30 @@ fn main() { ).unwrap() ); let pbar = Arc::new(Mutex::new(pbar)); + + if !args.no_progress { - pbar.lock().unwrap().inc(0); // initalizes pbar + pbar.lock().unwrap().inc(0); // initializes pbar } + let now = Instant::now(); let threadpool = ThreadPool::new(threads); for input in inputs { let mut output = args.output_directory.clone(); output.push(input.file_name().unwrap()); let bloom_filter = bloom_filter.clone(); - let pbar = pbar.clone(); + + let pbar_option: Option>> = if 
args.no_progress { + None + } else { + Some(pbar.clone()) + }; threadpool.execute(move || { if args.no_progress { println!("Processing {input:?}..."); - } process_file( &input, @@ -635,8 +643,7 @@ fn main() { args.annotate_attribute_only, args.whole_document, args.whole_paragraphs, - &pbar, - args.no_progress, + &pbar_option, ) .unwrap(); }); From 2cc9cacafc4adc5313cdb084b616182de33663b7 Mon Sep 17 00:00:00 2001 From: Matt Jordan Date: Wed, 13 Mar 2024 12:17:35 -0700 Subject: [PATCH 5/5] Cleaned up readme duplicates --- README.md | 3 --- 1 file changed, 3 deletions(-) diff --git a/README.md b/README.md index f4be835..bf6060b 100644 --- a/README.md +++ b/README.md @@ -62,9 +62,6 @@ target/release/bff \ input_dir/ ``` -Each input file will run in its own thread, and the filter will be shared between them. -In the end, as before the filter will be written to disk. - ### Pre-load the filter