From 77e51600898990a79ed251a7f7078da1959bf5c6 Mon Sep 17 00:00:00 2001 From: Matt Jordan Date: Thu, 29 Feb 2024 12:45:17 -0800 Subject: [PATCH 01/56] Barebones python/S3 implementation --- bff_v0.py | 169 ++++++++++++++++++++++++++++++++++++++++++++++++++++ src/main.rs | 9 ++- 2 files changed, 173 insertions(+), 5 deletions(-) create mode 100644 bff_v0.py diff --git a/bff_v0.py b/bff_v0.py new file mode 100644 index 0000000..a37da9d --- /dev/null +++ b/bff_v0.py @@ -0,0 +1,169 @@ +""" Quick'n'dirty mapping of bff for python before I can make it pure-rust +How this works: (using ray) +- map to DOWNLOAD all files and store on some local location +- use this local mapping to feed to bff.rs +- map to UPLOAD all files to some output location +*Yes I know this is clunky and requires 2x dataset size in local storage + +ASSUMES .jsonl.gz files, with 'text' field and FLAT file structure on s3 +i.e. + s3://bucket/input_dir/ + file_00.jsonl.gz + file_01.jsonl.gz + . + . + . + file_N.jsonl.gz + # AND NO SUBDIRECTORIES! 
+ +TODOS: +- Add default tokenizer/counter to get good filter/ngram sizes automatically +- Make this pure rust + + Only need |num_threads| * |file_size| *2 + |filter_size| memory + + Sadly I code rust like a preschooler, so I'm working on this =( +- Add a progress bar vs printing filenames w/in rust code +""" + + +import argparse +import ray +import boto3 +from pathlib import Path +import os +from tqdm.auto import tqdm +import glob +import subprocess + + +RUST_CMD = os.path.join(os.path.dirname(__file__), 'target/release/bff') + +# ================================================= +# = Ray/S3 Utils = +# ================================================= + +def to_iterator(obj_ids, batch_size=100): + while obj_ids: + done, obj_ids = ray.wait(obj_ids, num_returns=min(batch_size, len(obj_ids))) + for d in done: + yield ray.get(d) + + +def block_until_complete(ret): + """Use this when we want to efficiently wait until an iterable + of ray object ids finishes + """ + + for _ in tqdm(to_iterator(ret), total=len(ret)): + pass + ret = ray.get(ret) + + +def copy_to_s3(local_obj, remote_dir, disable_tqdm=True): + if remote_dir == None: + return + remote_obj = os.path.join(remote_dir, os.path.basename(local_obj)).replace("s3://", "") + bucket, key = remote_obj.split("/", 1) + s3 = boto3.client("s3") + local_obj_size = os.path.getsize(local_obj) + with tqdm(total=local_obj_size, unit="B", unit_scale=True, desc=local_obj, disable=disable_tqdm) as pbar: + s3.upload_file(Bucket=bucket, Filename=local_obj, Key=key, Callback=lambda b: pbar.update(b)) + +@ray.remote +def RAY_copy_to_s3(local_obj, remote_dir, disable_tqdm=True): + copy_to_s3(local_obj, remote_dir, disable_tqdm=disable_tqdm) + + +def copy_from_s3(remote_obj, local_dir, disable_tqdm=True): + bucket, key = remote_obj.replace("s3://", "").split("/", 1) + s3 = boto3.client("s3") + remote_obj_size = s3.head_object(Bucket=bucket, Key=key)["ContentLength"] + target = os.path.join(local_dir, os.path.basename(key)) + with 
tqdm(total=remote_obj_size, unit="B", unit_scale=True, desc=remote_obj, disable=disable_tqdm) as pbar: + s3.download_file(Bucket=bucket, Key=key, Filename=target, Callback=lambda b: pbar.update(b)) + +@ray.remote +def RAY_copy_from_s3(remote_obj, local_dir, disable_tqdm=True): + copy_from_s3(remote_obj, local_dir, disable_tqdm=disable_tqdm) + + +def list_s3_keys(prefix, suffix='.jsonl.gz'): + """ Lists full s3 uri's of all keys that match both the prefix and suffix. + if Suffix==None => no filtering on suffx + """ + s3 = boto3.resource("s3") + bucket_name, path_within_bucket = prefix.replace("s3://", "").split("/", 1) + bucket = s3.Bucket(bucket_name) + + s3_uris = [] + for x in bucket.objects.filter(Prefix=path_within_bucket): + if suffix == None or x.key.endswith(suffix): + s3_uris.append(os.path.join('s3://%s' % bucket_name, x.key)) + return s3_uris + + + +# ================================================ +# = Main block = +# ================================================ + + +def run_bff(tmp_dir, input_dir, output_dir, filter_size, expected_ngram_count): + # Subprocess DIY python<->rust bindings. 
Might be better to use maturin, but this is a bit simpler + cmd = '%s --bloom-filter-file %s --bloom-filter-size %s --expected-ngram-count %s --output-directory %s %s' % ( + RUST_CMD, + os.path.join(tmp_dir, 'filter.bff'), + filter_size, + expected_ngram_count, + os.path.join(output_dir, ''), + os.path.join(input_dir, '*') + ) + return_code = subprocess.run(cmd, shell=True).returncode + assert return_code == 0 + + +def main(s3_input_dir, s3_output_dir, filter_size, + expected_ngram_count, tmp_storage_dir, suffix='.jsonl.gz'): + # Step 0: initialize ray and args and local storage directories + ray.init(ignore_reinit_error=True) + + input_dir = os.path.join(tmp_storage_dir, 'input') + output_dir = os.path.join(tmp_storage_dir, 'output') + for p in input_dir, output_dir: + Path(p).mkdir(parents=True, exist_ok=True) + + # step 1: collect and download s3 files to disk + print("Collecting and downloading s3 files...") + s3_uris = list_s3_keys(s3_input_dir, suffix=suffix)[:10] + download_refs = [RAY_copy_from_s3.remote(uri, input_dir) for uri in s3_uris] + block_until_complete(download_refs) + + # Step 2: Run BFF + print("Running BFF on %s local files..." 
% len(s3_uris)) + run_bff(tmp_storage_dir, input_dir, output_dir, filter_size, expected_ngram_count) + + + # Step 3: upload output files to S3 + print("Uploading filtered files...") + output_files = glob.glob(os.path.join(output_dir, '*%s' % suffix if suffix != None else '*')) + upload_refs = [RAY_copy_to_s3.remote(f, s3_output_dir) for f in output_files] + block_until_complete(upload_refs) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + + # S3 adapter args + parser.add_argument('--s3-input-dir', type=str) + parser.add_argument('--s3-output-dir', type=str) + + # Args passed to rust function + parser.add_argument('--filter-size', type=int) + parser.add_argument('--expected-ngram-count', type=int) + parser.add_argument('--tmp-storage-dir', type=str, default='/tmp/bff') + + args = parser.parse_args() + main(s3_input_dir=args.s3_input_dir, + s3_output_dir=args.s3_output_dir, + filter_size=args.filter_size, + expected_ngram_count=args.expected_ngram_count, + tmp_storage_dir=args.tmp_storage_dir) diff --git a/src/main.rs b/src/main.rs index 1fad7c7..00a62a5 100644 --- a/src/main.rs +++ b/src/main.rs @@ -175,12 +175,11 @@ impl BloomFilter { hash_builder_seeds.push(seeds); } - let mut bits = Vec::new(); let number_of_u32 = size_in_bytes / size_of::(); - bits.reserve_exact(number_of_u32); - for _ in 0..number_of_u32 { - bits.push(AtomicU32::new(0)); - } + let mut bits = { + (0..number_of_u32).map(|_| AtomicU32::default()).collect() + }; + Self { bits, From ab026494a1a4ff6a56631333eae68e4b9338b594 Mon Sep 17 00:00:00 2001 From: Matt Jordan Date: Tue, 5 Mar 2024 15:15:17 -0800 Subject: [PATCH 02/56] Bash script written --- Cargo.toml | 3 +- scripts/s3_script.sh | 97 ++++++++++++++++++++++++++++++++++++++++++++ src/main.rs | 36 ++++++++++++---- 3 files changed, 128 insertions(+), 8 deletions(-) create mode 100644 scripts/s3_script.sh diff --git a/Cargo.toml b/Cargo.toml index ad43e7d..0ef6c6f 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -13,4 +13,5 @@ 
unicode-segmentation = "1.7" rand = "0.8.4" ahash = { version = "0.8.1", features = ["runtime-rng"] } byteorder = "1" -threadpool = "1.8.1" \ No newline at end of file +threadpool = "1.8.1" +indicatif = "*" \ No newline at end of file diff --git a/scripts/s3_script.sh b/scripts/s3_script.sh new file mode 100644 index 0000000..1883ece --- /dev/null +++ b/scripts/s3_script.sh @@ -0,0 +1,97 @@ +#!/bin/bash + +#Mostly GPT-written bash script to do the following: +#1. Download all jsonl.gz files from s3-src to stash-dir +#2. Compute arguments to feed to bff. Defaults: +# a. filter-size: (Main memory) * 0.90 = filter size +# b. expected-ngram-count: use wimbd to compute +#3. Run BFF +#4. Upload back to S3 +# + +# Parse the named arguments +while [[ $# -gt 0 ]]; do + key="$1" + + case $key in + -i|--s3-input) + s3_input_dir="$2" + shift + shift + ;; + -t|--temp-dir) + temp_dir="$2" + shift + shift + ;; + -o|--s3-output) + s3_output_dir="$2" + shift + shift + ;; + *) + echo "Unknown option: $key" + exit 1 + ;; + esac +done + +input_files="$temp_dir/input" +output_files="$temp_dir/output" +mkdir -p $input_files +mkdir -p $output_files + +# ======================================== +# = Download from S3 = +# ======================================== +aws s3 cp $s3_input_dir $input_files --recursive + + + +# ======================================================= +# = Compute stats to give to script = +# ======================================================= + +# Use wimbd to get total tokens +wimbd_stats=$(wimbd stats $input_files/*.jsonl.gz) + +total_tokens=$(echo "$wimbd_stats" | grep "^total tokens" | sed 's/^total tokens: //' | tr -d ',') +total_documents=$(echo "$wimbd_stats" | grep "^total documents" | sed 's/^total documents: //' | tr -d ',') +expected_ngrams=$(( total_tokens-total_documents )) + + +echo "EXPECTED NGRAMS $expected_ngrams" + + +# do os-varying memory count +if [ "$(uname)" = "Linux" ]; then + total_memory=$(free | awk 'NR==2 {print $2}'); +fi + +# For 
macOS +if [ "$(uname)"="Darwin" ]; then + total_memory=$(sysctl -n hw.memsize) +fi + +bloom_filter_size=$(( total_memory * 1 / 100 )) +bloom_filter_size=${bloom_filter_size%.*} + +echo "FILTER $bloom_filter_size" + +# ====================================================== +# = Actually run bff = +# ====================================================== +target/release/bff --bloom-filter-file filter.bff --bloom-filter-size $bloom_filter_size --expected-ngram-count $expected_ngrams --output-directory $output_files $input_files/*.jsonl.gz + + +# ================================================== +# = And then upload back to S3 = +# ================================================== +aws s3 cp $output_files $s3_output_dir --recursive + + + + + + + diff --git a/src/main.rs b/src/main.rs index 00a62a5..1b4e739 100644 --- a/src/main.rs +++ b/src/main.rs @@ -15,10 +15,14 @@ use std::io::{BufRead, BufReader, BufWriter, Write}; use std::mem::size_of; use std::path::PathBuf; use std::sync::atomic::{AtomicU32, Ordering}; -use std::sync::Arc; +use std::sync::{Arc,Mutex}; use std::thread::available_parallelism; +use std::time::Instant; use threadpool::ThreadPool; use unicode_segmentation::UnicodeSegmentation; +use indicatif::{ProgressBar,ProgressStyle}; + + #[derive(Parser, Debug)] struct Args { @@ -327,6 +331,7 @@ fn process_file( annotate_attribute_only: bool, whole_document: bool, whole_paragraphs: bool, + pbar: Arc> ) -> Result<(), io::Error> { let input_file = OpenOptions::new() .read(true) @@ -443,7 +448,7 @@ fn process_file( serde_json::to_writer(&mut writer, &data)?; writer.write_all(b"\n")?; } - + pbar.lock().unwrap().inc(1); // increment progress bar Ok(()) } @@ -455,6 +460,7 @@ fn main() { args.threads }; + let now = Instant::now(); let bloom_filter = if args.bloom_filter_file.exists() { println!("Loading bloom filter from {:?}...", args.bloom_filter_file); BloomFilter::from_file(&args.bloom_filter_file).unwrap() @@ -468,8 +474,8 @@ fn main() { }; let bloom_filter = 
Arc::new(bloom_filter); println!( - "Bloom filter loaded. ({} hashers)", - bloom_filter.hash_builders.len() + "Bloom filter loaded. ({} hashers) ({} ms)", + bloom_filter.hash_builders.len(), now.elapsed().as_millis() ); let p = bloom_filter.my_prob_of_false_positive(args.expected_ngram_count); @@ -494,14 +500,29 @@ fn main() { ); } + // Build Progress bar (do some hacky arc/mutex wrapping) + let num_files = args.inputs.len() as u64; + + let pbar = ProgressBar::new(num_files) + .with_style( + ProgressStyle::with_template( + "Files {human_pos}/{human_len} [{elapsed_precise}/{duration_precise}] [{wide_bar:.cyan/blue}]", + ).unwrap() + ); + //let pbar = ProgressBar::new(num_files); + let now = Instant::now(); + //pbar.set_style(ProgressStyle::with_template( + // "[{elapsed_precise}] {wide_bar:0.cyan/blue} [{pos:>7}/{len:7} {eta}]").unwrap()); + let pbar = Arc::new(Mutex::new(pbar)); + let threadpool = ThreadPool::new(threads); for input in args.inputs { let mut output = args.output_directory.clone(); output.push(input.file_name().unwrap()); let bloom_filter = bloom_filter.clone(); - + let pbar = pbar.clone(); threadpool.execute(move || { - println!("Processing {input:?}..."); + // println!("Processing {input:?}..."); // Commenting out so ProgressBar works more cleanly process_file( &input, &output, @@ -514,12 +535,13 @@ fn main() { args.annotate_attribute_only, args.whole_document, args.whole_paragraphs, + pbar, ) .unwrap(); }); } threadpool.join(); - + println!("Processed {} files in {} s", num_files, now.elapsed().as_secs()); if !args.no_update_bloom_filter { println!("Writing bloom filter to {:?}...", args.bloom_filter_file); bloom_filter.write_to_file(&args.bloom_filter_file).unwrap(); From 8aaf3c9f96773a43f4ed65fca70952dc2a725d38 Mon Sep 17 00:00:00 2001 From: Matt Jordan Date: Thu, 7 Mar 2024 14:34:16 -0800 Subject: [PATCH 03/56] Added i) directory support, ii) FP rate args, iii) No-Save option --- Cargo.lock | 499 
++++++++++++++++++++++++++++++++++------------------ Cargo.toml | 7 +- src/main.rs | 129 ++++++++++++-- 3 files changed, 450 insertions(+), 185 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 0e67c77..51ac852 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -10,53 +10,101 @@ checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe" [[package]] name = "ahash" -version = "0.8.3" +version = "0.8.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2c99f64d1e06488f620f932677e24bc6e2897582980441ae90a671415bd7ec2f" +checksum = "e89da841a80418a9b391ebaea17f5c112ffaaa96f621d2c285b5174da76b9011" dependencies = [ "cfg-if", "getrandom", "once_cell", "version_check", + "zerocopy", ] +[[package]] +name = "anstream" +version = "0.6.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d96bd03f33fe50a863e394ee9718a706f988b9079b20c3784fb726e7678b62fb" +dependencies = [ + "anstyle", + "anstyle-parse", + "anstyle-query", + "anstyle-wincon", + "colorchoice", + "utf8parse", +] + +[[package]] +name = "anstyle" +version = "1.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8901269c6307e8d93993578286ac0edf7f195079ffff5ebdeea6a59ffb7e36bc" + +[[package]] +name = "anstyle-parse" +version = "0.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c75ac65da39e5fe5ab759307499ddad880d724eed2f6ce5b5e8a26f4f387928c" +dependencies = [ + "utf8parse", +] + +[[package]] +name = "anstyle-query" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e28923312444cdd728e4738b3f9c9cac739500909bb3d3c94b43551b16517648" +dependencies = [ + "windows-sys", +] + +[[package]] +name = "anstyle-wincon" +version = "3.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1cd54b81ec8d6180e24654d0b371ad22fc3dd083b6ff8ba325b72e00c87660a7" +dependencies = [ + "anstyle", + "windows-sys", +] + +[[package]] 
+name = "anyhow" +version = "1.0.80" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5ad32ce52e4161730f7098c077cd2ed6229b5804ccf99e5366be1ab72a98b4e1" + [[package]] name = "bff" version = "0.1.0" dependencies = [ "ahash", + "anyhow", "byteorder", "clap", "flate2", + "glob", + "human_bytes", + "indicatif", "rand", "serde_json", + "sysinfo", "threadpool", "unicode-segmentation", ] -[[package]] -name = "bitflags" -version = "1.3.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" - -[[package]] -name = "bitflags" -version = "2.0.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "487f1e0fcbe47deb8b0574e646def1c903389d95241dd1bbcc6ce4a715dfc0c1" - [[package]] name = "byteorder" -version = "1.4.3" +version = "1.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "14c189c53d098945499cdfa7ecc63567cf3886b3332b312a5b4585d8d3a6a610" +checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b" [[package]] name = "cc" -version = "1.0.79" +version = "1.0.90" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "50d30906286121d95be3d479533b458f87493b30a4b5f79a607db8f5d11aa91f" +checksum = "8cd6604a82acf3039f1144f54b8eb34e91ffba622051189e71b781822d5ee1f5" [[package]] name = "cfg-if" @@ -66,27 +114,33 @@ checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" [[package]] name = "clap" -version = "4.1.11" +version = "4.5.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "42dfd32784433290c51d92c438bb72ea5063797fc3cc9a21a8c4346bebbb2098" +checksum = "b230ab84b0ffdf890d5a10abdbc8b83ae1c4918275daea1ab8801f71536b2651" dependencies = [ - "bitflags 2.0.2", + "clap_builder", "clap_derive", +] + +[[package]] +name = "clap_builder" +version = "4.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" 
+checksum = "ae129e2e766ae0ec03484e609954119f123cc1fe650337e155d03b022f24f7b4" +dependencies = [ + "anstream", + "anstyle", "clap_lex", - "is-terminal", - "once_cell", "strsim", - "termcolor", ] [[package]] name = "clap_derive" -version = "4.1.9" +version = "4.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fddf67631444a3a3e3e5ac51c36a5e01335302de677bd78759eaa90ab1f46644" +checksum = "307bc0538d5f0f83b8248db3087aa92fe504e4691294d0c96c0eabc33f47ba47" dependencies = [ "heck", - "proc-macro-error", "proc-macro2", "quote", "syn", @@ -94,57 +148,95 @@ dependencies = [ [[package]] name = "clap_lex" -version = "0.3.3" +version = "0.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "033f6b7a4acb1f358c742aaca805c939ee73b4c6209ae4318ec7aca81c42e646" -dependencies = [ - "os_str_bytes", -] +checksum = "98cc8fbded0c607b7ba9dd60cd98df59af97e84d24e49c8557331cfc26d301ce" [[package]] name = "cmake" -version = "0.1.49" +version = "0.1.50" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "db34956e100b30725f2eb215f90d4871051239535632f84fea3bc92722c66b7c" +checksum = "a31c789563b815f77f4250caee12365734369f942439b7defd71e18a48197130" dependencies = [ "cc", ] +[[package]] +name = "colorchoice" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "acbf1af155f9b9ef647e42cdc158db4b64a1b61f743629225fde6f3e0be2a7c7" + +[[package]] +name = "console" +version = "0.15.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0e1f83fc076bd6dd27517eacdf25fef6c4dfe5f1d7448bafaaf3a26f13b5e4eb" +dependencies = [ + "encode_unicode", + "lazy_static", + "libc", + "unicode-width", + "windows-sys", +] + +[[package]] +name = "core-foundation-sys" +version = "0.8.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "06ea2b9bc92be3c2baa9334a323ebca2d6f074ff852cd1d7b11064035cd3868f" + [[package]] name = "crc32fast" 
-version = "1.3.2" +version = "1.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b540bd8bc810d3885c6ea91e2018302f68baba2129ab3e88f32389ee9370880d" +checksum = "b3855a8a784b474f333699ef2bbca9db2c4a1f6d9088a90a2d25b1eb53111eaa" dependencies = [ "cfg-if", ] [[package]] -name = "errno" -version = "0.2.8" +name = "crossbeam-deque" +version = "0.8.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f639046355ee4f37944e44f60642c6f3a7efa3cf6b78c78a0d989a8ce6c396a1" +checksum = "613f8cc01fe9cf1a3eb3d7f488fd2fa8388403e97039e2f73692932e291a770d" dependencies = [ - "errno-dragonfly", - "libc", - "winapi", + "crossbeam-epoch", + "crossbeam-utils", ] [[package]] -name = "errno-dragonfly" -version = "0.1.2" +name = "crossbeam-epoch" +version = "0.9.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "aa68f1b12764fab894d2755d2518754e71b4fd80ecfb822714a1206c2aab39bf" +checksum = "5b82ac4a3c2ca9c3460964f020e1402edd5753411d7737aa39c3714ad1b5420e" dependencies = [ - "cc", - "libc", + "crossbeam-utils", ] +[[package]] +name = "crossbeam-utils" +version = "0.8.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "248e3bacc7dc6baa3b21e405ee045c3047101a49145e7e9eca583ab4c2ca5345" + +[[package]] +name = "either" +version = "1.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "11157ac094ffbdde99aa67b23417ebdd801842852b500e395a45a9c0aac03e4a" + +[[package]] +name = "encode_unicode" +version = "0.3.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a357d28ed41a50f9c765dbfe56cbc04a64e53e5fc58ba79fbc34c10ef3df831f" + [[package]] name = "flate2" -version = "1.0.25" +version = "1.0.28" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a8a2db397cb1c8772f31494cb8917e48cd1e64f0fa7efac59fbd741a0a8ce841" +checksum = "46303f565772937ffe1d394a4fac6f411c6013172fadde9dcdb1e147a086940e" 
dependencies = [ "crc32fast", "libz-ng-sys", @@ -153,15 +245,21 @@ dependencies = [ [[package]] name = "getrandom" -version = "0.2.8" +version = "0.2.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c05aeb6a22b8f62540c194aac980f2115af067bfe15a0734d7277a768d396b31" +checksum = "190092ea657667030ac6a35e305e62fc4dd69fd98ac98631e5d3a2b1575a12b5" dependencies = [ "cfg-if", "libc", "wasi", ] +[[package]] +name = "glob" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d2fabcfbdc87f4758337ca535fb41a6d701b65693ce38287d856d1674551ec9b" + [[package]] name = "heck" version = "0.4.1" @@ -170,100 +268,111 @@ checksum = "95505c38b4572b2d910cecb0281560f54b440a19336cbbcb27bf6ce6adc6f5a8" [[package]] name = "hermit-abi" -version = "0.2.6" +version = "0.3.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ee512640fe35acbfb4bb779db6f0d80704c2cacfa2e39b601ef3e3f47d1ae4c7" -dependencies = [ - "libc", -] +checksum = "d231dfb89cfffdbc30e7fc41579ed6066ad03abda9e567ccafae602b97ec5024" [[package]] -name = "hermit-abi" -version = "0.3.1" +name = "human_bytes" +version = "0.4.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fed44880c466736ef9a5c5b5facefb5ed0785676d0c02d612db14e54f0d84286" +checksum = "91f255a4535024abf7640cb288260811fc14794f62b063652ed349f9a6c2348e" [[package]] -name = "io-lifetimes" -version = "1.0.9" +name = "indicatif" +version = "0.17.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "09270fd4fa1111bc614ed2246c7ef56239a3063d5be0d1ec3b589c505d400aeb" +checksum = "763a5a8f45087d6bcea4222e7b72c291a054edf80e4ef6efd2a4979878c7bea3" dependencies = [ - "hermit-abi 0.3.1", - "libc", - "windows-sys", + "console", + "instant", + "number_prefix", + "portable-atomic", + "unicode-width", ] [[package]] -name = "is-terminal" -version = "0.4.5" +name = "instant" +version = "0.1.12" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "8687c819457e979cc940d09cb16e42a1bf70aa6b60a549de6d3a62a0ee90c69e" +checksum = "7a5bbe824c507c5da5956355e86a746d82e0e1464f65d862cc5e71da70e94b2c" dependencies = [ - "hermit-abi 0.3.1", - "io-lifetimes", - "rustix", - "windows-sys", + "cfg-if", ] [[package]] name = "itoa" -version = "1.0.6" +version = "1.0.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b1a46d1a171d865aa5f83f92695765caa047a9b4cbae2cbf37dbd613a793fd4c" + +[[package]] +name = "lazy_static" +version = "1.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "453ad9f582a441959e5f0d088b02ce04cfe8d51a8eaf077f12ac6d3e94164ca6" +checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646" [[package]] name = "libc" -version = "0.2.140" +version = "0.2.153" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "99227334921fae1a979cf0bfdfcc6b3e5ce376ef57e16fb6fb3ea2ed6095f80c" +checksum = "9c198f91728a82281a64e1f4f9eeb25d82cb32a5de251c6bd1b5154d63a8e7bd" [[package]] name = "libz-ng-sys" -version = "1.1.8" +version = "1.1.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4399ae96a9966bf581e726de86969f803a81b7ce795fcd5480e640589457e0f2" +checksum = "c6409efc61b12687963e602df8ecf70e8ddacf95bc6576bcf16e3ac6328083c5" dependencies = [ "cmake", "libc", ] [[package]] -name = "linux-raw-sys" -version = "0.1.4" +name = "miniz_oxide" +version = "0.7.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f051f77a7c8e6957c0696eac88f26b0117e54f52d3fc682ab19397a8812846a4" +checksum = "9d811f3e15f28568be3407c8e7fdb6514c1cda3cb30683f15b6a1a1dc4ea14a7" +dependencies = [ + "adler", +] [[package]] -name = "miniz_oxide" -version = "0.6.2" +name = "ntapi" +version = "0.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"b275950c28b37e794e8c55d88aeb5e139d0ce23fdbbeda68f8d7174abdf9e8fa" +checksum = "e8a3895c6391c39d7fe7ebc444a87eb2991b2a0bc718fdabd071eec617fc68e4" dependencies = [ - "adler", + "winapi", ] [[package]] name = "num_cpus" -version = "1.15.0" +version = "1.16.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0fac9e2da13b5eb447a6ce3d392f23a29d8694bff781bf03a16cd9ac8697593b" +checksum = "4161fcb6d602d4d2081af7c3a45852d875a03dd337a6bfdd6e06407b61342a43" dependencies = [ - "hermit-abi 0.2.6", + "hermit-abi", "libc", ] +[[package]] +name = "number_prefix" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "830b246a0e5f20af87141b25c173cd1b609bd7779a4617d6ec582abaf90870f3" + [[package]] name = "once_cell" -version = "1.17.1" +version = "1.19.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b7e5500299e16ebb147ae15a00a942af264cf3688f47923b8fc2cd5858f23ad3" +checksum = "3fdb12b2476b595f9358c5161aa467c2438859caa136dec86c26fdd2efe17b92" [[package]] -name = "os_str_bytes" -version = "6.5.0" +name = "portable-atomic" +version = "1.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ceedf44fb00f2d1984b0bc98102627ce622e083e49a5bacdb3e514fa4238e267" +checksum = "7170ef9988bc169ba16dd36a7fa041e5c4cbeb6a35b76d4c03daded371eae7c0" [[package]] name = "ppv-lite86" @@ -271,44 +380,20 @@ version = "0.2.17" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5b40af805b3121feab8a3c29f04d8ad262fa8e0561883e7653e024ae4479e6de" -[[package]] -name = "proc-macro-error" -version = "1.0.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "da25490ff9892aab3fcf7c36f08cfb902dd3e71ca0f9f9517bea02a73a5ce38c" -dependencies = [ - "proc-macro-error-attr", - "proc-macro2", - "quote", - "syn", - "version_check", -] - -[[package]] -name = "proc-macro-error-attr" -version = "1.0.4" -source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "a1be40180e52ecc98ad80b184934baf3d0d29f979574e439af5a55274b35f869" -dependencies = [ - "proc-macro2", - "quote", - "version_check", -] - [[package]] name = "proc-macro2" -version = "1.0.53" +version = "1.0.78" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ba466839c78239c09faf015484e5cc04860f88242cff4d03eb038f04b4699b73" +checksum = "e2422ad645d89c99f8f3e6b88a9fdeca7fabeac836b1002371c4367c8f984aae" dependencies = [ "unicode-ident", ] [[package]] name = "quote" -version = "1.0.26" +version = "1.0.35" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4424af4bf778aae2051a77b60283332f386554255d722233d09fbfc7e30da2fc" +checksum = "291ec9ab5efd934aaf503a6466c5d5251535d108ee747472c3977cc5acc868ef" dependencies = [ "proc-macro2", ] @@ -344,36 +429,56 @@ dependencies = [ ] [[package]] -name = "rustix" -version = "0.36.11" +name = "rayon" +version = "1.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "db4165c9963ab29e422d6c26fbc1d37f15bace6b2810221f9d925023480fcf0e" +checksum = "e4963ed1bc86e4f3ee217022bd855b297cef07fb9eac5dfa1f788b220b49b3bd" dependencies = [ - "bitflags 1.3.2", - "errno", - "io-lifetimes", - "libc", - "linux-raw-sys", - "windows-sys", + "either", + "rayon-core", +] + +[[package]] +name = "rayon-core" +version = "1.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1465873a3dfdaa8ae7cb14b4383657caab0b3e8a0aa9ae8e04b044854c8dfce2" +dependencies = [ + "crossbeam-deque", + "crossbeam-utils", ] [[package]] name = "ryu" -version = "1.0.13" +version = "1.0.17" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f91339c0467de62360649f8d3e185ca8de4224ff281f66000de5eb2a77a79041" +checksum = "e86697c916019a8588c99b5fac3cead74ec0b4b819707a682fd4d23fa0ce1ba1" [[package]] name = "serde" -version = "1.0.158" +version = "1.0.197" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "3fb1c873e1b9b056a4dc4c0c198b24c3ffa059243875552b2bd0933b1aee4ce2" +dependencies = [ + "serde_derive", +] + +[[package]] +name = "serde_derive" +version = "1.0.197" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "771d4d9c4163ee138805e12c710dd365e4f44be8be0503cb1bb9eb989425d9c9" +checksum = "7eb0b34b42edc17f6b7cac84a52a1c5f0e1bb2227e997ca9011ea3dd34e8610b" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] [[package]] name = "serde_json" -version = "1.0.94" +version = "1.0.114" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1c533a59c9d8a93a09c6ab31f0fd5e5f4dd1b8fc9434804029839884765d04ea" +checksum = "c5f09b1bd632ef549eaa9f60a1f8de742bdbc698e6cee2095fc84dde5f549ae0" dependencies = [ "itoa", "ryu", @@ -382,15 +487,15 @@ dependencies = [ [[package]] name = "strsim" -version = "0.10.0" +version = "0.11.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "73473c0e59e6d5812c5dfe2a064a6444949f089e20eec9a2e5506596494e4623" +checksum = "5ee073c9e4cd00e28217186dbe12796d692868f432bf2e97ee73bed0c56dfa01" [[package]] name = "syn" -version = "1.0.109" +version = "2.0.52" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "72b64191b275b66ffe2469e8af2c1cfe3bafa67b529ead792a6d0160888b4237" +checksum = "b699d15b36d1f02c3e7c69f8ffef53de37aefae075d8488d4ba1a7788d574a07" dependencies = [ "proc-macro2", "quote", @@ -398,12 +503,18 @@ dependencies = [ ] [[package]] -name = "termcolor" -version = "1.2.0" +name = "sysinfo" +version = "0.30.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "be55cf8942feac5c765c2c993422806843c9a9a45d4d5c407ad6dd2ea95eb9b6" +checksum = "0c385888ef380a852a16209afc8cfad22795dd8873d69c9a14d2e2088f118d18" dependencies = [ - "winapi-util", + "cfg-if", + "core-foundation-sys", + "libc", + "ntapi", + "once_cell", + "rayon", + "windows", ] [[package]] 
@@ -417,15 +528,27 @@ dependencies = [ [[package]] name = "unicode-ident" -version = "1.0.8" +version = "1.0.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e5464a87b239f13a63a501f2701565754bae92d243d4bb7eb12f6d57d2269bf4" +checksum = "3354b9ac3fae1ff6755cb6db53683adb661634f67557942dea4facebec0fee4b" [[package]] name = "unicode-segmentation" -version = "1.10.1" +version = "1.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d4c87d22b6e3f4a18d4d40ef354e97c90fcb14dd91d7dc0aa9d8a1172ebf7202" + +[[package]] +name = "unicode-width" +version = "0.1.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e51733f11c9c4f72aa0c160008246859e340b00807569a0da0e7a1079b27ba85" + +[[package]] +name = "utf8parse" +version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1dd624098567895118886609431a7c3b8f516e41d30e0643f03d94592a147e36" +checksum = "711b9620af191e0cdc7468a8d14e709c3dcdb115b36f838e601583af800a370a" [[package]] name = "version_check" @@ -456,34 +579,44 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" [[package]] -name = "winapi-util" -version = "0.1.5" +name = "winapi-x86_64-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" + +[[package]] +name = "windows" +version = "0.52.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "70ec6ce85bb158151cae5e5c87f95a8e97d2c0c4b001223f33a334e3ce5de178" +checksum = "e48a53791691ab099e5e2ad123536d0fff50652600abaf43bbf952894110d0be" dependencies = [ - "winapi", + "windows-core", + "windows-targets", ] [[package]] -name = "winapi-x86_64-pc-windows-gnu" -version = "0.4.0" +name = "windows-core" +version = "0.52.0" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" +checksum = "33ab640c8d7e35bf8ba19b884ba838ceb4fba93a4e8c65a9059d08afcfc683d9" +dependencies = [ + "windows-targets", +] [[package]] name = "windows-sys" -version = "0.45.0" +version = "0.52.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "75283be5efb2831d37ea142365f009c02ec203cd29a3ebecbc093d52315b66d0" +checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d" dependencies = [ "windows-targets", ] [[package]] name = "windows-targets" -version = "0.42.2" +version = "0.52.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8e5180c00cd44c9b1c88adb3693291f1cd93605ded80c250a75d472756b4d071" +checksum = "7dd37b7e5ab9018759f893a1952c9420d060016fc19a472b4bb20d1bdd694d1b" dependencies = [ "windows_aarch64_gnullvm", "windows_aarch64_msvc", @@ -496,42 +629,62 @@ dependencies = [ [[package]] name = "windows_aarch64_gnullvm" -version = "0.42.2" +version = "0.52.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "597a5118570b68bc08d8d59125332c54f1ba9d9adeedeef5b99b02ba2b0698f8" +checksum = "bcf46cf4c365c6f2d1cc93ce535f2c8b244591df96ceee75d8e83deb70a9cac9" [[package]] name = "windows_aarch64_msvc" -version = "0.42.2" +version = "0.52.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e08e8864a60f06ef0d0ff4ba04124db8b0fb3be5776a5cd47641e942e58c4d43" +checksum = "da9f259dd3bcf6990b55bffd094c4f7235817ba4ceebde8e6d11cd0c5633b675" [[package]] name = "windows_i686_gnu" -version = "0.42.2" +version = "0.52.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c61d927d8da41da96a81f029489353e68739737d3beca43145c8afec9a31a84f" +checksum = "b474d8268f99e0995f25b9f095bc7434632601028cf86590aea5c8a5cb7801d3" [[package]] name = "windows_i686_msvc" -version = "0.42.2" +version = "0.52.4" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "44d840b6ec649f480a41c8d80f9c65108b92d89345dd94027bfe06ac444d1060" +checksum = "1515e9a29e5bed743cb4415a9ecf5dfca648ce85ee42e15873c3cd8610ff8e02" [[package]] name = "windows_x86_64_gnu" -version = "0.42.2" +version = "0.52.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8de912b8b8feb55c064867cf047dda097f92d51efad5b491dfb98f6bbb70cb36" +checksum = "5eee091590e89cc02ad514ffe3ead9eb6b660aedca2183455434b93546371a03" [[package]] name = "windows_x86_64_gnullvm" -version = "0.42.2" +version = "0.52.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "26d41b46a36d453748aedef1486d5c7a85db22e56aff34643984ea85514e94a3" +checksum = "77ca79f2451b49fa9e2af39f0747fe999fcda4f5e241b2898624dca97a1f2177" [[package]] name = "windows_x86_64_msvc" -version = "0.42.2" +version = "0.52.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32b752e52a2da0ddfbdbcc6fceadfeede4c939ed16d13e648833a61dfb611ed8" + +[[package]] +name = "zerocopy" +version = "0.7.32" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9aec5da331524158c6d1a4ac0ab1541149c0b9505fde06423b02f5ef0106b9f0" +checksum = "74d4d3961e53fa4c9a25a8637fc2bfaf2595b3d3ae34875568a5cf64787716be" +dependencies = [ + "zerocopy-derive", +] + +[[package]] +name = "zerocopy-derive" +version = "0.7.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9ce1b18ccd8e73a9321186f97e46f9f04b778851177567b1975109d26a08d2a6" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] diff --git a/Cargo.toml b/Cargo.toml index ad43e7d..8a475cb 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -6,6 +6,7 @@ edition = "2021" # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html [dependencies] +sysinfo="0.30.7" clap = { version = "4.1.11", features = ["derive"] } flate2 = { version = "1.0", features = 
["zlib-ng"], default-features = false } serde_json = "1.0" @@ -13,4 +14,8 @@ unicode-segmentation = "1.7" rand = "0.8.4" ahash = { version = "0.8.1", features = ["runtime-rng"] } byteorder = "1" -threadpool = "1.8.1" \ No newline at end of file +threadpool = "1.8.1" +human_bytes = "0.4.3" +indicatif = "0.17.8" +glob = "0.3.1" +anyhow = "1.0.80" diff --git a/src/main.rs b/src/main.rs index 1fad7c7..b1cac9d 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,9 +1,13 @@ use ahash::RandomState; +use anyhow::{anyhow, Result}; use byteorder::{LittleEndian, NativeEndian, ReadBytesExt, WriteBytesExt}; use clap::Parser; use flate2::read::MultiGzDecoder; use flate2::write::GzEncoder; use flate2::Compression; +use glob::glob; +use human_bytes::human_bytes; +use indicatif::{ProgressBar,ProgressStyle}; use rand::Rng; use serde_json::Value; use std::clone::Clone; @@ -15,11 +19,16 @@ use std::io::{BufRead, BufReader, BufWriter, Write}; use std::mem::size_of; use std::path::PathBuf; use std::sync::atomic::{AtomicU32, Ordering}; -use std::sync::Arc; +use std::sync::{Arc, Mutex}; +use std::time::{Instant}; use std::thread::available_parallelism; +use sysinfo::{ + System, +}; use threadpool::ThreadPool; use unicode_segmentation::UnicodeSegmentation; + #[derive(Parser, Debug)] struct Args { #[arg(long)] @@ -27,9 +36,17 @@ struct Args { /// The size of the bloom filter in bytes. If the filter already exists, this parameter is /// ignored. - #[arg(long)] + /// If ==0 this _requires_ that fp_rate is > 0 + #[arg(long, default_value_t=0)] bloom_filter_size: usize, + /// The desired per-ngram false positive rate. If bloom_filter_size is not specified, this MUST + /// be specified, and the filter size will be computed using this FP rate and optimal number of + /// hashers. Maxes out at 90% of system RAM + #[arg(long, default_value_t=0.01)] + fp_rate: f64, + + /// The number of expected ngrams. This is used to calculate the optimal number of hashers. 
/// If the filter already exists, this parameter is ignored. #[arg(long)] @@ -59,6 +76,12 @@ struct Args { #[arg(long, default_value_t = false)] no_update_bloom_filter: bool, + /// Whether or not to save the bloom filter at the end. Defaults to false (i.e., saves the bloom filter) + /// If this is True, the bloom filter will NOT be saved, regardless of what no_update_bloom_filter suggests + #[arg(long, default_value_t = false)] + no_save_bloom_filter: bool, + + /// If this is true, we keep the input intact, but we add an annotation to each document that /// explains which spans from the text would have been deleted. #[arg(long, default_value_t = false)] @@ -315,6 +338,62 @@ impl BloomFilter { } } + + +fn compute_bloom_size(fp_rate: f64, expected_ngram_count: usize) -> usize { + /* Uses binary search to find optimal size of bloom filter using optimal number of hashers + and provided ngram counts + */ + // compute 90% of system ram + let mut sys = System::new_all(); + sys.refresh_all(); + + + let mut lo = 1 as usize; + let mut hi = ((sys.total_memory() as f64) * 0.9) as usize; + + // Save some time by checking endpoint first + if BloomFilter::prob_of_false_positive(hi, expected_ngram_count, + BloomFilter::optimal_number_of_hashers(hi, expected_ngram_count)) > fp_rate { + return hi; + } + + // Then do binary search to find optimal size + while lo < hi-1 { // -1 here because binsearch powers of 2 scare me + let mid = lo + (hi - lo) / 2; + let num_hashers = BloomFilter::optimal_number_of_hashers(mid, expected_ngram_count); + let computed_fp = BloomFilter::prob_of_false_positive(mid, expected_ngram_count, num_hashers) ; + if computed_fp > fp_rate { + // FP rate too high, need to go bigger + lo = mid + 1; + } else { + // FP rate too low, can make bloom filter smaller + hi = mid -1; + } + } + hi +} + + +fn expand_dirs(paths: &[PathBuf]) -> Result> { + let mut files = vec![]; + for path in paths { + if path.is_dir() { + let path_str = path + .to_str() + .ok_or_else(|| 
anyhow!("invalid path '{}'", path.to_string_lossy()))?; + for entry in glob(&format!("{}/**/*.json*.gz", path_str))? { + files.push(entry?.to_path_buf()); + } + } else { + files.push(path.clone()); + } + } + + Ok(files) +} + + #[allow(clippy::too_many_arguments)] // TODO : abstract parameters into a struct fn process_file( input_file: &PathBuf, @@ -328,6 +407,7 @@ fn process_file( annotate_attribute_only: bool, whole_document: bool, whole_paragraphs: bool, + pbar: &Arc>, ) -> Result<(), io::Error> { let input_file = OpenOptions::new() .read(true) @@ -444,33 +524,41 @@ fn process_file( serde_json::to_writer(&mut writer, &data)?; writer.write_all(b"\n")?; } - + pbar.lock().unwrap().inc(1); Ok(()) } fn main() { let args = Args::parse(); + let inputs = expand_dirs(&args.inputs).unwrap(); + println!("Parsed {:?} input files...", inputs.len()); let threads = if args.threads == 0 { available_parallelism().unwrap().get() } else { args.threads }; + + let now = Instant::now(); + let mut bloom_filter_size = args.bloom_filter_size; let bloom_filter = if args.bloom_filter_file.exists() { println!("Loading bloom filter from {:?}...", args.bloom_filter_file); BloomFilter::from_file(&args.bloom_filter_file).unwrap() } else { println!("Creating new bloom filter..."); + if args.bloom_filter_size == 0 { + bloom_filter_size = compute_bloom_size(args.fp_rate, args.expected_ngram_count); + } let num_hashers = BloomFilter::optimal_number_of_hashers( - args.bloom_filter_size, + bloom_filter_size, args.expected_ngram_count, ); - BloomFilter::new(args.bloom_filter_size, num_hashers) + BloomFilter::new(bloom_filter_size, num_hashers) }; let bloom_filter = Arc::new(bloom_filter); println!( - "Bloom filter loaded. ({} hashers)", - bloom_filter.hash_builders.len() + "\t...Bloom filter loaded. 
({} hashers) ({} seconds)", + bloom_filter.hash_builders.len(), now.elapsed().as_secs() ); let p = bloom_filter.my_prob_of_false_positive(args.expected_ngram_count); @@ -487,6 +575,8 @@ fn main() { } let suggested_size = BloomFilter::suggest_size_in_bytes(args.expected_ngram_count); + println!("Suggested size is {} | Actual size is {} ", + human_bytes(suggested_size as f64), human_bytes(bloom_filter.size_in_bytes() as f64)); if suggested_size * 2 < bloom_filter.size_in_bytes() { println!( "WARNING: Your bloom filter is more than twice as large as suggested for {} elements. \ @@ -494,15 +584,31 @@ fn main() { args.expected_ngram_count ); } + // Build Progress bar (do some hacky arc/mutex wrapping) + let num_files = inputs.len() as u64; + + let pbar = ProgressBar::new(num_files) + .with_style( + ProgressStyle::with_template( + "Files {human_pos}/{human_len} [{elapsed_precise}/{duration_precise}] [{wide_bar:.cyan/blue}]", + ).unwrap() + ); + pbar.inc(0); + //let pbar = ProgressBar::new(num_files); + let now = Instant::now(); + //pbar.set_style(ProgressStyle::with_template( + // "[{elapsed_precise}] {wide_bar:0.cyan/blue} [{pos:>7}/{len:7} {eta}]").unwrap()); + let pbar = Arc::new(Mutex::new(pbar)); let threadpool = ThreadPool::new(threads); - for input in args.inputs { + for input in inputs { let mut output = args.output_directory.clone(); output.push(input.file_name().unwrap()); let bloom_filter = bloom_filter.clone(); + let pbar = pbar.clone(); threadpool.execute(move || { - println!("Processing {input:?}..."); + //println!("Processing {input:?}..."); process_file( &input, &output, @@ -515,13 +621,14 @@ fn main() { args.annotate_attribute_only, args.whole_document, args.whole_paragraphs, + &pbar ) .unwrap(); }); } threadpool.join(); - - if !args.no_update_bloom_filter { + println!("Completed deduplication in {} seconds", now.elapsed().as_secs()); + if (!args.no_update_bloom_filter) && (!args.no_save_bloom_filter) { println!("Writing bloom filter to {:?}...", 
args.bloom_filter_file); bloom_filter.write_to_file(&args.bloom_filter_file).unwrap(); println!("Bloom filter written."); From 10b192aa7f04af2302511c921dffa4abdece7db6 Mon Sep 17 00:00:00 2001 From: Matt Jordan Date: Thu, 7 Mar 2024 15:44:44 -0800 Subject: [PATCH 04/56] Made changes requseted in PR --- src/main.rs | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/main.rs b/src/main.rs index b1cac9d..932c53c 100644 --- a/src/main.rs +++ b/src/main.rs @@ -355,11 +355,14 @@ fn compute_bloom_size(fp_rate: f64, expected_ngram_count: usize) -> usize { // Save some time by checking endpoint first if BloomFilter::prob_of_false_positive(hi, expected_ngram_count, BloomFilter::optimal_number_of_hashers(hi, expected_ngram_count)) > fp_rate { + println!( + "WARNING: To achieve desired false-positive rate, you'd need >90% of system RAM. Defaulting to 90% \ + system RAM."); return hi; } // Then do binary search to find optimal size - while lo < hi-1 { // -1 here because binsearch powers of 2 scare me + while lo < hi-1 { let mid = lo + (hi - lo) / 2; let num_hashers = BloomFilter::optimal_number_of_hashers(mid, expected_ngram_count); let computed_fp = BloomFilter::prob_of_false_positive(mid, expected_ngram_count, num_hashers) ; @@ -593,13 +596,11 @@ fn main() { "Files {human_pos}/{human_len} [{elapsed_precise}/{duration_precise}] [{wide_bar:.cyan/blue}]", ).unwrap() ); - pbar.inc(0); - //let pbar = ProgressBar::new(num_files); - let now = Instant::now(); - //pbar.set_style(ProgressStyle::with_template( - // "[{elapsed_precise}] {wide_bar:0.cyan/blue} [{pos:>7}/{len:7} {eta}]").unwrap()); + pbar.inc(0); // initalizes pbar let pbar = Arc::new(Mutex::new(pbar)); + + let now = Instant::now(); let threadpool = ThreadPool::new(threads); for input in inputs { let mut output = args.output_directory.clone(); @@ -608,7 +609,6 @@ fn main() { let pbar = pbar.clone(); threadpool.execute(move || { - //println!("Processing {input:?}..."); process_file( &input, 
&output, From a6b63fd0e6a30429bd71a54c685e2bcc2a1ddd4b Mon Sep 17 00:00:00 2001 From: Matt Jordan Date: Thu, 7 Mar 2024 16:27:11 -0800 Subject: [PATCH 05/56] Added --no-progress flag --- src/main.rs | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/src/main.rs b/src/main.rs index 932c53c..f2fce7f 100644 --- a/src/main.rs +++ b/src/main.rs @@ -105,6 +105,11 @@ struct Args { #[arg(long, default_value_t = false)] whole_paragraphs: bool, + /// If you don't want to include the progress bar, set this to true. + /// Will print out filenames as they get processed if this is true + #[arg(long, default_value_t = false)] + no_progress: bool, + /// The number of threads to use for processing. /// If this is 0, the number of threads is automatically determined. #[arg(long, short = 't', default_value_t = 0)] @@ -411,6 +416,7 @@ fn process_file( whole_document: bool, whole_paragraphs: bool, pbar: &Arc>, + no_progress: bool, ) -> Result<(), io::Error> { let input_file = OpenOptions::new() .read(true) @@ -527,7 +533,9 @@ fn process_file( serde_json::to_writer(&mut writer, &data)?; writer.write_all(b"\n")?; } - pbar.lock().unwrap().inc(1); + if !no_progress { + pbar.lock().unwrap().inc(1); + } Ok(()) } @@ -596,8 +604,10 @@ fn main() { "Files {human_pos}/{human_len} [{elapsed_precise}/{duration_precise}] [{wide_bar:.cyan/blue}]", ).unwrap() ); - pbar.inc(0); // initalizes pbar let pbar = Arc::new(Mutex::new(pbar)); + if !args.no_progress { + pbar.lock().unwrap().inc(0); // initalizes pbar + } let now = Instant::now(); @@ -609,6 +619,10 @@ fn main() { let pbar = pbar.clone(); threadpool.execute(move || { + if args.no_progress { + println!("Processing {input:?}..."); + + } process_file( &input, &output, @@ -621,7 +635,8 @@ fn main() { args.annotate_attribute_only, args.whole_document, args.whole_paragraphs, - &pbar + &pbar, + args.no_progress, ) .unwrap(); }); From a7b7ccf5f1e8add9b1af307a09dd5647c0c38442 Mon Sep 17 00:00:00 2001 From: Matt Jordan 
Date: Wed, 13 Mar 2024 12:16:42 -0700 Subject: [PATCH 06/56] Added options for no-progress (cleaner signature) | Updated README.md to describe some new features --- README.md | 18 +++++++++++++++++- src/main.rs | 25 ++++++++++++++++--------- 2 files changed, 33 insertions(+), 10 deletions(-) diff --git a/README.md b/README.md index 2e500b7..f4be835 100644 --- a/README.md +++ b/README.md @@ -44,12 +44,28 @@ target/release/bff \ --bloom-filter-size 274877906944 \ --expected-ngram-count 1000000000 \ --output-directory deduped/ \ - *.json.gz + *.json.gz # Can also pass a directory containing .json.gz files here ``` Each input file will run in its own thread, and the filter will be shared between them. In the end, as before the filter will be written to disk. +### Automatically choosing filter size +To automatically compute the size of the filter, you can instead specify a false-positive rate. `bff` will create a bloom filter to attain that false positive rate, up to 90% of the system RAM. Note that false positive rate is per token-ngram, so the chance of a whole paragraph/document being marked as a false-positive will actually be quite less than the specified `fp-rate`: + +```bash +target/release/bff \ + --bloom-filter-file filter.bff \ + --fp-rate 0.01 \ + --expected-ngram-count 1000000000 \ + --output-directory deduped/ \ + input_dir/ +``` + +Each input file will run in its own thread, and the filter will be shared between them. +In the end, as before the filter will be written to disk. 
+ + ### Pre-load the filter You can stick ngrams into the filter ahead of time, for example if you want to decontaminate your dataset: diff --git a/src/main.rs b/src/main.rs index f2fce7f..16c2b7e 100644 --- a/src/main.rs +++ b/src/main.rs @@ -415,8 +415,7 @@ fn process_file( annotate_attribute_only: bool, whole_document: bool, whole_paragraphs: bool, - pbar: &Arc>, - no_progress: bool, + pbar_option: &Option>>, ) -> Result<(), io::Error> { let input_file = OpenOptions::new() .read(true) @@ -533,9 +532,11 @@ fn process_file( serde_json::to_writer(&mut writer, &data)?; writer.write_all(b"\n")?; } - if !no_progress { - pbar.lock().unwrap().inc(1); + match pbar_option { + Some(pbar) => pbar.lock().unwrap().inc(1), + None => (), } + Ok(()) } @@ -605,23 +606,30 @@ fn main() { ).unwrap() ); let pbar = Arc::new(Mutex::new(pbar)); + + if !args.no_progress { - pbar.lock().unwrap().inc(0); // initalizes pbar + pbar.lock().unwrap().inc(0); // initializes pbar } + let now = Instant::now(); let threadpool = ThreadPool::new(threads); for input in inputs { let mut output = args.output_directory.clone(); output.push(input.file_name().unwrap()); let bloom_filter = bloom_filter.clone(); - let pbar = pbar.clone(); + + let pbar_option: Option>> = if args.no_progress { + None + } else { + Some(pbar.clone()) + }; threadpool.execute(move || { if args.no_progress { println!("Processing {input:?}..."); - } process_file( &input, @@ -635,8 +643,7 @@ fn main() { args.annotate_attribute_only, args.whole_document, args.whole_paragraphs, - &pbar, - args.no_progress, + &pbar_option, ) .unwrap(); }); From 2cc9cacafc4adc5313cdb084b616182de33663b7 Mon Sep 17 00:00:00 2001 From: Matt Jordan Date: Wed, 13 Mar 2024 12:17:35 -0700 Subject: [PATCH 07/56] Cleaned up readme duplicates --- README.md | 3 --- 1 file changed, 3 deletions(-) diff --git a/README.md b/README.md index f4be835..bf6060b 100644 --- a/README.md +++ b/README.md @@ -62,9 +62,6 @@ target/release/bff \ input_dir/ ``` -Each input file will 
run in its own thread, and the filter will be shared between them. -In the end, as before the filter will be written to disk. - ### Pre-load the filter From 5be38a8b16f563ad6133e6d77be9180519bf0eae Mon Sep 17 00:00:00 2001 From: Matt Jordan Date: Wed, 13 Mar 2024 16:30:53 -0700 Subject: [PATCH 08/56] Bash script fixed --- Cargo.toml | 4 ---- scripts/s3_script.sh | 6 ++++-- src/main.rs | 1 - 3 files changed, 4 insertions(+), 7 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 7a0f339..8a475cb 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -15,11 +15,7 @@ rand = "0.8.4" ahash = { version = "0.8.1", features = ["runtime-rng"] } byteorder = "1" threadpool = "1.8.1" -<<<<<<< HEAD -indicatif = "*" -======= human_bytes = "0.4.3" indicatif = "0.17.8" glob = "0.3.1" anyhow = "1.0.80" ->>>>>>> main diff --git a/scripts/s3_script.sh b/scripts/s3_script.sh index 798e61b..1ffee70 100644 --- a/scripts/s3_script.sh +++ b/scripts/s3_script.sh @@ -66,7 +66,9 @@ aws s3 cp $s3_input_dir $input_files --recursive # ======================================================= # Use wimbd to get total tokens -if [ "$expected_ngrams" == "-1"]; then +echo "PRENGR, $expected_ngrams" +if [ "$expected_ngrams" == "-1" ]; then + echo "IN THIS BLOCK" wimbd_stats=$(wimbd stats $input_files/*.jsonl.gz) total_tokens=$(echo "$wimbd_stats" | grep "^total tokens" | sed 's/^total tokens: //' | tr -d ',') total_documents=$(echo "$wimbd_stats" | grep "^total documents" | sed 's/^total documents: //' | tr -d ',') @@ -82,7 +84,7 @@ echo "NGRAMS $expected_ngrams" # = Actually run bff = # ====================================================== rm -f filter.bff # Always rebuilds the filter froms scratch -target/release/bff --bloom-filter-file filter.bff --bloom-filter-size $bloom_filter_size --expected-ngram-count $expected_ngrams --output-directory $output_files $input_files +target/release/bff --bloom-filter-file filter.bff --expected-ngram-count $expected_ngrams --output-directory $output_files $input_files # 
================================================== # = And then upload back to S3 = diff --git a/src/main.rs b/src/main.rs index 2ce8223..96a8270 100644 --- a/src/main.rs +++ b/src/main.rs @@ -27,7 +27,6 @@ use sysinfo::{ }; use threadpool::ThreadPool; use unicode_segmentation::UnicodeSegmentation; -use indicatif::{ProgressBar,ProgressStyle}; From 091ca4bd25ba5f3578f1a06148b7cf29e9d97394 Mon Sep 17 00:00:00 2001 From: Matt Jordan Date: Thu, 14 Mar 2024 11:18:14 -0700 Subject: [PATCH 09/56] first commit --- Cargo.toml | 4 ++ scripts/create_size_groups.py | 57 ++++++++++++++++++++++++ src/main.rs | 84 +++++++++++++++++++++++------------ 3 files changed, 116 insertions(+), 29 deletions(-) create mode 100644 scripts/create_size_groups.py diff --git a/Cargo.toml b/Cargo.toml index 8a475cb..6148cbd 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -19,3 +19,7 @@ human_bytes = "0.4.3" indicatif = "0.17.8" glob = "0.3.1" anyhow = "1.0.80" +rusoto_s3 = "0.48.0" +rusoto_core = "0.48.0" +rusoto_credential = "0.48.0" +tokio="*" \ No newline at end of file diff --git a/scripts/create_size_groups.py b/scripts/create_size_groups.py new file mode 100644 index 0000000..a6e71eb --- /dev/null +++ b/scripts/create_size_groups.py @@ -0,0 +1,57 @@ +""" +Simple adapter script that takes in the outputs of an +`aws s3 ls ...` command (that's been piped to an outfile) +and a max-file-size and groups these into files +""" + + +import argparse + + + +def create_groups(input_lines, group_size_in_bytes): + groups = [] + cur_group, cur_group_size = [], 0 + + parse_line = lambda line: line.strip().split(' ')[-2:]# outputs (size_in_bytes:string) + for line in input_lines: + next_line_bytes, next_line_file = parse_line(line) + next_line_bytes = int(next_line_bytes) + if cur_group_size + next_line_bytes > group_size_in_bytes: # make new group + if len(cur_group) > 0: + groups.append(cur_group) + cur_group_size = next_line_bytes + cur_group = [next_line_file] + else: + cur_group_size += next_line_bytes + 
cur_group.append(next_line_bytes) + + groups.append(cur_group) + return groups + + +def write_groupfile(groups, output_file): + """ Modify this to make it easy to read in rust""" + with open(output_file, 'w') as f: + for group in groups: + f.write(','.join(group) + '\n') + + +def main(input_file, output_file, group_size_in_bytes): + input_lines = open(input_file, 'r').readlines() + groups = create_groups(input_lines, group_size_in_bytes) + write_groupfile(groups, output_file) + + +if __name__ == '__main__': + + parser = argparse.ArgumentParser() + parser.add_argument('--input', type=str, required=True) + parser.add_argument('--output', type=str, required=True) + parser.add_argument('--groupsize', type=int, required=True) + + args = parser.parse_args() + main(input_file=args.input, + output=args.output, + group_size_in_bytes=args.groupsize) + diff --git a/src/main.rs b/src/main.rs index 96a8270..6267f57 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,3 +1,7 @@ +extern crate rusoto_core; +extern crate rusoto_s3; +extern crate rusoto_credential; + use ahash::RandomState; use anyhow::{anyhow, Result}; use byteorder::{LittleEndian, NativeEndian, ReadBytesExt, WriteBytesExt}; @@ -29,7 +33,9 @@ use threadpool::ThreadPool; use unicode_segmentation::UnicodeSegmentation; - +/************************************************ + * ARGUMENTS + * **********************************************/ #[derive(Parser, Debug)] struct Args { @@ -128,16 +134,11 @@ struct Args { output_directory: PathBuf, } -fn tokenize(s: &str) -> impl Iterator { - s.split_word_bounds().filter(|w| { - for c in w.chars() { - if !c.is_whitespace() { - return true; - } - } - false - }) -} + +/************************************************ + * Bloom filter stuff + * **********************************************/ + struct BloomFilter { bits: Vec, @@ -384,24 +385,6 @@ fn compute_bloom_size(fp_rate: f64, expected_ngram_count: usize) -> usize { } -fn expand_dirs(paths: &[PathBuf]) -> Result> { - let mut files = 
vec![]; - for path in paths { - if path.is_dir() { - let path_str = path - .to_str() - .ok_or_else(|| anyhow!("invalid path '{}'", path.to_string_lossy()))?; - for entry in glob(&format!("{}/**/*.json*.gz", path_str))? { - files.push(entry?.to_path_buf()); - } - } else { - files.push(path.clone()); - } - } - - Ok(files) -} - #[allow(clippy::too_many_arguments)] // TODO : abstract parameters into a struct fn process_file( @@ -540,6 +523,49 @@ fn process_file( Ok(()) } +fn tokenize(s: &str) -> impl Iterator { + s.split_word_bounds().filter(|w| { + for c in w.chars() { + if !c.is_whitespace() { + return true; + } + } + false + }) +} + + + +/************************************************ + * I/O stuff + * **********************************************/ + + + + +fn expand_dirs(paths: &[PathBuf]) -> Result> { + let mut files = vec![]; + for path in paths { + if path.is_dir() { + let path_str = path + .to_str() + .ok_or_else(|| anyhow!("invalid path '{}'", path.to_string_lossy()))?; + for entry in glob(&format!("{}/**/*.json*.gz", path_str))? 
{ + files.push(entry?.to_path_buf()); + } + } else { + files.push(path.clone()); + } + } + + Ok(files) +} + + +/************************************************ + * Main Function + * **********************************************/ + fn main() { let args = Args::parse(); let inputs = expand_dirs(&args.inputs).unwrap(); From 884018071b8c5ef3670344f6f472ef1c4be00f1d Mon Sep 17 00:00:00 2001 From: Matt Jordan Date: Thu, 14 Mar 2024 11:21:44 -0700 Subject: [PATCH 10/56] Added fp rate to scripts --- scripts/s3_script.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/s3_script.sh b/scripts/s3_script.sh index 1ffee70..d85bb74 100644 --- a/scripts/s3_script.sh +++ b/scripts/s3_script.sh @@ -84,7 +84,7 @@ echo "NGRAMS $expected_ngrams" # = Actually run bff = # ====================================================== rm -f filter.bff # Always rebuilds the filter froms scratch -target/release/bff --bloom-filter-file filter.bff --expected-ngram-count $expected_ngrams --output-directory $output_files $input_files +target/release/bff --bloom-filter-file filter.bff --expected-ngram-count $expected_ngrams --fp-rate $fp_rate --output-directory $output_files $input_files # ================================================== # = And then upload back to S3 = From 0516941938b50658eb84179b59884dbea356eb06 Mon Sep 17 00:00:00 2001 From: Matt Jordan Date: Thu, 14 Mar 2024 11:28:01 -0700 Subject: [PATCH 11/56] added no-save-bloom-filter option in bash script by default --- scripts/s3_script.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/s3_script.sh b/scripts/s3_script.sh index d85bb74..88c1768 100644 --- a/scripts/s3_script.sh +++ b/scripts/s3_script.sh @@ -84,7 +84,7 @@ echo "NGRAMS $expected_ngrams" # = Actually run bff = # ====================================================== rm -f filter.bff # Always rebuilds the filter froms scratch -target/release/bff --bloom-filter-file filter.bff --expected-ngram-count $expected_ngrams 
--fp-rate $fp_rate --output-directory $output_files $input_files +target/release/bff --bloom-filter-file filter.bff --expected-ngram-count $expected_ngrams --fp-rate $fp_rate --output-directory $output_files --no-save-bloom-filter $input_files # ================================================== # = And then upload back to S3 = From dee44c93aad8dc442c61e8a9969d8b9026050e8c Mon Sep 17 00:00:00 2001 From: Matt Jordan Date: Fri, 15 Mar 2024 11:31:09 -0700 Subject: [PATCH 12/56] Hacky AWS script seems to work okay --- Cargo.toml | 5 +- scripts/create_size_groups.py | 17 ++- scripts/s3_script.sh | 2 +- src/main.rs | 251 +++++++++++++++++++++++++++++++--- 4 files changed, 243 insertions(+), 32 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 6148cbd..6327293 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -19,7 +19,4 @@ human_bytes = "0.4.3" indicatif = "0.17.8" glob = "0.3.1" anyhow = "1.0.80" -rusoto_s3 = "0.48.0" -rusoto_core = "0.48.0" -rusoto_credential = "0.48.0" -tokio="*" \ No newline at end of file +tokio = { version = "1.36.0", features = ["full"] } diff --git a/scripts/create_size_groups.py b/scripts/create_size_groups.py index a6e71eb..d40f51a 100644 --- a/scripts/create_size_groups.py +++ b/scripts/create_size_groups.py @@ -6,16 +6,17 @@ import argparse +import re - -def create_groups(input_lines, group_size_in_bytes): +def create_groups(input_lines, group_size_in_bytes, re_suffix): groups = [] cur_group, cur_group_size = [], 0 parse_line = lambda line: line.strip().split(' ')[-2:]# outputs (size_in_bytes:string) for line in input_lines: next_line_bytes, next_line_file = parse_line(line) + if re.search(re_suffix, next_line_file) == None: continue next_line_bytes = int(next_line_bytes) if cur_group_size + next_line_bytes > group_size_in_bytes: # make new group if len(cur_group) > 0: @@ -24,7 +25,7 @@ def create_groups(input_lines, group_size_in_bytes): cur_group = [next_line_file] else: cur_group_size += next_line_bytes - cur_group.append(next_line_bytes) 
+ cur_group.append(next_line_file) groups.append(cur_group) return groups @@ -37,9 +38,9 @@ def write_groupfile(groups, output_file): f.write(','.join(group) + '\n') -def main(input_file, output_file, group_size_in_bytes): +def main(input_file, output_file, group_size_in_bytes, re_suffix): input_lines = open(input_file, 'r').readlines() - groups = create_groups(input_lines, group_size_in_bytes) + groups = create_groups(input_lines, group_size_in_bytes, re_suffix) write_groupfile(groups, output_file) @@ -49,9 +50,11 @@ def main(input_file, output_file, group_size_in_bytes): parser.add_argument('--input', type=str, required=True) parser.add_argument('--output', type=str, required=True) parser.add_argument('--groupsize', type=int, required=True) + parser.add_argument('--suffix', type=str, default=r'\.jsonl?\.gz$') args = parser.parse_args() main(input_file=args.input, - output=args.output, - group_size_in_bytes=args.groupsize) + output_file=args.output, + group_size_in_bytes=args.groupsize, + re_suffix=args.suffix) diff --git a/scripts/s3_script.sh b/scripts/s3_script.sh index 1ffee70..88c1768 100644 --- a/scripts/s3_script.sh +++ b/scripts/s3_script.sh @@ -84,7 +84,7 @@ echo "NGRAMS $expected_ngrams" # = Actually run bff = # ====================================================== rm -f filter.bff # Always rebuilds the filter froms scratch -target/release/bff --bloom-filter-file filter.bff --expected-ngram-count $expected_ngrams --output-directory $output_files $input_files +target/release/bff --bloom-filter-file filter.bff --expected-ngram-count $expected_ngrams --fp-rate $fp_rate --output-directory $output_files --no-save-bloom-filter $input_files # ================================================== # = And then upload back to S3 = diff --git a/src/main.rs b/src/main.rs index 6267f57..909e5b2 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,7 +1,3 @@ -extern crate rusoto_core; -extern crate rusoto_s3; -extern crate rusoto_credential; - use ahash::RandomState; use 
anyhow::{anyhow, Result}; use byteorder::{LittleEndian, NativeEndian, ReadBytesExt, WriteBytesExt}; @@ -16,12 +12,15 @@ use rand::Rng; use serde_json::Value; use std::clone::Clone; use std::collections::VecDeque; -use std::fs::OpenOptions; +use std::fs; +use std::fs::{OpenOptions, File}; use std::hash::{BuildHasher, Hash, Hasher}; use std::io; use std::io::{BufRead, BufReader, BufWriter, Write}; use std::mem::size_of; -use std::path::PathBuf; +use std::path::{PathBuf, Path}; +use std::process::{Command}; +use std::string::String; use std::sync::atomic::{AtomicU32, Ordering}; use std::sync::{Arc, Mutex}; use std::time::{Instant}; @@ -29,13 +28,14 @@ use std::thread::available_parallelism; use sysinfo::{ System, }; +use tokio::task::JoinSet; use threadpool::ThreadPool; use unicode_segmentation::UnicodeSegmentation; -/************************************************ - * ARGUMENTS - * **********************************************/ +/*================================================= += Arguments = +=================================================*/ #[derive(Parser, Debug)] struct Args { @@ -132,13 +132,40 @@ struct Args { /// in this directory. #[arg(long, short = 'o')] output_directory: PathBuf, -} -/************************************************ - * Bloom filter stuff - * **********************************************/ + //////////////// + // Hacky S3 stuff: + // Need: + // - groupfile + // - path within bucket + // - location of temp dir + // - output directory + /////////// + + /// Path within bucket. 
Basically the prefix of the s3 items + #[arg(long)] + path_within_bucket: String, + + /// Groupfile + #[arg(long, short='g')] + groupfile: PathBuf, + + /// Temp dir + #[arg(long)] + temp_dir: PathBuf, + + /// s3 output directory + #[arg(long)] + s3_output: PathBuf, + +} + + +/*=================================================== += Bloom Filter stuff = +===================================================*/ struct BloomFilter { bits: Vec, @@ -536,12 +563,10 @@ fn tokenize(s: &str) -> impl Iterator { -/************************************************ - * I/O stuff - * **********************************************/ - - +/*======================================================== += I/O Stuff = +========================================================*/ fn expand_dirs(paths: &[PathBuf]) -> Result> { let mut files = vec![]; @@ -557,16 +582,202 @@ fn expand_dirs(paths: &[PathBuf]) -> Result> { files.push(path.clone()); } } - Ok(files) } +fn gather_groups(path_within_bucket: String, groupfile: PathBuf) -> Vec> { + let file = File::open(groupfile).expect("Failed to open file!"); + let reader = BufReader::new(file); + + let parsed_data: Vec> = reader + .lines() + .flatten() + .map(|line| line.split(',').map(|s| + format!("{}{}", path_within_bucket, s.to_string())).collect()) + .collect(); + parsed_data +} + + +fn clear_dir(dir_path: &PathBuf) { + // Creates directory dir if it doesn't exist + // Deletes all contents from dir if it does + + // Create the directory if it doesn't exist + if !dir_path.exists() { + fs::create_dir_all(dir_path) + .expect("Failed to create directory"); + println!("Directory created: {:?}", dir_path); + } else { + // Delete all contents of the directory + remove_dir_contents(&dir_path) + .expect("Failed to remove directory contents"); + println!("Directory contents removed: {:?}", dir_path); + }} + + + +fn remove_dir_contents(dir_path_buf: &PathBuf) -> std::io::Result<()> { + let dir_path: &Path = dir_path_buf.as_ref(); + for entry in 
fs::read_dir(dir_path)? { + let entry = entry?; + let entry_path = entry.path(); + + if entry_path.is_dir() { + remove_dir_contents(&entry_path)?; + fs::remove_dir_all(entry_path)?; + } else { + fs::remove_file(entry_path)?; + } + } + + Ok(()) +} + +async fn aws_s3_cp_group(group: &Vec, output_loc: &PathBuf) { + let mut join_set = JoinSet::new(); + let output_loc_string : String = output_loc.display().to_string(); + for inner_el in group { + let child = Command::new("aws") + .arg("s3") + .arg("cp") + .arg(inner_el) + .arg(output_loc_string.clone()) + .spawn().expect("Failed to spawn!"); + join_set.spawn(async move { + child.wait_with_output().expect("Failed to finish task!"); + }); + } + + let mut completed = 0; + while completed < group.len() { + if let Some(_res) = join_set.join_next().await { + completed = completed + 1; + } + } +} + +/*============================================================= += Main function part 2 = +=============================================================*/ +#[tokio::main] +async fn main() { + let args = Args::parse(); + + // Initialize {Basic things} + let threads = if args.threads == 0 { + available_parallelism().unwrap().get() + } else { + args.threads + }; + let now = Instant::now(); + + + // Initialize groups + let groups = gather_groups(args.path_within_bucket, args.groupfile); + println!("GROUPS {:?}", groups); + // Initialize bloom filter + let mut bloom_filter_size = args.bloom_filter_size; + let bloom_filter = if args.bloom_filter_file.exists() { + println!("Loading bloom filter from {:?}...", args.bloom_filter_file); + BloomFilter::from_file(&args.bloom_filter_file).unwrap() + } else { + println!("Creating new bloom filter..."); + if args.bloom_filter_size == 0 { + bloom_filter_size = compute_bloom_size(args.fp_rate, args.expected_ngram_count); + } + let num_hashers = BloomFilter::optimal_number_of_hashers( + bloom_filter_size, + args.expected_ngram_count, + ); + BloomFilter::new(bloom_filter_size, num_hashers) + }; + let 
bloom_filter = Arc::new(bloom_filter); + println!( + "\t...Bloom filter loaded. ({} hashers) ({} seconds)", + bloom_filter.hash_builders.len(), now.elapsed().as_secs() + ); + + let p = bloom_filter.my_prob_of_false_positive(args.expected_ngram_count); + if p >= 0.5 { + println!( + "WARNING: Probability of a false positive after {} elements is {}.", + args.expected_ngram_count, p + ); + } else { + println!( + "Probability of a false positive after {} elements: {}", + args.expected_ngram_count, p + ); + } + + println!("Bloom filter size is {} ", human_bytes(bloom_filter.size_in_bytes() as f64)); + + + // Then loop over groups + for group in &groups { + println!("WORKING ON GROUP {:?}", group); + // First clean up the temp directory and download the group of s3 files + clear_dir(&args.temp_dir); + let temp_input_dir = args.temp_dir.join("input/"); + let temp_output_dir = args.temp_dir.join("output/"); + match fs::create_dir(&temp_output_dir) { + Ok(_) => (), + Err(err) => eprintln!("Error creating directory: {}",err), + } + aws_s3_cp_group(&group, &temp_input_dir).await; + // Then loop over all the files and wait until done processing + let threadpool = ThreadPool::new(threads); + if let Ok(inputs) = fs::read_dir(temp_input_dir) { + for input in inputs { + if let Ok(input) = input { + let path_buf: PathBuf = input.path(); + let mut output = temp_output_dir.clone(); + output.push(input.file_name()); + println!("I/O {:?}, {:?}", path_buf, output); + + process_file( + &path_buf, + &output, + &bloom_filter, + args.max_ngram_size, + args.min_ngram_size, + !args.no_update_bloom_filter, + args.filtering_threshold, + args.annotate_only, + args.annotate_attribute_only, + args.whole_document, + args.whole_paragraphs, + &None, + ) + .unwrap(); + } + } + } + threadpool.join(); + let to_upload_files = expand_dirs(&[args.temp_dir.join("output")]).unwrap(); + println!("TO UPLOAD, {:?}", to_upload_files); + let to_upload_files = to_upload_files.into_iter().map(|s| 
s.display().to_string()).collect(); + aws_s3_cp_group(&to_upload_files, &args.s3_output).await; + println!("FINISHED ONE LOOP!"); + + } + + // And finally upload the temp directory to s3 + + +} + + + + + /************************************************ * Main Function * **********************************************/ -fn main() { +fn main_dep() { let args = Args::parse(); let inputs = expand_dirs(&args.inputs).unwrap(); println!("Parsed {:?} input files...", inputs.len()); From 3832421e6992a1e3b7971a479dfe511773709c95 Mon Sep 17 00:00:00 2001 From: Matt Jordan Date: Sat, 16 Mar 2024 18:38:16 -0700 Subject: [PATCH 13/56] s3 stream stuff added --- Cargo.toml | 4 +- src/main.rs | 257 ++++++++++++++++++++++++++++++++++++++++------------ 2 files changed, 204 insertions(+), 57 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 6327293..a580420 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -8,7 +8,7 @@ edition = "2021" [dependencies] sysinfo="0.30.7" clap = { version = "4.1.11", features = ["derive"] } -flate2 = { version = "1.0", features = ["zlib-ng"], default-features = false } +flate2 = { version = "1.0.28", features = ["zlib-ng"], default-features = false } serde_json = "1.0" unicode-segmentation = "1.7" rand = "0.8.4" @@ -20,3 +20,5 @@ indicatif = "0.17.8" glob = "0.3.1" anyhow = "1.0.80" tokio = { version = "1.36.0", features = ["full"] } +aws-config = "1.1.8" +aws-sdk-s3 = "1.19.1" \ No newline at end of file diff --git a/src/main.rs b/src/main.rs index 909e5b2..369a4b3 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,5 +1,5 @@ use ahash::RandomState; -use anyhow::{anyhow, Result}; +use anyhow::{anyhow, Result, Error}; use byteorder::{LittleEndian, NativeEndian, ReadBytesExt, WriteBytesExt}; use clap::Parser; use flate2::read::MultiGzDecoder; @@ -16,6 +16,7 @@ use std::fs; use std::fs::{OpenOptions, File}; use std::hash::{BuildHasher, Hash, Hasher}; use std::io; +use std::io::{Cursor, Read}; use std::io::{BufRead, BufReader, BufWriter, Write}; use 
std::mem::size_of; use std::path::{PathBuf, Path}; @@ -31,6 +32,10 @@ use sysinfo::{ use tokio::task::JoinSet; use threadpool::ThreadPool; use unicode_segmentation::UnicodeSegmentation; +use aws_config::meta::region::RegionProviderChain; +use aws_config::BehaviorVersion; +use aws_sdk_s3::{Client}; +use aws_sdk_s3::primitives::ByteStream; /*================================================= @@ -137,7 +142,8 @@ struct Args { //////////////// // Hacky S3 stuff: // Need: - // - groupfile + // - list of input_output file maps + // // - path within bucket // - location of temp dir // - output directory @@ -146,20 +152,7 @@ struct Args { /// Path within bucket. Basically the prefix of the s3 items #[arg(long)] - path_within_bucket: String, - - /// Groupfile - #[arg(long, short='g')] - groupfile: PathBuf, - - /// Temp dir - #[arg(long)] - temp_dir: PathBuf, - - /// s3 output directory - #[arg(long)] - s3_output: PathBuf, - + s3_io: PathBuf } @@ -550,6 +543,171 @@ fn process_file( Ok(()) } + + +#[allow(clippy::too_many_arguments)] // TODO : abstract parameters into a struct +async fn process_file_s3( + s3_bucket: &String, + s3_input: &String, + s3_output: &String, + bloom_filter: &Arc, + max_ngram_size: usize, + min_ngram_size: usize, + update_bloom_filter: bool, + filtering_threshold: f64, + annotate_only: bool, + annotate_attribute_only: bool, + whole_document: bool, + whole_paragraphs: bool, +) -> Result<(), Error> { + + + // Phase 1a: Build s3 client + let region_provider = RegionProviderChain::first_try("us-west-2"); + let config = aws_config::defaults(BehaviorVersion::latest()) + .region(region_provider) + .load() + .await; + let client = Client::new(&config); + + + // Phase 1b: read data into lines + println!("S3 INPUT {} {}", s3_bucket, s3_input); + let object = client + .get_object() + .bucket(s3_bucket) + .key(s3_input) + .send() + .await?; + let data = object.body.collect().await?; + let data = data.into_bytes(); + let mut gz = MultiGzDecoder::new(&data[..]); + 
let mut input_string = String::new(); + gz.read_to_string(&mut input_string)?; + + // Phase 1c: Setup output buffer to upload->s3 eventually... + let mut output_data = Vec::new(); + let mut writer = GzEncoder::new(Cursor::new(&mut output_data), Compression::default()); + let mut count = 0; + + for line in input_string.lines() { + count += 1; + continue; + let line = line; + let mut data: Value = serde_json::from_str(&line).unwrap(); + let text = data["text"].as_str().unwrap(); + + let newlines = if whole_document { + vec![0, text.len()] + } else { + let mut newlines = Vec::new(); + newlines.push(0); + for i in text.match_indices('\n') { + newlines.push(i.0); + } + newlines.push(text.len()); + newlines + }; + let mut windows_to_remove = Vec::new(); + let mut total_contained_ngrams = 0; + + for paragraph_window in newlines.windows(2) { + let paragraph = &text[paragraph_window[0]..paragraph_window[1]]; + + // calculate hashes for the paragraph + let mut hashes: Vec> = Vec::new(); + let mut ngram: VecDeque<&str> = VecDeque::with_capacity(max_ngram_size); + for token in tokenize(paragraph) { + ngram.push_back(token); + // If not hashing whole paragraphs, add ngrams to the bloom filter as they reach max size + if !whole_paragraphs && ngram.len() >= max_ngram_size { + hashes.push(bloom_filter.hashes(&ngram)); + ngram.pop_front(); + } + } + // If the paragraph was too short, put in a shorter ngram, so we can dedupe short + // paragraphs exactly. 
+ if hashes.is_empty() && ngram.len() >= min_ngram_size { + hashes.push(bloom_filter.hashes(&ngram)); + } + + let contained_ngrams = hashes + .iter() + .filter(|ngram| bloom_filter.contains_hashes(ngram)) + .count(); + total_contained_ngrams += contained_ngrams; + + // calculate how many ngrams are in the bloom filter + let number_of_ngrams = hashes.len(); + + // produce output + let too_many_duplicate_ngrams = + contained_ngrams as f64 / number_of_ngrams as f64 > filtering_threshold; + if too_many_duplicate_ngrams { + windows_to_remove.push(paragraph_window); + } else if update_bloom_filter { + for ngram in hashes { + bloom_filter.insert_hashes(&ngram); + } + } + } + + // if annotate_attribute_only or annotate_only, add the annotation to the json + if annotate_attribute_only || annotate_only { + data["bff_duplicate_spans"] = serde_json::to_value(windows_to_remove).unwrap(); + data["bff_contained_ngram_count"] = + serde_json::to_value(total_contained_ngrams).unwrap(); + } else { + let mut output_paragraphs = String::new(); + let mut last_end = 0; + for paragraph_window in windows_to_remove { + output_paragraphs.push_str(&text[last_end..paragraph_window[0]]); + last_end = paragraph_window[1]; + } + output_paragraphs.push_str(&text[last_end..]); + data["text"] = Value::String(output_paragraphs); + data["bff_contained_ngram_count_before_dedupe"] = + serde_json::to_value(total_contained_ngrams).unwrap(); + } + + if annotate_attribute_only { + // Allowed fields + let allowed_fields = [ + "bff_duplicate_spans", + "bff_contained_ngram_count", + "id", + "source", + ]; + + // Iterate through the keys of the JSON object and remove any field that is not in the allowed_fields list + if let Value::Object(ref mut map) = data { + map.retain(|key, _| allowed_fields.contains(&key.as_str())); + } + + } + + serde_json::to_writer(&mut writer, &data)?; + writer.write_all(b"\n")?; + } + println!("COUNTED {} LINES", count); + return Ok(()); + // to finalize, write to s3 + 
writer.finish().unwrap(); + let bytes_to_upload = ByteStream::from(output_data); + client + .put_object() + .bucket(s3_bucket) + .key(s3_output) + .body(bytes_to_upload) + .send() + .await?; + println!("COUNT WORKS {}", count); + Ok(()) +} + + + + fn tokenize(s: &str) -> impl Iterator { s.split_word_bounds().filter(|w| { for c in w.chars() { @@ -658,6 +816,15 @@ async fn aws_s3_cp_group(group: &Vec, output_loc: &PathBuf) { } } +fn split_bucket_path(uri: &str) -> Option<(String, String)> { + + if let Some((bucket, path)) = uri.split_once('/') { + Some((bucket.to_string(), path.to_string())) + } else { + None + } +} + /*============================================================= = Main function part 2 = =============================================================*/ @@ -674,9 +841,7 @@ async fn main() { let now = Instant::now(); - // Initialize groups - let groups = gather_groups(args.path_within_bucket, args.groupfile); - println!("GROUPS {:?}", groups); + // Initialize bloom filter let mut bloom_filter_size = args.bloom_filter_size; let bloom_filter = if args.bloom_filter_file.exists() { @@ -715,31 +880,20 @@ async fn main() { println!("Bloom filter size is {} ", human_bytes(bloom_filter.size_in_bytes() as f64)); - // Then loop over groups - for group in &groups { - println!("WORKING ON GROUP {:?}", group); - // First clean up the temp directory and download the group of s3 files - clear_dir(&args.temp_dir); - let temp_input_dir = args.temp_dir.join("input/"); - let temp_output_dir = args.temp_dir.join("output/"); - match fs::create_dir(&temp_output_dir) { - Ok(_) => (), - Err(err) => eprintln!("Error creating directory: {}",err), - } - aws_s3_cp_group(&group, &temp_input_dir).await; - // Then loop over all the files and wait until done processing - let threadpool = ThreadPool::new(threads); - if let Ok(inputs) = fs::read_dir(temp_input_dir) { - for input in inputs { - if let Ok(input) = input { - let path_buf: PathBuf = input.path(); - let mut output = 
temp_output_dir.clone(); - output.push(input.file_name()); - println!("I/O {:?}, {:?}", path_buf, output); - - process_file( - &path_buf, - &output, + + let io_file = File::open(args.s3_io).expect("Failed to open io file"); + let reader = BufReader::new(io_file); + for line in reader.lines() { + if let Ok(line) = line { + let parts: Vec<&str> = line.split(',').collect(); + let input_file = parts[0].replace("s3://", ""); + let output_file = parts[1].replace("s3://", ""); + println!("{} {}", input_file, output_file); + let (bucket, input_path) = split_bucket_path(&input_file).unwrap(); + let (_, output_path) = split_bucket_path(&output_file).unwrap(); + + process_file_s3(&bucket, &input_path, + &output_path, &bloom_filter, args.max_ngram_size, args.min_ngram_size, @@ -749,21 +903,12 @@ async fn main() { args.annotate_attribute_only, args.whole_document, args.whole_paragraphs, - &None, ) - .unwrap(); - } - } - } - threadpool.join(); - let to_upload_files = expand_dirs(&[args.temp_dir.join("output")]).unwrap(); - println!("TO UPLOAD, {:?}", to_upload_files); - let to_upload_files = to_upload_files.into_iter().map(|s| s.display().to_string()).collect(); - aws_s3_cp_group(&to_upload_files, &args.s3_output).await; - println!("FINISHED ONE LOOP!"); - + .await.unwrap(); + } } + // And finally upload the temp directory to s3 From b886faa0cafaad7d17d6d801c09a21bb91a9e766 Mon Sep 17 00:00:00 2001 From: Matt Jordan Date: Sat, 16 Mar 2024 18:57:34 -0700 Subject: [PATCH 14/56] Oops, forgot to add threading --- src/main.rs | 43 +++++++++++++++++++++++++++++++------------ 1 file changed, 31 insertions(+), 12 deletions(-) diff --git a/src/main.rs b/src/main.rs index 369a4b3..b53966b 100644 --- a/src/main.rs +++ b/src/main.rs @@ -883,16 +883,27 @@ async fn main() { let io_file = File::open(args.s3_io).expect("Failed to open io file"); let reader = BufReader::new(io_file); + let threadpool = ThreadPool::new(threads); + for line in reader.lines() { - if let Ok(line) = line { - let 
parts: Vec<&str> = line.split(',').collect(); - let input_file = parts[0].replace("s3://", ""); - let output_file = parts[1].replace("s3://", ""); - println!("{} {}", input_file, output_file); - let (bucket, input_path) = split_bucket_path(&input_file).unwrap(); - let (_, output_path) = split_bucket_path(&output_file).unwrap(); - - process_file_s3(&bucket, &input_path, + let bloom_filter = bloom_filter.clone(); + + threadpool.execute(move || { + + + + + if let Ok(line) = line { + let parts: Vec<&str> = line.split(',').collect(); + let input_file = parts[0].replace("s3://", ""); + let output_file = parts[1].replace("s3://", ""); + println!("{} {}", input_file, output_file); + let (bucket, input_path) = split_bucket_path(&input_file).unwrap(); + let (_, output_path) = split_bucket_path(&output_file).unwrap(); + + let rt = tokio::runtime::Runtime::new().unwrap(); + let res = rt.block_on( + process_file_s3(&bucket, &input_path, &output_path, &bloom_filter, args.max_ngram_size, @@ -903,10 +914,18 @@ async fn main() { args.annotate_attribute_only, args.whole_document, args.whole_paragraphs, - ) - .await.unwrap(); - } + ) + ); + } + }); + + + + + } + threadpool.join(); + // And finally upload the temp directory to s3 From 9eb69b0bb45300ecbe39bd31a91a546877726030 Mon Sep 17 00:00:00 2001 From: Matt Jordan Date: Mon, 18 Mar 2024 15:00:14 -0700 Subject: [PATCH 15/56] Subcommands for local + s3 stuff built --- Cargo.toml | 2 +- src/main.rs | 970 +++++++++++++++++++++------------------------------- 2 files changed, 399 insertions(+), 573 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index a580420..f7387c0 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -21,4 +21,4 @@ glob = "0.3.1" anyhow = "1.0.80" tokio = { version = "1.36.0", features = ["full"] } aws-config = "1.1.8" -aws-sdk-s3 = "1.19.1" \ No newline at end of file +aws-sdk-s3 = "1.19.1" diff --git a/src/main.rs b/src/main.rs index b53966b..75dd7fe 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,7 +1,7 @@ use 
ahash::RandomState; use anyhow::{anyhow, Result, Error}; use byteorder::{LittleEndian, NativeEndian, ReadBytesExt, WriteBytesExt}; -use clap::Parser; +use clap::{Args, Parser, Subcommand}; use flate2::read::MultiGzDecoder; use flate2::write::GzEncoder; use flate2::Compression; @@ -12,15 +12,13 @@ use rand::Rng; use serde_json::Value; use std::clone::Clone; use std::collections::VecDeque; -use std::fs; use std::fs::{OpenOptions, File}; use std::hash::{BuildHasher, Hash, Hasher}; use std::io; use std::io::{Cursor, Read}; use std::io::{BufRead, BufReader, BufWriter, Write}; use std::mem::size_of; -use std::path::{PathBuf, Path}; -use std::process::{Command}; +use std::path::{PathBuf}; use std::string::String; use std::sync::atomic::{AtomicU32, Ordering}; use std::sync::{Arc, Mutex}; @@ -29,7 +27,6 @@ use std::thread::available_parallelism; use sysinfo::{ System, }; -use tokio::task::JoinSet; use threadpool::ThreadPool; use unicode_segmentation::UnicodeSegmentation; use aws_config::meta::region::RegionProviderChain; @@ -38,124 +35,132 @@ use aws_sdk_s3::{Client}; use aws_sdk_s3::primitives::ByteStream; -/*================================================= -= Arguments = -=================================================*/ -#[derive(Parser, Debug)] -struct Args { - #[arg(long)] - bloom_filter_file: PathBuf, +/*======================================================= += Argument Struct = +=======================================================*/ - /// The size of the bloom filter in bytes. If the filter already exists, this parameter is - /// ignored. - /// If ==0 this _requires_ that fp_rate is > 0 - #[arg(long, default_value_t=0)] - bloom_filter_size: usize, +#[derive(Parser)] +#[clap(author, version, about, long_about = None)] +struct ArgParser { + #[clap(subcommand)] + command: Commands, +} - /// The desired per-ngram false positive rate. 
If bloom_filter_size is not specified, this MUST - /// be specified, and the filter size will be computed using this FP rate and optimal number of - /// hashers. Maxes out at 90% of system RAM - #[arg(long, default_value_t=0.01)] - fp_rate: f64, +#[derive(Debug, Clone, Args)] +struct BffArgs{ + /* + -- BLOOM FILTER KWARGS + + bloom_filter_location: where we save/load the bloom filter + + expected_ngram_count: how many ngrams we're expecting + + fp_rate: false positive (per ngram) we're expecting + -- BLOOM FILTER HYPERPARAMS + + min_ngram_size (default: 5), smallest ngram size to consider + + max_ngram_size (default: 13), largest ngram size to consider + + filtering_threshold (default 0.80), threshold used to determine if text is duplicate + + -- BLOOM FILTER OVERRIDE KWARGS: + + bloom_filter_size (default: 0), if >0 we force the filter to have this size + + no_update_bloom_filter (default: false), if true, we never update the bloom filter + + no_save_bloom_filter (default: false), if true, we don't save the bloom filter at the end + + annotate_only (default: false), if true we leave text intact but annotate with which spans are duplicates + + whole_document (default: false), if true, we dedup across the whole document (spanning pargaraphs) + + whole_paragraph (default: false), if true, we don't match ngrams but rather whole paragraphs + + no_progress (default: false), if true, we don't display a progress bar, instead printing out files as we handle them + + threads: (default: 0), if > 0, we force use of this many threads, o/w it's automatically computed + */ + + // Bloom filter kwargs + #[arg(required = true, long)] + bloom_filter_file: PathBuf, - /// The number of expected ngrams. This is used to calculate the optimal number of hashers. - /// If the filter already exists, this parameter is ignored. - #[arg(long)] - expected_ngram_count: usize, + #[arg(required = true, long)] + expected_ngram_count: usize, - /// The smallest ngram size to consider. 
Paragraphs that have fewer than this number of tokens - /// are not deduplicated and always kept. These ngrams are never added to the bloom filter. - /// Note that this value only matters if the paragraph has fewer tokens than the max ngram size. + #[arg(required = true, long)] + fp_rate: f64, + + // Bloom filter hyperparams #[arg(long, default_value_t = 5)] - min_ngram_size: usize, + min_ngram_size: usize, - /// The largest ngram size to consider. Paragraphs are deduplicated based on the number of - /// ngrams of this size that are already present in the bloom filter. #[arg(long, default_value_t = 13)] - max_ngram_size: usize, + max_ngram_size: usize, - /// If this fraction of ngrams of the max ngram size are already present in the bloom filter, - /// the paragraph is considered a duplicate and is discarded. - /// Set this to 0 to never produce any output. This is useful when you want to prime the filter - /// with some content that should be considered duplicates, without deduplicating that content - /// itself. #[arg(long, default_value_t = 0.80)] filtering_threshold: f64, - /// Whether or not to update the bloom filter. If this is true, the filter is not updated, but - /// the input is still deduplicated based on the filter. Default is false. - #[arg(long, default_value_t = false)] - no_update_bloom_filter: bool, + // Bloom filter override args + #[arg(long, default_value_t=0)] + bloom_filter_size: usize, - /// Whether or not to save the bloom filter at the end. Defaults to false (i.e., saves the bloom filter) - /// If this is True, the bloom filter will NOT be saved, regardless of what no_update_bloom_filter suggests #[arg(long, default_value_t = false)] - no_save_bloom_filter: bool, - + no_update_bloom_filter: bool, - /// If this is true, we keep the input intact, but we add an annotation to each document that - /// explains which spans from the text would have been deleted. 
#[arg(long, default_value_t = false)] - annotate_only: bool, + no_save_bloom_filter: bool, - /// If this is true, we only write out document id and source, and annotate which spans would - /// have been deleted. This produces an attribute file per the llm-data specification. #[arg(long, default_value_t = false)] - annotate_attribute_only: bool, + annotate_attribute_only: bool, - /// If you want ngrams to span across paragraph breaks, set this to true. - /// This also means that bff will only remove a complete document at a time. When this happens - /// the resulting document will be empty. This also means that deduplication within a document - /// no longer works. All in all, it might be best to only use this when you're also using - /// --annotate-only. #[arg(long, default_value_t = false)] - whole_document: bool, + whole_document: bool, - /// If you want to always match whole paragraphs instead of ngrams, set this to true. - /// Paragraphs smaller than min_ngram_size will still be excluded. #[arg(long, default_value_t = false)] whole_paragraphs: bool, - /// If you don't want to include the progress bar, set this to true. - /// Will print out filenames as they get processed if this is true #[arg(long, default_value_t = false)] no_progress: bool, - /// The number of threads to use for processing. - /// If this is 0, the number of threads is automatically determined. #[arg(long, short = 't', default_value_t = 0)] - threads: usize, + threads: usize, +} + + + +#[derive(Subcommand, Debug)] +enum Commands { + /* Two commands here: + - `bff` is for LOCAL files (local in -> local out) + - `bff_s3` is for S3 files (S3 in -> S3 out) + Where each takes default arguments of: - /// Input files. These are expected to be gzip compressed newline-delimited JSON files with a - /// "text" field. - #[arg(index = 1)] - inputs: Vec, - /// Output directory. The output files will have the same name as the input files, but be placed - /// in this directory. 
- #[arg(long, short = 'o')] - output_directory: PathBuf, + And then subcommand arguments + -- bff: + + inputs: file or files (directories okay) of gzip compressed newline-delimited JSON files with a 'text' field + + output_directory: where the deduplicated files get loaded to + + -- bff_s3: + + s3_io_file: file where each line is like "s3://bucket/path/to/input.jsonl.gz,s3://bucket/path/to/output.jsonl.gz" + which explicitly lists all input files and where they should go once deduped + */ + + #[clap(arg_required_else_help = true)] + Bff { + // subcommand arguments + #[arg(required=true, long)] + inputs: Vec, + #[arg(required=true, long)] + output_directory: PathBuf, - //////////////// - // Hacky S3 stuff: - // Need: - // - list of input_output file maps - // - // - path within bucket - // - location of temp dir - // - output directory - /////////// + #[command(flatten)] + bff_args: BffArgs, + }, + BffRemote { + #[arg(required=true, long)] + s3io: PathBuf, - /// Path within bucket. Basically the prefix of the s3 items - #[arg(long)] - s3_io: PathBuf + #[command(flatten)] + bff_args: BffArgs + } } + /*=================================================== = Bloom Filter stuff = ===================================================*/ @@ -188,19 +193,6 @@ impl BloomFilter { (1.0 - (1.0 - (1.0 / m)).powf(k * n)).powf(k) } - fn suggest_size_in_bytes(expected_elements: usize) -> usize { - let mut size_in_bytes = 1024 * 1024; - while size_in_bytes < usize::MAX / 2 - && Self::prob_of_false_positive( - size_in_bytes, - expected_elements, - Self::optimal_number_of_hashers(size_in_bytes, expected_elements), - ) > 0.01 - { - size_in_bytes *= 2; - } - size_in_bytes - } fn my_prob_of_false_positive(&self, expected_elements: usize) -> f64 { Self::prob_of_false_positive( @@ -227,7 +219,7 @@ impl BloomFilter { } let number_of_u32 = size_in_bytes / size_of::(); - let mut bits = { + let bits = { (0..number_of_u32).map(|_| AtomicU32::default()).collect() }; @@ -363,10 +355,45 @@ impl 
BloomFilter { let hashes = self.hashes(s); self.contains_hashes(&hashes) } + + + fn from_args(bff_args: &BffArgs) -> Self { + /* Uses a BFFArgs object to build a bloom filter + Logic: + - Check if file exists, if so, just load it and return + - Get size: + + if size is explicitly speciifed, use this + + otherwise, compute based on ngrams + fp rate + - Return + */ + let mut bloom_filter_size = bff_args.bloom_filter_size; + + let bloom_filter = if bff_args.bloom_filter_file.exists() { + println!("Loading bloom filter from {:?}...", bff_args.bloom_filter_file); + BloomFilter::from_file(&bff_args.bloom_filter_file).unwrap() + } else { + println!("Creating new bloom filter..."); + if bff_args.bloom_filter_size == 0 { + bloom_filter_size = compute_bloom_size(bff_args.fp_rate, bff_args.expected_ngram_count); + } + let num_hashers = BloomFilter::optimal_number_of_hashers( + bloom_filter_size, + bff_args.expected_ngram_count, + ); + BloomFilter::new(bloom_filter_size, num_hashers) + }; + + + println!("Bloom filter has size {} | FP Rate {:?}", + human_bytes(bloom_filter.size_in_bytes() as f64), + bloom_filter.my_prob_of_false_positive(bff_args.expected_ngram_count)); + bloom_filter + } } + fn compute_bloom_size(fp_rate: f64, expected_ngram_count: usize) -> usize { /* Uses binary search to find optimal size of bloom filter using optimal number of hashers and provided ngram counts @@ -405,22 +432,16 @@ fn compute_bloom_size(fp_rate: f64, expected_ngram_count: usize) -> usize { } - -#[allow(clippy::too_many_arguments)] // TODO : abstract parameters into a struct +#[allow(clippy::too_many_arguments)] fn process_file( input_file: &PathBuf, output_file: &PathBuf, bloom_filter: &Arc, - max_ngram_size: usize, - min_ngram_size: usize, - update_bloom_filter: bool, - filtering_threshold: f64, - annotate_only: bool, - annotate_attribute_only: bool, - whole_document: bool, - whole_paragraphs: bool, + bff_args: &BffArgs, pbar_option: &Option>>, ) -> Result<(), io::Error> { + + // Setup 
input/output writers let input_file = OpenOptions::new() .read(true) .write(false) @@ -439,101 +460,11 @@ fn process_file( GzEncoder::new(output_file, Compression::default()), ); - for line in reader.lines() { - let line = line.unwrap(); - let mut data: Value = serde_json::from_str(&line).unwrap(); - let text = data["text"].as_str().unwrap(); - - let newlines = if whole_document { - vec![0, text.len()] - } else { - let mut newlines = Vec::new(); - newlines.push(0); - for i in text.match_indices('\n') { - newlines.push(i.0); - } - newlines.push(text.len()); - newlines - }; - let mut windows_to_remove = Vec::new(); - let mut total_contained_ngrams = 0; - - for paragraph_window in newlines.windows(2) { - let paragraph = &text[paragraph_window[0]..paragraph_window[1]]; - - // calculate hashes for the paragraph - let mut hashes: Vec> = Vec::new(); - let mut ngram: VecDeque<&str> = VecDeque::with_capacity(max_ngram_size); - for token in tokenize(paragraph) { - ngram.push_back(token); - // If not hashing whole paragraphs, add ngrams to the bloom filter as they reach max size - if !whole_paragraphs && ngram.len() >= max_ngram_size { - hashes.push(bloom_filter.hashes(&ngram)); - ngram.pop_front(); - } - } - // If the paragraph was too short, put in a shorter ngram, so we can dedupe short - // paragraphs exactly. 
- if hashes.is_empty() && ngram.len() >= min_ngram_size { - hashes.push(bloom_filter.hashes(&ngram)); - } - - let contained_ngrams = hashes - .iter() - .filter(|ngram| bloom_filter.contains_hashes(ngram)) - .count(); - total_contained_ngrams += contained_ngrams; - - // calculate how many ngrams are in the bloom filter - let number_of_ngrams = hashes.len(); - - // produce output - let too_many_duplicate_ngrams = - contained_ngrams as f64 / number_of_ngrams as f64 > filtering_threshold; - if too_many_duplicate_ngrams { - windows_to_remove.push(paragraph_window); - } else if update_bloom_filter { - for ngram in hashes { - bloom_filter.insert_hashes(&ngram); - } - } - } - // if annotate_attribute_only or annotate_only, add the annotation to the json - if annotate_attribute_only || annotate_only { - data["bff_duplicate_spans"] = serde_json::to_value(windows_to_remove).unwrap(); - data["bff_contained_ngram_count"] = - serde_json::to_value(total_contained_ngrams).unwrap(); - } else { - let mut output_paragraphs = String::new(); - let mut last_end = 0; - for paragraph_window in windows_to_remove { - output_paragraphs.push_str(&text[last_end..paragraph_window[0]]); - last_end = paragraph_window[1]; - } - output_paragraphs.push_str(&text[last_end..]); - data["text"] = Value::String(output_paragraphs); - data["bff_contained_ngram_count_before_dedupe"] = - serde_json::to_value(total_contained_ngrams).unwrap(); - } - - if annotate_attribute_only { - // Allowed fields - let allowed_fields = [ - "bff_duplicate_spans", - "bff_contained_ngram_count", - "id", - "source", - ]; - - // Iterate through the keys of the JSON object and remove any field that is not in the allowed_fields list - if let Value::Object(ref mut map) = data { - map.retain(|key, _| allowed_fields.contains(&key.as_str())); - } - - } - - serde_json::to_writer(&mut writer, &data)?; + // Loop over lines and do BFF stuff + for line in reader.lines() { + let dedup_data = process_line(&line.unwrap(), &bloom_filter, 
&bff_args); + serde_json::to_writer(&mut writer, &dedup_data)?; writer.write_all(b"\n")?; } match pbar_option { @@ -545,20 +476,14 @@ fn process_file( -#[allow(clippy::too_many_arguments)] // TODO : abstract parameters into a struct + async fn process_file_s3( s3_bucket: &String, s3_input: &String, s3_output: &String, bloom_filter: &Arc, - max_ngram_size: usize, - min_ngram_size: usize, - update_bloom_filter: bool, - filtering_threshold: f64, - annotate_only: bool, - annotate_attribute_only: bool, - whole_document: bool, - whole_paragraphs: bool, + bff_args: &BffArgs, + pbar_option: &Option>>, ) -> Result<(), Error> { @@ -572,125 +497,30 @@ async fn process_file_s3( // Phase 1b: read data into lines - println!("S3 INPUT {} {}", s3_bucket, s3_input); let object = client .get_object() .bucket(s3_bucket) .key(s3_input) .send() .await?; - let data = object.body.collect().await?; - let data = data.into_bytes(); - let mut gz = MultiGzDecoder::new(&data[..]); + let s3_data = object.body.collect().await?; + let s3_data = s3_data.into_bytes(); + let mut gz = MultiGzDecoder::new(&s3_data[..]); let mut input_string = String::new(); gz.read_to_string(&mut input_string)?; // Phase 1c: Setup output buffer to upload->s3 eventually... 
let mut output_data = Vec::new(); let mut writer = GzEncoder::new(Cursor::new(&mut output_data), Compression::default()); - let mut count = 0; + let mut count = 0; for line in input_string.lines() { count += 1; - continue; - let line = line; - let mut data: Value = serde_json::from_str(&line).unwrap(); - let text = data["text"].as_str().unwrap(); - - let newlines = if whole_document { - vec![0, text.len()] - } else { - let mut newlines = Vec::new(); - newlines.push(0); - for i in text.match_indices('\n') { - newlines.push(i.0); - } - newlines.push(text.len()); - newlines - }; - let mut windows_to_remove = Vec::new(); - let mut total_contained_ngrams = 0; - - for paragraph_window in newlines.windows(2) { - let paragraph = &text[paragraph_window[0]..paragraph_window[1]]; - - // calculate hashes for the paragraph - let mut hashes: Vec> = Vec::new(); - let mut ngram: VecDeque<&str> = VecDeque::with_capacity(max_ngram_size); - for token in tokenize(paragraph) { - ngram.push_back(token); - // If not hashing whole paragraphs, add ngrams to the bloom filter as they reach max size - if !whole_paragraphs && ngram.len() >= max_ngram_size { - hashes.push(bloom_filter.hashes(&ngram)); - ngram.pop_front(); - } - } - // If the paragraph was too short, put in a shorter ngram, so we can dedupe short - // paragraphs exactly. 
- if hashes.is_empty() && ngram.len() >= min_ngram_size { - hashes.push(bloom_filter.hashes(&ngram)); - } - - let contained_ngrams = hashes - .iter() - .filter(|ngram| bloom_filter.contains_hashes(ngram)) - .count(); - total_contained_ngrams += contained_ngrams; - - // calculate how many ngrams are in the bloom filter - let number_of_ngrams = hashes.len(); - - // produce output - let too_many_duplicate_ngrams = - contained_ngrams as f64 / number_of_ngrams as f64 > filtering_threshold; - if too_many_duplicate_ngrams { - windows_to_remove.push(paragraph_window); - } else if update_bloom_filter { - for ngram in hashes { - bloom_filter.insert_hashes(&ngram); - } - } - } - - // if annotate_attribute_only or annotate_only, add the annotation to the json - if annotate_attribute_only || annotate_only { - data["bff_duplicate_spans"] = serde_json::to_value(windows_to_remove).unwrap(); - data["bff_contained_ngram_count"] = - serde_json::to_value(total_contained_ngrams).unwrap(); - } else { - let mut output_paragraphs = String::new(); - let mut last_end = 0; - for paragraph_window in windows_to_remove { - output_paragraphs.push_str(&text[last_end..paragraph_window[0]]); - last_end = paragraph_window[1]; - } - output_paragraphs.push_str(&text[last_end..]); - data["text"] = Value::String(output_paragraphs); - data["bff_contained_ngram_count_before_dedupe"] = - serde_json::to_value(total_contained_ngrams).unwrap(); - } - - if annotate_attribute_only { - // Allowed fields - let allowed_fields = [ - "bff_duplicate_spans", - "bff_contained_ngram_count", - "id", - "source", - ]; - - // Iterate through the keys of the JSON object and remove any field that is not in the allowed_fields list - if let Value::Object(ref mut map) = data { - map.retain(|key, _| allowed_fields.contains(&key.as_str())); - } - - } - - serde_json::to_writer(&mut writer, &data)?; + let dedup_data = process_line(&line.to_string(), &bloom_filter, &bff_args); + serde_json::to_writer(&mut writer, &dedup_data)?; 
writer.write_all(b"\n")?; } - println!("COUNTED {} LINES", count); - return Ok(()); + println!("Number of lines in {:?} is {}", s3_input, count); // to finalize, write to s3 writer.finish().unwrap(); let bytes_to_upload = ByteStream::from(output_data); @@ -701,11 +531,110 @@ async fn process_file_s3( .body(bytes_to_upload) .send() .await?; - println!("COUNT WORKS {}", count); + match pbar_option { + Some(pbar) => pbar.lock().unwrap().inc(1), + None => (), + } + Ok(()) } +fn process_line(line: &String, bloom_filter: &BloomFilter, bff_args: &BffArgs) -> serde_json::Value{ + let mut data: Value = serde_json::from_str(&line).unwrap(); + let text = data["text"].as_str().unwrap(); + + let newlines = if bff_args.whole_document { + vec![0, text.len()] + } else { + let mut newlines = Vec::new(); + newlines.push(0); + for i in text.match_indices('\n') { + newlines.push(i.0); + } + newlines.push(text.len()); + newlines + }; + let mut windows_to_remove = Vec::new(); + let mut total_contained_ngrams = 0; + + for paragraph_window in newlines.windows(2) { + let paragraph = &text[paragraph_window[0]..paragraph_window[1]]; + + // calculate hashes for the paragraph + let mut hashes: Vec> = Vec::new(); + let mut ngram: VecDeque<&str> = VecDeque::with_capacity(bff_args.max_ngram_size); + for token in tokenize(paragraph) { + ngram.push_back(token); + // If not hashing whole paragraphs, add ngrams to the bloom filter as they reach max size + if !bff_args.whole_paragraphs && ngram.len() >= bff_args.max_ngram_size { + hashes.push(bloom_filter.hashes(&ngram)); + ngram.pop_front(); + } + } + // If the paragraph was too short, put in a shorter ngram, so we can dedupe short + // paragraphs exactly. 
+ if hashes.is_empty() && ngram.len() >= bff_args.min_ngram_size { + hashes.push(bloom_filter.hashes(&ngram)); + } + + let contained_ngrams = hashes + .iter() + .filter(|ngram| bloom_filter.contains_hashes(ngram)) + .count(); + total_contained_ngrams += contained_ngrams; + + // calculate how many ngrams are in the bloom filter + let number_of_ngrams = hashes.len(); + + // produce output + let too_many_duplicate_ngrams = + contained_ngrams as f64 / number_of_ngrams as f64 > bff_args.filtering_threshold; + if too_many_duplicate_ngrams { + windows_to_remove.push(paragraph_window); + } else if !bff_args.no_update_bloom_filter { + for ngram in hashes { + bloom_filter.insert_hashes(&ngram); + } + } + } + + // if annotate_attribute_only or annotate_only, add the annotation to the json + if bff_args.annotate_attribute_only { + data["bff_duplicate_spans"] = serde_json::to_value(windows_to_remove).unwrap(); + data["bff_contained_ngram_count"] = + serde_json::to_value(total_contained_ngrams).unwrap(); + } else { + let mut output_paragraphs = String::new(); + let mut last_end = 0; + for paragraph_window in windows_to_remove { + output_paragraphs.push_str(&text[last_end..paragraph_window[0]]); + last_end = paragraph_window[1]; + } + output_paragraphs.push_str(&text[last_end..]); + data["text"] = Value::String(output_paragraphs); + data["bff_contained_ngram_count_before_dedupe"] = + serde_json::to_value(total_contained_ngrams).unwrap(); + } + + if bff_args.annotate_attribute_only { + // Allowed fields + let allowed_fields = [ + "bff_duplicate_spans", + "bff_contained_ngram_count", + "id", + "source", + ]; + + // Iterate through the keys of the JSON object and remove any field that is not in the allowed_fields list + if let Value::Object(ref mut map) = data { + map.retain(|key, _| allowed_fields.contains(&key.as_str())); + } + } + data +} + + fn tokenize(s: &str) -> impl Iterator { @@ -744,78 +673,6 @@ fn expand_dirs(paths: &[PathBuf]) -> Result> { } -fn 
gather_groups(path_within_bucket: String, groupfile: PathBuf) -> Vec> { - let file = File::open(groupfile).expect("Failed to open file!"); - let reader = BufReader::new(file); - - let parsed_data: Vec> = reader - .lines() - .flatten() - .map(|line| line.split(',').map(|s| - format!("{}{}", path_within_bucket, s.to_string())).collect()) - .collect(); - parsed_data -} - - -fn clear_dir(dir_path: &PathBuf) { - // Creates directory dir if it doesn't exist - // Deletes all contents from dir if it does - - // Create the directory if it doesn't exist - if !dir_path.exists() { - fs::create_dir_all(dir_path) - .expect("Failed to create directory"); - println!("Directory created: {:?}", dir_path); - } else { - // Delete all contents of the directory - remove_dir_contents(&dir_path) - .expect("Failed to remove directory contents"); - println!("Directory contents removed: {:?}", dir_path); - }} - - - -fn remove_dir_contents(dir_path_buf: &PathBuf) -> std::io::Result<()> { - let dir_path: &Path = dir_path_buf.as_ref(); - for entry in fs::read_dir(dir_path)? 
{ - let entry = entry?; - let entry_path = entry.path(); - - if entry_path.is_dir() { - remove_dir_contents(&entry_path)?; - fs::remove_dir_all(entry_path)?; - } else { - fs::remove_file(entry_path)?; - } - } - - Ok(()) -} - -async fn aws_s3_cp_group(group: &Vec, output_loc: &PathBuf) { - let mut join_set = JoinSet::new(); - let output_loc_string : String = output_loc.display().to_string(); - for inner_el in group { - let child = Command::new("aws") - .arg("s3") - .arg("cp") - .arg(inner_el) - .arg(output_loc_string.clone()) - .spawn().expect("Failed to spawn!"); - join_set.spawn(async move { - child.wait_with_output().expect("Failed to finish task!"); - }); - } - - let mut completed = 0; - while completed < group.len() { - if let Some(_res) = join_set.join_next().await { - completed = completed + 1; - } - } -} - fn split_bucket_path(uri: &str) -> Option<(String, String)> { if let Some((bucket, path)) = uri.split_once('/') { @@ -825,234 +682,203 @@ fn split_bucket_path(uri: &str) -> Option<(String, String)> { } } + + /*============================================================= -= Main function part 2 = += Main Function = =============================================================*/ #[tokio::main] -async fn main() { - let args = Args::parse(); +async fn main() -> std::io::Result<()> { + let args = ArgParser::parse(); + + match &args.command { + Commands::Bff {inputs, output_directory, bff_args} => + { + bff(inputs, output_directory, &bff_args)?; + }, + Commands::BffRemote {s3io, bff_args} => { + bff_s3(s3io, &bff_args).await?; + } + } + Ok(()) +} - // Initialize {Basic things} - let threads = if args.threads == 0 { - available_parallelism().unwrap().get() - } else { - args.threads - }; - let now = Instant::now(); +fn bff(inputs: &Vec, output_directory: &PathBuf, bff_args: &BffArgs) -> std::io::Result<()> { + /* + General pseudocode: + Setup: + - Build/setup the bloom filter + - Expand all the inputs + - Setup progress bar + Main loop: + - loop over all files 
and process them + Finalize: + - Write bff if needed + */ - // Initialize bloom filter - let mut bloom_filter_size = args.bloom_filter_size; - let bloom_filter = if args.bloom_filter_file.exists() { - println!("Loading bloom filter from {:?}...", args.bloom_filter_file); - BloomFilter::from_file(&args.bloom_filter_file).unwrap() - } else { - println!("Creating new bloom filter..."); - if args.bloom_filter_size == 0 { - bloom_filter_size = compute_bloom_size(args.fp_rate, args.expected_ngram_count); - } - let num_hashers = BloomFilter::optimal_number_of_hashers( - bloom_filter_size, - args.expected_ngram_count, - ); - BloomFilter::new(bloom_filter_size, num_hashers) - }; - let bloom_filter = Arc::new(bloom_filter); - println!( - "\t...Bloom filter loaded. ({} hashers) ({} seconds)", - bloom_filter.hash_builders.len(), now.elapsed().as_secs() - ); + // SETUP PHASE - let p = bloom_filter.my_prob_of_false_positive(args.expected_ngram_count); - if p >= 0.5 { - println!( - "WARNING: Probability of a false positive after {} elements is {}.", - args.expected_ngram_count, p - ); - } else { - println!( - "Probability of a false positive after {} elements: {}", - args.expected_ngram_count, p + let start_time = Instant::now(); + let bloom_filter = Arc::new(BloomFilter::from_args(bff_args)); + let all_inputs = expand_dirs(inputs).unwrap(); + let pbar = ProgressBar::new(all_inputs.len() as u64) + .with_style( + ProgressStyle::with_template( + "Files {human_pos}/{human_len} [{elapsed_precise}/{duration_precise}] [{wide_bar:.cyan/blue}]", + ).unwrap() ); + let pbar = Arc::new(Mutex::new(pbar)); + if !bff_args.no_progress { + pbar.lock().unwrap().inc(0); // initializes pbar } + println!("Completed setup phase in {:?} seconds", start_time.elapsed().as_secs()); - println!("Bloom filter size is {} ", human_bytes(bloom_filter.size_in_bytes() as f64)); - - - let io_file = File::open(args.s3_io).expect("Failed to open io file"); - let reader = BufReader::new(io_file); + // LOOP PHASE (W/ 
Threadpool) + let threads = if bff_args.threads == 0 { + available_parallelism().unwrap().get() + } else { + bff_args.threads + }; + let loop_start_time = Instant::now(); let threadpool = ThreadPool::new(threads); - - for line in reader.lines() { + for input in all_inputs { + let mut output = output_directory.clone(); + output.push(input.file_name().unwrap()); let bloom_filter = bloom_filter.clone(); + let bff_args = bff_args.clone(); - threadpool.execute(move || { - - - - - if let Ok(line) = line { - let parts: Vec<&str> = line.split(',').collect(); - let input_file = parts[0].replace("s3://", ""); - let output_file = parts[1].replace("s3://", ""); - println!("{} {}", input_file, output_file); - let (bucket, input_path) = split_bucket_path(&input_file).unwrap(); - let (_, output_path) = split_bucket_path(&output_file).unwrap(); - - let rt = tokio::runtime::Runtime::new().unwrap(); - let res = rt.block_on( - process_file_s3(&bucket, &input_path, - &output_path, - &bloom_filter, - args.max_ngram_size, - args.min_ngram_size, - !args.no_update_bloom_filter, - args.filtering_threshold, - args.annotate_only, - args.annotate_attribute_only, - args.whole_document, - args.whole_paragraphs, - ) - ); - } - }); - - + let pbar_option: Option>> = if bff_args.no_progress { + None + } else { + Some(pbar.clone()) + }; - - + threadpool.execute(move || { + if bff_args.no_progress { + println!("Processing {input:?}..."); + } + process_file( + &input, + &output, + &bloom_filter, + &bff_args, + &pbar_option, + ) + .unwrap(); + }); } - threadpool.join(); + threadpool.join(); + println!("Completed filtering all files in {:?} seconds", + loop_start_time.elapsed().as_secs()); - - // And finally upload the temp directory to s3 - - + // FINALIZE PHASE + if (!bff_args.no_update_bloom_filter) && (!bff_args.no_save_bloom_filter) { + let write_start_time = Instant::now(); + println!("Writing bloom filter to {:?}...", bff_args.bloom_filter_file); + 
bloom_filter.write_to_file(&bff_args.bloom_filter_file).unwrap(); + println!("...Bloom filter written in {:?} seconds.", write_start_time.elapsed().as_secs()); + } + println!("Completed full BFF run in {:?} seconds", start_time.elapsed().as_secs()); + Ok(()) } +async fn bff_s3(s3_io: &PathBuf, bff_args: &BffArgs) -> std::io::Result<()> { + /* + General pseudocode: + Setup: + - Build/setup the bloom filter + - Setup thing to read s3_io + - Setup progress bar + Main loop: + - loop over all files and process them + Finalize: + - Write bff if needed + */ - -/************************************************ - * Main Function - * **********************************************/ - -fn main_dep() { - let args = Args::parse(); - let inputs = expand_dirs(&args.inputs).unwrap(); - println!("Parsed {:?} input files...", inputs.len()); - let threads = if args.threads == 0 { - available_parallelism().unwrap().get() - } else { - args.threads - }; - let now = Instant::now(); - let mut bloom_filter_size = args.bloom_filter_size; - let bloom_filter = if args.bloom_filter_file.exists() { - println!("Loading bloom filter from {:?}...", args.bloom_filter_file); - BloomFilter::from_file(&args.bloom_filter_file).unwrap() - } else { - println!("Creating new bloom filter..."); - if args.bloom_filter_size == 0 { - bloom_filter_size = compute_bloom_size(args.fp_rate, args.expected_ngram_count); - } - let num_hashers = BloomFilter::optimal_number_of_hashers( - bloom_filter_size, - args.expected_ngram_count, - ); - BloomFilter::new(bloom_filter_size, num_hashers) - }; - let bloom_filter = Arc::new(bloom_filter); - println!( - "\t...Bloom filter loaded. 
({} hashers) ({} seconds)", - bloom_filter.hash_builders.len(), now.elapsed().as_secs() - ); - - let p = bloom_filter.my_prob_of_false_positive(args.expected_ngram_count); - if p >= 0.5 { - println!( - "WARNING: Probability of a false positive after {} elements is {}.", - args.expected_ngram_count, p - ); - } else { - println!( - "Probability of a false positive after {} elements: {}", - args.expected_ngram_count, p - ); - } - - let suggested_size = BloomFilter::suggest_size_in_bytes(args.expected_ngram_count); - println!("Suggested size is {} | Actual size is {} ", - human_bytes(suggested_size as f64), human_bytes(bloom_filter.size_in_bytes() as f64)); - if suggested_size * 2 < bloom_filter.size_in_bytes() { - println!( - "WARNING: Your bloom filter is more than twice as large as suggested for {} elements. \ - This is good for accuracy, but it is much slower, and likely not worth the trade-off.", - args.expected_ngram_count - ); - } - // Build Progress bar (do some hacky arc/mutex wrapping) - let num_files = inputs.len() as u64; + let start_time = Instant::now(); + let bloom_filter = Arc::new(BloomFilter::from_args(bff_args)); + let io_file = File::open(s3_io).expect("Failed to open io file"); + let num_files = std::fs::read_to_string(s3_io).unwrap().lines().count() as u64; + let err_count = Arc::new(Mutex::new(0)); + let reader = BufReader::new(io_file); let pbar = ProgressBar::new(num_files) .with_style( ProgressStyle::with_template( "Files {human_pos}/{human_len} [{elapsed_precise}/{duration_precise}] [{wide_bar:.cyan/blue}]", ).unwrap() ); - let pbar = Arc::new(Mutex::new(pbar)); - - - if !args.no_progress { + if !bff_args.no_progress { pbar.lock().unwrap().inc(0); // initializes pbar } + println!("Completed setup phase in {:?} seconds", start_time.elapsed().as_secs()); - - let now = Instant::now(); + let threads = if bff_args.threads == 0 { + available_parallelism().unwrap().get() + } else { + bff_args.threads + }; let threadpool = ThreadPool::new(threads); - 
for input in inputs { - let mut output = args.output_directory.clone(); - output.push(input.file_name().unwrap()); - let bloom_filter = bloom_filter.clone(); - - let pbar_option: Option>> = if args.no_progress { + for line in reader.lines() { + let bloom_filter = bloom_filter.clone(); + let bff_args = bff_args.clone(); + let err_count: Arc> = Arc::clone(&err_count); + let pbar_option: Option>> = if bff_args.no_progress { None } else { Some(pbar.clone()) }; - threadpool.execute(move || { - if args.no_progress { - println!("Procaessing {input:?}..."); - } - process_file( - &input, - &output, - &bloom_filter, - args.max_ngram_size, - args.min_ngram_size, - !args.no_update_bloom_filter, - args.filtering_threshold, - args.annotate_only, - args.annotate_attribute_only, - args.whole_document, - args.whole_paragraphs, - &pbar_option, - ) - .unwrap(); - }); + if let Ok(line) = line { + let parts: Vec<&str> = line.split(',').collect(); + let input_file = parts[0].replace("s3://", ""); + let output_file = parts[1].replace("s3://", ""); + let (bucket, input_path) = split_bucket_path(&input_file).unwrap(); + let (_, output_path) = split_bucket_path(&output_file).unwrap(); + + + threadpool.execute(move || { + let rt = tokio::runtime::Runtime::new().unwrap(); + if let Err(err) = rt.block_on( + process_file_s3(&bucket, + &input_path, + &output_path, + &bloom_filter, + &bff_args, + &pbar_option) + ) { + eprintln!("Error processing {}; {:?}", input_path, err); + let mut count = err_count.lock().unwrap(); + *count += 1; + } + + + + }); + } } threadpool.join(); - println!("Completed deduplication in {} seconds", now.elapsed().as_secs()); - if (!args.no_update_bloom_filter) && (!args.no_save_bloom_filter) { - println!("Writing bloom filter to {:?}...", args.bloom_filter_file); - bloom_filter.write_to_file(&args.bloom_filter_file).unwrap(); - println!("Bloom filter written."); + + // FINALIZE PHASE + if (!bff_args.no_update_bloom_filter) && (!bff_args.no_save_bloom_filter) { + let 
write_start_time = Instant::now(); + println!("Writing bloom filter to {:?}...", bff_args.bloom_filter_file); + bloom_filter.write_to_file(&bff_args.bloom_filter_file).unwrap(); + println!("...Bloom filter written in {:?} seconds.", write_start_time.elapsed().as_secs()); } + println!("Error count is {}/{}", err_count.lock().unwrap(), num_files); + println!("Completed full BFF run in {:?} seconds", start_time.elapsed().as_secs()); + Ok(()) } + + + + From 18853bff1626fb62c0fded5f05f2bd137d7ee3be Mon Sep 17 00:00:00 2001 From: Matt Jordan Date: Mon, 18 Mar 2024 16:06:04 -0700 Subject: [PATCH 16/56] Added sysreq command --- src/main.rs | 79 +++++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 62 insertions(+), 17 deletions(-) diff --git a/src/main.rs b/src/main.rs index 75dd7fe..531937e 100644 --- a/src/main.rs +++ b/src/main.rs @@ -12,7 +12,7 @@ use rand::Rng; use serde_json::Value; use std::clone::Clone; use std::collections::VecDeque; -use std::fs::{OpenOptions, File}; +use std::fs::{OpenOptions, File, remove_file}; use std::hash::{BuildHasher, Hash, Hasher}; use std::io; use std::io::{Cursor, Read}; @@ -156,7 +156,15 @@ enum Commands { #[command(flatten)] bff_args: BffArgs + }, + + Sysreq { + #[arg(required=true, long)] + expected_ngram_count: usize, + #[arg(required=true, long)] + fp_rate: f64 } + } @@ -435,7 +443,7 @@ fn compute_bloom_size(fp_rate: f64, expected_ngram_count: usize) -> usize { #[allow(clippy::too_many_arguments)] fn process_file( input_file: &PathBuf, - output_file: &PathBuf, + output_file_path: &PathBuf, bloom_filter: &Arc, bff_args: &BffArgs, pbar_option: &Option>>, @@ -449,12 +457,13 @@ fn process_file( .open(input_file)?; let reader = BufReader::with_capacity(1024 * 1024, MultiGzDecoder::new(input_file)); + let output_file = OpenOptions::new() .read(false) .write(true) .create(true) .truncate(true) - .open(output_file)?; + .open(output_file_path)?; let mut writer = BufWriter::with_capacity( 1024 * 1024, 
GzEncoder::new(output_file, Compression::default()), @@ -462,11 +471,25 @@ fn process_file( // Loop over lines and do BFF stuff + let mut count = 0; + let mut fully_skipped = 0; for line in reader.lines() { + count += 1; let dedup_data = process_line(&line.unwrap(), &bloom_filter, &bff_args); - serde_json::to_writer(&mut writer, &dedup_data)?; - writer.write_all(b"\n")?; + if dedup_data.get("text").unwrap().as_str().unwrap().is_empty() { + fully_skipped += 1; + } + else { + serde_json::to_writer(&mut writer, &dedup_data)?; + writer.write_all(b"\n")?; + } + } + + if count == fully_skipped { + remove_file(output_file_path)?; } + + match pbar_option { Some(pbar) => pbar.lock().unwrap().inc(1), None => (), @@ -514,23 +537,35 @@ async fn process_file_s3( let mut writer = GzEncoder::new(Cursor::new(&mut output_data), Compression::default()); let mut count = 0; + let mut fully_skipped = 0; for line in input_string.lines() { count += 1; let dedup_data = process_line(&line.to_string(), &bloom_filter, &bff_args); - serde_json::to_writer(&mut writer, &dedup_data)?; - writer.write_all(b"\n")?; + if dedup_data.get("text").unwrap().as_str().unwrap().is_empty() { + fully_skipped += 1; + } + else { + serde_json::to_writer(&mut writer, &dedup_data)?; + writer.write_all(b"\n")?; + } + } println!("Number of lines in {:?} is {}", s3_input, count); + + // to finalize, write to s3 writer.finish().unwrap(); - let bytes_to_upload = ByteStream::from(output_data); - client - .put_object() - .bucket(s3_bucket) - .key(s3_output) - .body(bytes_to_upload) - .send() - .await?; + + if fully_skipped < count { + let bytes_to_upload = ByteStream::from(output_data); + client + .put_object() + .bucket(s3_bucket) + .key(s3_output) + .body(bytes_to_upload) + .send() + .await?; + } match pbar_option { Some(pbar) => pbar.lock().unwrap().inc(1), None => (), @@ -684,6 +719,12 @@ fn split_bucket_path(uri: &str) -> Option<(String, String)> { +async fn list_s3_objects(bucket: &str, prefix: &str) { + (); +} + + 
+ /*============================================================= = Main Function = =============================================================*/ @@ -697,8 +738,14 @@ async fn main() -> std::io::Result<()> { bff(inputs, output_directory, &bff_args)?; }, Commands::BffRemote {s3io, bff_args} => { + println!("NSBF {:?}", bff_args.no_save_bloom_filter); bff_s3(s3io, &bff_args).await?; } + Commands::Sysreq {expected_ngram_count, fp_rate} => { + let bff_size = compute_bloom_size(*fp_rate, *expected_ngram_count); + println!("To handle {} tokens with fp rate {}, you'd need a filter of size {}", + expected_ngram_count, fp_rate, human_bytes(bff_size as f64)); + } } Ok(()) } @@ -880,5 +927,3 @@ async fn bff_s3(s3_io: &PathBuf, bff_args: &BffArgs) -> std::io::Result<()> { } - - From 8052fe34a6833327dfd6b52ca28cdde263aa4007 Mon Sep 17 00:00:00 2001 From: Matt Jordan Date: Mon, 18 Mar 2024 16:14:55 -0700 Subject: [PATCH 17/56] Okay, overrode the sysreq blockers --- src/main.rs | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/src/main.rs b/src/main.rs index 531937e..35f0e6a 100644 --- a/src/main.rs +++ b/src/main.rs @@ -382,7 +382,7 @@ impl BloomFilter { } else { println!("Creating new bloom filter..."); if bff_args.bloom_filter_size == 0 { - bloom_filter_size = compute_bloom_size(bff_args.fp_rate, bff_args.expected_ngram_count); + bloom_filter_size = compute_bloom_size(bff_args.fp_rate, bff_args.expected_ngram_count, true); } let num_hashers = BloomFilter::optimal_number_of_hashers( bloom_filter_size, @@ -402,7 +402,7 @@ impl BloomFilter { -fn compute_bloom_size(fp_rate: f64, expected_ngram_count: usize) -> usize { +fn compute_bloom_size(fp_rate: f64, expected_ngram_count: usize, limit_to_sys: bool) -> usize { /* Uses binary search to find optimal size of bloom filter using optimal number of hashers and provided ngram counts */ @@ -412,10 +412,16 @@ fn compute_bloom_size(fp_rate: f64, expected_ngram_count: usize) -> usize { let mut lo = 1 as 
usize; - let mut hi = ((sys.total_memory() as f64) * 0.9) as usize; + + let mut hi = if limit_to_sys { + ((sys.total_memory() as f64) * 0.9) as usize + } else { + 420_744_073_709_551_615 as usize + }; + // Save some time by checking endpoint first - if BloomFilter::prob_of_false_positive(hi, expected_ngram_count, + if limit_to_sys && BloomFilter::prob_of_false_positive(hi, expected_ngram_count, BloomFilter::optimal_number_of_hashers(hi, expected_ngram_count)) > fp_rate { println!( "WARNING: To achieve desired false-positive rate, you'd need >90% of system RAM. Defaulting to 90% \ @@ -742,7 +748,7 @@ async fn main() -> std::io::Result<()> { bff_s3(s3io, &bff_args).await?; } Commands::Sysreq {expected_ngram_count, fp_rate} => { - let bff_size = compute_bloom_size(*fp_rate, *expected_ngram_count); + let bff_size = compute_bloom_size(*fp_rate, *expected_ngram_count, false); println!("To handle {} tokens with fp rate {}, you'd need a filter of size {}", expected_ngram_count, fp_rate, human_bytes(bff_size as f64)); } From 1b84f9fca366c2279625048aa3bcee8e3fa521fe Mon Sep 17 00:00:00 2001 From: Matt Jordan Date: Mon, 18 Mar 2024 17:50:14 -0700 Subject: [PATCH 18/56] Added better i/o collecting for bff-remote --- src/main.rs | 136 ++++++++++++++++++++++++++++++++-------------------- 1 file changed, 83 insertions(+), 53 deletions(-) diff --git a/src/main.rs b/src/main.rs index 35f0e6a..8a53f5b 100644 --- a/src/main.rs +++ b/src/main.rs @@ -12,7 +12,7 @@ use rand::Rng; use serde_json::Value; use std::clone::Clone; use std::collections::VecDeque; -use std::fs::{OpenOptions, File, remove_file}; +use std::fs::{OpenOptions, remove_file}; use std::hash::{BuildHasher, Hash, Hasher}; use std::io; use std::io::{Cursor, Read}; @@ -123,7 +123,7 @@ struct BffArgs{ enum Commands { /* Two commands here: - `bff` is for LOCAL files (local in -> local out) - - `bff_s3` is for S3 files (S3 in -> S3 out) + - `bff_remote` is for S3 files (S3 in -> S3 out) Where each takes default arguments of: 
@@ -132,9 +132,10 @@ enum Commands { + inputs: file or files (directories okay) of gzip compressed newline-delimited JSON files with a 'text' field + output_directory: where the deduplicated files get loaded to - -- bff_s3: - + s3_io_file: file where each line is like "s3://bucket/path/to/input.jsonl.gz,s3://bucket/path/to/output.jsonl.gz" - which explicitly lists all input files and where they should go once deduped + -- bff_remote: + + bucket + + input_dir + + output_dir */ #[clap(arg_required_else_help = true)] @@ -152,7 +153,13 @@ enum Commands { BffRemote { #[arg(required=true, long)] - s3io: PathBuf, + bucket: String, + + #[arg(required=true, long)] + input_dir: String, + + #[arg(required=true, long)] + output_dir: String, #[command(flatten)] bff_args: BffArgs @@ -163,7 +170,7 @@ enum Commands { expected_ngram_count: usize, #[arg(required=true, long)] fp_rate: f64 - } + }, } @@ -714,23 +721,52 @@ fn expand_dirs(paths: &[PathBuf]) -> Result> { } -fn split_bucket_path(uri: &str) -> Option<(String, String)> { - - if let Some((bucket, path)) = uri.split_once('/') { - Some((bucket.to_string(), path.to_string())) - } else { - None - } +fn extract_s3_basename(input_path: &str) -> &str { + let parts: Vec<&str> = input_path.split('/').collect(); + parts.last().unwrap() } +async fn gather_s3_io(bucket: &str, prefix: &str, output_dir: &str) -> Result>, Error> { + let region_provider = RegionProviderChain::first_try("us-west-2"); + let config = aws_config::defaults(BehaviorVersion::latest()) + .region(region_provider) + .load() + .await; + let client = Client::new(&config); -async fn list_s3_objects(bucket: &str, prefix: &str) { - (); + let mut response = client + .list_objects_v2() + .bucket(bucket.to_owned()) + .prefix(prefix.to_owned()) + .into_paginator() + .send(); + + + let mut io_pairs: Vec> = Vec::new(); + while let Some(result) = response.next().await { + match result { + Ok(output) => { + for object in output.contents() { + let input_key = 
object.key().unwrap(); + if !(input_key.ends_with(".jsonl.gz") || input_key.ends_with(".json.gz")) { + continue; + } + let basename = extract_s3_basename(&input_key); + let output_key = format!("{}{}", output_dir, basename).to_string(); + let io_pair = vec![String::from(input_key), String::from(&output_key)]; + io_pairs.push(io_pair); + } + } + Err(err) => { + eprintln!("{err:?}") + } + } + } + Ok(io_pairs) } - /*============================================================= = Main Function = =============================================================*/ @@ -743,15 +779,14 @@ async fn main() -> std::io::Result<()> { { bff(inputs, output_directory, &bff_args)?; }, - Commands::BffRemote {s3io, bff_args} => { - println!("NSBF {:?}", bff_args.no_save_bloom_filter); - bff_s3(s3io, &bff_args).await?; + Commands::BffRemote {bucket, input_dir, output_dir, bff_args} => { + bff_remote(bucket, input_dir, output_dir, &bff_args).await?; } Commands::Sysreq {expected_ngram_count, fp_rate} => { let bff_size = compute_bloom_size(*fp_rate, *expected_ngram_count, false); println!("To handle {} tokens with fp rate {}, you'd need a filter of size {}", expected_ngram_count, fp_rate, human_bytes(bff_size as f64)); - } + }, } Ok(()) } @@ -840,7 +875,7 @@ fn bff(inputs: &Vec, output_directory: &PathBuf, bff_args: &BffArgs) -> } -async fn bff_s3(s3_io: &PathBuf, bff_args: &BffArgs) -> std::io::Result<()> { +async fn bff_remote(bucket: &String, input_dir: &String, output_dir: &String, bff_args: &BffArgs) -> std::io::Result<()> { /* General pseudocode: @@ -856,11 +891,12 @@ async fn bff_s3(s3_io: &PathBuf, bff_args: &BffArgs) -> std::io::Result<()> { let start_time = Instant::now(); let bloom_filter = Arc::new(BloomFilter::from_args(bff_args)); - let io_file = File::open(s3_io).expect("Failed to open io file"); - let num_files = std::fs::read_to_string(s3_io).unwrap().lines().count() as u64; + + let io_pairs = gather_s3_io(bucket, input_dir, output_dir).await.unwrap(); + + let num_files = 
io_pairs.len(); let err_count = Arc::new(Mutex::new(0)); - let reader = BufReader::new(io_file); - let pbar = ProgressBar::new(num_files) + let pbar = ProgressBar::new(num_files as u64) .with_style( ProgressStyle::with_template( "Files {human_pos}/{human_len} [{elapsed_precise}/{duration_precise}] [{wide_bar:.cyan/blue}]", @@ -880,7 +916,8 @@ async fn bff_s3(s3_io: &PathBuf, bff_args: &BffArgs) -> std::io::Result<()> { }; let threadpool = ThreadPool::new(threads); - for line in reader.lines() { + for io_pair in &io_pairs { + let bucket = bucket.clone(); let bloom_filter = bloom_filter.clone(); let bff_args = bff_args.clone(); let err_count: Arc> = Arc::clone(&err_count); @@ -889,37 +926,30 @@ async fn bff_s3(s3_io: &PathBuf, bff_args: &BffArgs) -> std::io::Result<()> { } else { Some(pbar.clone()) }; + let input_path = io_pair[0].clone(); + let output_path = io_pair[1].clone(); - if let Ok(line) = line { - let parts: Vec<&str> = line.split(',').collect(); - let input_file = parts[0].replace("s3://", ""); - let output_file = parts[1].replace("s3://", ""); - let (bucket, input_path) = split_bucket_path(&input_file).unwrap(); - let (_, output_path) = split_bucket_path(&output_file).unwrap(); - - - threadpool.execute(move || { - let rt = tokio::runtime::Runtime::new().unwrap(); - if let Err(err) = rt.block_on( - process_file_s3(&bucket, - &input_path, - &output_path, - &bloom_filter, - &bff_args, - &pbar_option) - ) { - eprintln!("Error processing {}; {:?}", input_path, err); - let mut count = err_count.lock().unwrap(); - *count += 1; - } + threadpool.execute(move || { + let rt = tokio::runtime::Runtime::new().unwrap(); + if let Err(err) = rt.block_on( + process_file_s3(&bucket, + &input_path, + &output_path, + &bloom_filter, + &bff_args, + &pbar_option) + ) { + eprintln!("Error processing {}; {:?}", input_path, err); + let mut count = err_count.lock().unwrap(); + *count += 1; + } - - }); - } + + }); + } threadpool.join(); - // FINALIZE PHASE if 
(!bff_args.no_update_bloom_filter) && (!bff_args.no_save_bloom_filter) { let write_start_time = Instant::now(); From 50eee20709d64445ed939d72d749958505e8f80c Mon Sep 17 00:00:00 2001 From: Matt Jordan Date: Tue, 19 Mar 2024 10:41:47 -0700 Subject: [PATCH 19/56] Added streaming support for s3-in --- src/main.rs | 46 +++++++++++++++++++++++----------------------- 1 file changed, 23 insertions(+), 23 deletions(-) diff --git a/src/main.rs b/src/main.rs index 8a53f5b..6b27771 100644 --- a/src/main.rs +++ b/src/main.rs @@ -2,7 +2,7 @@ use ahash::RandomState; use anyhow::{anyhow, Result, Error}; use byteorder::{LittleEndian, NativeEndian, ReadBytesExt, WriteBytesExt}; use clap::{Args, Parser, Subcommand}; -use flate2::read::MultiGzDecoder; +use flate2::read::{MultiGzDecoder}; use flate2::write::GzEncoder; use flate2::Compression; use glob::glob; @@ -15,7 +15,7 @@ use std::collections::VecDeque; use std::fs::{OpenOptions, remove_file}; use std::hash::{BuildHasher, Hash, Hasher}; use std::io; -use std::io::{Cursor, Read}; +use std::io::{Cursor}; use std::io::{BufRead, BufReader, BufWriter, Write}; use std::mem::size_of; use std::path::{PathBuf}; @@ -33,6 +33,13 @@ use aws_config::meta::region::RegionProviderChain; use aws_config::BehaviorVersion; use aws_sdk_s3::{Client}; use aws_sdk_s3::primitives::ByteStream; +use aws_sdk_s3::operation::get_object::GetObjectOutput; + + +use tokio::io::{AsyncBufReadExt}; +use tokio::io::BufReader as tBufReader; +use async_compression::tokio::bufread::GzipDecoder as asyncGZ; + @@ -521,7 +528,7 @@ async fn process_file_s3( bff_args: &BffArgs, pbar_option: &Option>>, ) -> Result<(), Error> { - + // // Phase 1a: Build s3 client let region_provider = RegionProviderChain::first_try("us-west-2"); @@ -533,25 +540,29 @@ async fn process_file_s3( // Phase 1b: read data into lines - let object = client + // Note: this reads in a streaming sense (but we don't upload in streaming) + let object: GetObjectOutput = client .get_object() .bucket(s3_bucket) 
.key(s3_input) .send() .await?; - let s3_data = object.body.collect().await?; - let s3_data = s3_data.into_bytes(); - let mut gz = MultiGzDecoder::new(&s3_data[..]); - let mut input_string = String::new(); - gz.read_to_string(&mut input_string)?; + + let body_stream = object.body.into_async_read(); + let gz = asyncGZ::new(body_stream); + let reader = tBufReader::new(gz); + let mut lines_iter = reader.lines(); // Phase 1c: Setup output buffer to upload->s3 eventually... + // TODO: Make output writer streaming too? let mut output_data = Vec::new(); let mut writer = GzEncoder::new(Cursor::new(&mut output_data), Compression::default()); + + // Phase 2: Loop over lines, process each line, and write it if not fully eradicated let mut count = 0; let mut fully_skipped = 0; - for line in input_string.lines() { + while let Some(line) = lines_iter.next_line().await? { count += 1; let dedup_data = process_line(&line.to_string(), &bloom_filter, &bff_args); if dedup_data.get("text").unwrap().as_str().unwrap().is_empty() { @@ -566,9 +577,8 @@ async fn process_file_s3( println!("Number of lines in {:?} is {}", s3_input, count); - // to finalize, write to s3 + // Phase 3: to finalize, write to s3 if there's something to write writer.finish().unwrap(); - if fully_skipped < count { let bytes_to_upload = ByteStream::from(output_data); client @@ -583,7 +593,7 @@ async fn process_file_s3( Some(pbar) => pbar.lock().unwrap().inc(1), None => (), } - + Ok(()) } @@ -742,7 +752,6 @@ async fn gather_s3_io(bucket: &str, prefix: &str, output_dir: &str) -> Result> = Vec::new(); while let Some(result) = response.next().await { match result { @@ -793,7 +802,6 @@ async fn main() -> std::io::Result<()> { fn bff(inputs: &Vec, output_directory: &PathBuf, bff_args: &BffArgs) -> std::io::Result<()> { - /* General pseudocode: Setup: @@ -805,9 +813,7 @@ fn bff(inputs: &Vec, output_directory: &PathBuf, bff_args: &BffArgs) -> Finalize: - Write bff if needed */ - // SETUP PHASE - let start_time = 
Instant::now(); let bloom_filter = Arc::new(BloomFilter::from_args(bff_args)); let all_inputs = expand_dirs(inputs).unwrap(); @@ -876,7 +882,6 @@ fn bff(inputs: &Vec, output_directory: &PathBuf, bff_args: &BffArgs) -> async fn bff_remote(bucket: &String, input_dir: &String, output_dir: &String, bff_args: &BffArgs) -> std::io::Result<()> { - /* General pseudocode: Setup: @@ -888,7 +893,6 @@ async fn bff_remote(bucket: &String, input_dir: &String, output_dir: &String, bf Finalize: - Write bff if needed */ - let start_time = Instant::now(); let bloom_filter = Arc::new(BloomFilter::from_args(bff_args)); @@ -908,14 +912,12 @@ async fn bff_remote(bucket: &String, input_dir: &String, output_dir: &String, bf } println!("Completed setup phase in {:?} seconds", start_time.elapsed().as_secs()); - let threads = if bff_args.threads == 0 { available_parallelism().unwrap().get() } else { bff_args.threads }; let threadpool = ThreadPool::new(threads); - for io_pair in &io_pairs { let bucket = bucket.clone(); let bloom_filter = bloom_filter.clone(); @@ -943,8 +945,6 @@ async fn bff_remote(bucket: &String, input_dir: &String, output_dir: &String, bf let mut count = err_count.lock().unwrap(); *count += 1; } - - }); From 241dc0555bb02f9576b0bf5a8e29347f12c1418c Mon Sep 17 00:00:00 2001 From: Matt Jordan Date: Tue, 19 Mar 2024 11:01:51 -0700 Subject: [PATCH 20/56] Added default region for AWS --- src/main.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/main.rs b/src/main.rs index 6b27771..643d7fa 100644 --- a/src/main.rs +++ b/src/main.rs @@ -531,7 +531,7 @@ async fn process_file_s3( // // Phase 1a: Build s3 client - let region_provider = RegionProviderChain::first_try("us-west-2"); + let region_provider = RegionProviderChain::default_provider(); let config = aws_config::defaults(BehaviorVersion::latest()) .region(region_provider) .load() @@ -738,7 +738,7 @@ fn extract_s3_basename(input_path: &str) -> &str { async fn gather_s3_io(bucket: &str, prefix: &str, 
output_dir: &str) -> Result>, Error> { - let region_provider = RegionProviderChain::first_try("us-west-2"); + let region_provider = RegionProviderChain::default_provider(); let config = aws_config::defaults(BehaviorVersion::latest()) .region(region_provider) .load() From 4840e95511d1a926f8261e615701b5dd73f615d3 Mon Sep 17 00:00:00 2001 From: Matt Jordan Date: Wed, 20 Mar 2024 10:00:22 -0700 Subject: [PATCH 21/56] Added some more timing/ablations for some various checks --- Cargo.toml | 1 + src/main.rs | 180 +++++++++++++++++++++++++++++++++++++++++++++------- 2 files changed, 158 insertions(+), 23 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index f7387c0..b53831d 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -22,3 +22,4 @@ anyhow = "1.0.80" tokio = { version = "1.36.0", features = ["full"] } aws-config = "1.1.8" aws-sdk-s3 = "1.19.1" +async-compression = {version ="0.4.6", features=["all"]} diff --git a/src/main.rs b/src/main.rs index 643d7fa..e702762 100644 --- a/src/main.rs +++ b/src/main.rs @@ -120,6 +120,14 @@ struct BffArgs{ #[arg(long, default_value_t = false)] no_progress: bool, + + /// PROFILE ARGS (REMOVE LATER) + #[arg(long, default_value_t = false)] + count_lines_only: bool, + + #[arg(long, default_value_t = false)] + fully_download_s3: bool, + #[arg(long, short = 't', default_value_t = 0)] threads: usize, } @@ -520,7 +528,7 @@ fn process_file( -async fn process_file_s3( +async fn process_file_s3_stream( s3_bucket: &String, s3_input: &String, s3_output: &String, @@ -548,6 +556,7 @@ async fn process_file_s3( .send() .await?; + let body_stream = object.body.into_async_read(); let gz = asyncGZ::new(body_stream); let reader = tBufReader::new(gz); @@ -556,7 +565,8 @@ async fn process_file_s3( // Phase 1c: Setup output buffer to upload->s3 eventually... // TODO: Make output writer streaming too? 
let mut output_data = Vec::new(); - let mut writer = GzEncoder::new(Cursor::new(&mut output_data), Compression::default()); + let encoder = GzEncoder::new(Cursor::new(&mut output_data), Compression::default()); + let mut buf_writer = BufWriter::with_capacity(1024 * 1024, encoder); // Phase 2: Loop over lines, process each line, and write it if not fully eradicated @@ -564,13 +574,105 @@ async fn process_file_s3( let mut fully_skipped = 0; while let Some(line) = lines_iter.next_line().await? { count += 1; + if bff_args.count_lines_only { // REMOVE THIS BLOCK WHEN WE REMOVE count_lines_only arg + fully_skipped += 1; + continue; + } let dedup_data = process_line(&line.to_string(), &bloom_filter, &bff_args); if dedup_data.get("text").unwrap().as_str().unwrap().is_empty() { fully_skipped += 1; } else { - serde_json::to_writer(&mut writer, &dedup_data)?; - writer.write_all(b"\n")?; + serde_json::to_writer(&mut buf_writer, &dedup_data)?; + buf_writer.write_all(b"\n")?; + } + + } + println!("Number of lines in {:?} is {}", s3_input, count); + + + // Phase 3: to finalize, write to s3 if there's something to write + buf_writer.flush()?; + let encoder = buf_writer.into_inner().expect("Failed to get encoder"); + encoder.finish().unwrap(); + + if fully_skipped < count { + let bytes_to_upload = ByteStream::from(output_data); + client + .put_object() + .bucket(s3_bucket) + .key(s3_output) + .body(bytes_to_upload) + .send() + .await?; + } + match pbar_option { + Some(pbar) => pbar.lock().unwrap().inc(1), + None => (), + } + + Ok(()) +} + + + +async fn process_file_s3_download( + s3_bucket: &String, + s3_input: &String, + s3_output: &String, + bloom_filter: &Arc, + bff_args: &BffArgs, + pbar_option: &Option>>, +) -> Result<(), Error> { + // + + // Phase 1a: Build s3 client + let region_provider = RegionProviderChain::default_provider(); + let config = aws_config::defaults(BehaviorVersion::latest()) + .region(region_provider) + .load() + .await; + let client = Client::new(&config); 
+ + + // Phase 1b: read data into lines + // Note: this reads in a streaming sense (but we don't upload in streaming) + let object: GetObjectOutput = client + .get_object() + .bucket(s3_bucket) + .key(s3_input) + .send() + .await?; + + + let body_stream = object.body.into_async_read(); + let gz = asyncGZ::new(body_stream); + let reader = tBufReader::new(gz); + let mut lines_iter = reader.lines(); + + // Phase 1c: Setup output buffer to upload->s3 eventually... + // TODO: Make output writer streaming too? + let mut output_data = Vec::new(); + let encoder = GzEncoder::new(Cursor::new(&mut output_data), Compression::default()); + let mut buf_writer = BufWriter::with_capacity(1024 * 1024, encoder); + + + // Phase 2: Loop over lines, process each line, and write it if not fully eradicated + let mut count = 0; + let mut fully_skipped = 0; + while let Some(line) = lines_iter.next_line().await? { + count += 1; + if bff_args.count_lines_only { // REMOVE THIS BLOCK WHEN WE REMOVE count_lines_only arg + fully_skipped += 1; + continue; + } + let dedup_data = process_line(&line.to_string(), &bloom_filter, &bff_args); + if dedup_data.get("text").unwrap().as_str().unwrap().is_empty() { + fully_skipped += 1; + } + else { + serde_json::to_writer(&mut buf_writer, &dedup_data)?; + buf_writer.write_all(b"\n")?; } } @@ -578,7 +680,10 @@ async fn process_file_s3( // Phase 3: to finalize, write to s3 if there's something to write - writer.finish().unwrap(); + buf_writer.flush()?; + let encoder = buf_writer.into_inner().expect("Failed to get encoder"); + encoder.finish().unwrap(); + if fully_skipped < count { let bytes_to_upload = ByteStream::from(output_data); client @@ -598,6 +703,8 @@ async fn process_file_s3( } + + fn process_line(line: &String, bloom_filter: &BloomFilter, bff_args: &BffArgs) -> serde_json::Value{ let mut data: Value = serde_json::from_str(&line).unwrap(); let text = data["text"].as_str().unwrap(); @@ -793,8 +900,9 @@ async fn main() -> std::io::Result<()> { } 
Commands::Sysreq {expected_ngram_count, fp_rate} => { let bff_size = compute_bloom_size(*fp_rate, *expected_ngram_count, false); - println!("To handle {} tokens with fp rate {}, you'd need a filter of size {}", - expected_ngram_count, fp_rate, human_bytes(bff_size as f64)); + let num_hashers = BloomFilter::optimal_number_of_hashers(bff_size, *expected_ngram_count); + println!("To handle {} tokens with fp rate {}, you'd need a filter of size {} and {} hashers", + expected_ngram_count, fp_rate, human_bytes(bff_size as f64), num_hashers); }, } Ok(()) @@ -912,6 +1020,8 @@ async fn bff_remote(bucket: &String, input_dir: &String, output_dir: &String, bf } println!("Completed setup phase in {:?} seconds", start_time.elapsed().as_secs()); + + let loop_start_time = Instant::now(); let threads = if bff_args.threads == 0 { available_parallelism().unwrap().get() } else { @@ -931,25 +1041,49 @@ async fn bff_remote(bucket: &String, input_dir: &String, output_dir: &String, bf let input_path = io_pair[0].clone(); let output_path = io_pair[1].clone(); - threadpool.execute(move || { - let rt = tokio::runtime::Runtime::new().unwrap(); - if let Err(err) = rt.block_on( - process_file_s3(&bucket, - &input_path, - &output_path, - &bloom_filter, - &bff_args, - &pbar_option) - ) { - eprintln!("Error processing {}; {:?}", input_path, err); - let mut count = err_count.lock().unwrap(); - *count += 1; - } - - }); + + if bff_args.fully_download_s3 { + threadpool.execute(move || { + let rt = tokio::runtime::Runtime::new().unwrap(); + if let Err(err) = rt.block_on( + process_file_s3_download(&bucket, + &input_path, + &output_path, + &bloom_filter, + &bff_args, + &pbar_option) + ) { + eprintln!("Error processing {}; {:?}", input_path, err); + let mut count = err_count.lock().unwrap(); + *count += 1; + } + + }); + } else { + threadpool.execute(move || { + let rt = tokio::runtime::Runtime::new().unwrap(); + if let Err(err) = rt.block_on( + process_file_s3_stream(&bucket, + &input_path, + 
&output_path, + &bloom_filter, + &bff_args, + &pbar_option) + ) { + eprintln!("Error processing {}; {:?}", input_path, err); + let mut count = err_count.lock().unwrap(); + *count += 1; + } + + }); + + } } threadpool.join(); + println!("Completed filtering all files in {:?} seconds", + loop_start_time.elapsed().as_secs()); + // FINALIZE PHASE if (!bff_args.no_update_bloom_filter) && (!bff_args.no_save_bloom_filter) { let write_start_time = Instant::now(); From cd29a10275a31e27c60c8ceda7e1c7e146628a01 Mon Sep 17 00:00:00 2001 From: Matt Jordan Date: Wed, 20 Mar 2024 10:30:34 -0700 Subject: [PATCH 22/56] More testing stuff --- src/main.rs | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/src/main.rs b/src/main.rs index e702762..3cfc199 100644 --- a/src/main.rs +++ b/src/main.rs @@ -120,6 +120,9 @@ struct BffArgs{ #[arg(long, default_value_t = false)] no_progress: bool, + #[arg(long, short = 't', default_value_t = 0)] + threads: usize, + /// PROFILE ARGS (REMOVE LATER) #[arg(long, default_value_t = false)] @@ -128,8 +131,10 @@ struct BffArgs{ #[arg(long, default_value_t = false)] fully_download_s3: bool, - #[arg(long, short = 't', default_value_t = 0)] - threads: usize, + #[arg(long, default_value_t= 0)] + limit_files: usize, + + } @@ -588,7 +593,7 @@ async fn process_file_s3_stream( } } - println!("Number of lines in {:?} is {}", s3_input, count); + // println!("Number of lines in {:?} is {}", s3_input, count); // Phase 3: to finalize, write to s3 if there's something to write @@ -1004,7 +1009,11 @@ async fn bff_remote(bucket: &String, input_dir: &String, output_dir: &String, bf let start_time = Instant::now(); let bloom_filter = Arc::new(BloomFilter::from_args(bff_args)); - let io_pairs = gather_s3_io(bucket, input_dir, output_dir).await.unwrap(); + let mut io_pairs = gather_s3_io(bucket, input_dir, output_dir).await.unwrap(); + if bff_args.limit_files > 0 && io_pairs.len() > bff_args.limit_files { + 
io_pairs.truncate(bff_args.limit_files); + } + let num_files = io_pairs.len(); let err_count = Arc::new(Mutex::new(0)); @@ -1083,7 +1092,7 @@ async fn bff_remote(bucket: &String, input_dir: &String, output_dir: &String, bf threadpool.join(); println!("Completed filtering all files in {:?} seconds", loop_start_time.elapsed().as_secs()); - + // FINALIZE PHASE if (!bff_args.no_update_bloom_filter) && (!bff_args.no_save_bloom_filter) { let write_start_time = Instant::now(); From ba3cd53cbfb2a90d394edb3b26a783d1409c09c7 Mon Sep 17 00:00:00 2001 From: Matt Jordan Date: Wed, 20 Mar 2024 11:55:08 -0700 Subject: [PATCH 23/56] Removing debuggy things --- src/main.rs | 141 +++------------------------------------------------- 1 file changed, 7 insertions(+), 134 deletions(-) diff --git a/src/main.rs b/src/main.rs index 3cfc199..b82d2b1 100644 --- a/src/main.rs +++ b/src/main.rs @@ -123,18 +123,6 @@ struct BffArgs{ #[arg(long, short = 't', default_value_t = 0)] threads: usize, - - /// PROFILE ARGS (REMOVE LATER) - #[arg(long, default_value_t = false)] - count_lines_only: bool, - - #[arg(long, default_value_t = false)] - fully_download_s3: bool, - - #[arg(long, default_value_t= 0)] - limit_files: usize, - - } @@ -533,7 +521,7 @@ fn process_file( -async fn process_file_s3_stream( +async fn process_file_s3( s3_bucket: &String, s3_input: &String, s3_output: &String, @@ -564,7 +552,7 @@ async fn process_file_s3_stream( let body_stream = object.body.into_async_read(); let gz = asyncGZ::new(body_stream); - let reader = tBufReader::new(gz); + let reader = tBufReader::with_capacity(1024 * 1024, gz); let mut lines_iter = reader.lines(); // Phase 1c: Setup output buffer to upload->s3 eventually... @@ -579,10 +567,6 @@ async fn process_file_s3_stream( let mut fully_skipped = 0; while let Some(line) = lines_iter.next_line().await? 
{ count += 1; - if bff_args.count_lines_only { // REMOVE THIS BLOCK WHEN WE REMOVE count_lines_only arg - fully_skipped += 1; - continue; - } let dedup_data = process_line(&line.to_string(), &bloom_filter, &bff_args); if dedup_data.get("text").unwrap().as_str().unwrap().is_empty() { fully_skipped += 1; @@ -621,95 +605,6 @@ async fn process_file_s3_stream( -async fn process_file_s3_download( - s3_bucket: &String, - s3_input: &String, - s3_output: &String, - bloom_filter: &Arc, - bff_args: &BffArgs, - pbar_option: &Option>>, -) -> Result<(), Error> { - // - - // Phase 1a: Build s3 client - let region_provider = RegionProviderChain::default_provider(); - let config = aws_config::defaults(BehaviorVersion::latest()) - .region(region_provider) - .load() - .await; - let client = Client::new(&config); - - - // Phase 1b: read data into lines - // Note: this reads in a streaming sense (but we don't upload in streaming) - let object: GetObjectOutput = client - .get_object() - .bucket(s3_bucket) - .key(s3_input) - .send() - .await?; - - - let body_stream = object.body.into_async_read(); - let gz = asyncGZ::new(body_stream); - let reader = tBufReader::new(gz); - let mut lines_iter = reader.lines(); - - // Phase 1c: Setup output buffer to upload->s3 eventually... - // TODO: Make output writer streaming too? - let mut output_data = Vec::new(); - let encoder = GzEncoder::new(Cursor::new(&mut output_data), Compression::default()); - let mut buf_writer = BufWriter::with_capacity(1024 * 1024, encoder); - - - // Phase 2: Loop over lines, process each line, and write it if not fully eradicated - let mut count = 0; - let mut fully_skipped = 0; - while let Some(line) = lines_iter.next_line().await? 
{ - count += 1; - if bff_args.count_lines_only { // REMOVE THIS BLOCK WHEN WE REMOVE count_lines_only arg - fully_skipped += 1; - continue; - } - let dedup_data = process_line(&line.to_string(), &bloom_filter, &bff_args); - if dedup_data.get("text").unwrap().as_str().unwrap().is_empty() { - fully_skipped += 1; - } - else { - serde_json::to_writer(&mut buf_writer, &dedup_data)?; - buf_writer.write_all(b"\n")?; - } - - } - println!("Number of lines in {:?} is {}", s3_input, count); - - - // Phase 3: to finalize, write to s3 if there's something to write - buf_writer.flush()?; - let encoder = buf_writer.into_inner().expect("Failed to get encoder"); - encoder.finish().unwrap(); - - if fully_skipped < count { - let bytes_to_upload = ByteStream::from(output_data); - client - .put_object() - .bucket(s3_bucket) - .key(s3_output) - .body(bytes_to_upload) - .send() - .await?; - } - match pbar_option { - Some(pbar) => pbar.lock().unwrap().inc(1), - None => (), - } - - Ok(()) -} - - - - fn process_line(line: &String, bloom_filter: &BloomFilter, bff_args: &BffArgs) -> serde_json::Value{ let mut data: Value = serde_json::from_str(&line).unwrap(); let text = data["text"].as_str().unwrap(); @@ -1008,11 +903,7 @@ async fn bff_remote(bucket: &String, input_dir: &String, output_dir: &String, bf */ let start_time = Instant::now(); let bloom_filter = Arc::new(BloomFilter::from_args(bff_args)); - - let mut io_pairs = gather_s3_io(bucket, input_dir, output_dir).await.unwrap(); - if bff_args.limit_files > 0 && io_pairs.len() > bff_args.limit_files { - io_pairs.truncate(bff_args.limit_files); - } + let io_pairs = gather_s3_io(bucket, input_dir, output_dir).await.unwrap(); let num_files = io_pairs.len(); @@ -1051,11 +942,10 @@ async fn bff_remote(bucket: &String, input_dir: &String, output_dir: &String, bf let output_path = io_pair[1].clone(); - if bff_args.fully_download_s3 { - threadpool.execute(move || { + threadpool.execute(move || { let rt = tokio::runtime::Runtime::new().unwrap(); if 
let Err(err) = rt.block_on( - process_file_s3_download(&bucket, + process_file_s3(&bucket, &input_path, &output_path, &bloom_filter, @@ -1067,26 +957,9 @@ async fn bff_remote(bucket: &String, input_dir: &String, output_dir: &String, bf *count += 1; } - }); - } else { - threadpool.execute(move || { - let rt = tokio::runtime::Runtime::new().unwrap(); - if let Err(err) = rt.block_on( - process_file_s3_stream(&bucket, - &input_path, - &output_path, - &bloom_filter, - &bff_args, - &pbar_option) - ) { - eprintln!("Error processing {}; {:?}", input_path, err); - let mut count = err_count.lock().unwrap(); - *count += 1; - } - - }); + }); - } + } threadpool.join(); From 1dabe057eba116ccb7a3aa9397d60fd28ca3325d Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Wed, 20 Mar 2024 22:32:05 +0000 Subject: [PATCH 24/56] Support subset --- src/main.rs | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/src/main.rs b/src/main.rs index b82d2b1..a635a2c 100644 --- a/src/main.rs +++ b/src/main.rs @@ -144,6 +144,7 @@ enum Commands { + bucket + input_dir + output_dir + + subset */ #[clap(arg_required_else_help = true)] @@ -169,6 +170,9 @@ enum Commands { #[arg(required=true, long)] output_dir: String, + #[arg(long)] + subset: Option, + #[command(flatten)] bff_args: BffArgs }, @@ -744,7 +748,7 @@ fn extract_s3_basename(input_path: &str) -> &str { } -async fn gather_s3_io(bucket: &str, prefix: &str, output_dir: &str) -> Result>, Error> { +async fn gather_s3_io(bucket: &str, prefix: &str, output_dir: &str, subset: &Option) -> Result>, Error> { let region_provider = RegionProviderChain::default_provider(); let config = aws_config::defaults(BehaviorVersion::latest()) .region(region_provider) @@ -760,10 +764,14 @@ async fn gather_s3_io(bucket: &str, prefix: &str, output_dir: &str) -> Result> = Vec::new(); - while let Some(result) = response.next().await { + 'outer: while let Some(result) = response.next().await { match result { Ok(output) => { for object in 
output.contents() { + if subset.is_some() && io_pairs.len() >= subset.unwrap() { + // Saw enough data for subset, skip + break 'outer; + } let input_key = object.key().unwrap(); if !(input_key.ends_with(".jsonl.gz") || input_key.ends_with(".json.gz")) { continue; @@ -795,8 +803,8 @@ async fn main() -> std::io::Result<()> { { bff(inputs, output_directory, &bff_args)?; }, - Commands::BffRemote {bucket, input_dir, output_dir, bff_args} => { - bff_remote(bucket, input_dir, output_dir, &bff_args).await?; + Commands::BffRemote {bucket, input_dir, output_dir, subset, bff_args} => { + bff_remote(bucket, input_dir, output_dir, subset, &bff_args).await?; } Commands::Sysreq {expected_ngram_count, fp_rate} => { let bff_size = compute_bloom_size(*fp_rate, *expected_ngram_count, false); @@ -889,7 +897,7 @@ fn bff(inputs: &Vec, output_directory: &PathBuf, bff_args: &BffArgs) -> } -async fn bff_remote(bucket: &String, input_dir: &String, output_dir: &String, bff_args: &BffArgs) -> std::io::Result<()> { +async fn bff_remote(bucket: &String, input_dir: &String, output_dir: &String, subset: &Option, bff_args: &BffArgs) -> std::io::Result<()> { /* General pseudocode: Setup: @@ -903,7 +911,7 @@ async fn bff_remote(bucket: &String, input_dir: &String, output_dir: &String, bf */ let start_time = Instant::now(); let bloom_filter = Arc::new(BloomFilter::from_args(bff_args)); - let io_pairs = gather_s3_io(bucket, input_dir, output_dir).await.unwrap(); + let io_pairs = gather_s3_io(bucket, input_dir, output_dir, subset).await.unwrap(); let num_files = io_pairs.len(); From 031098e949d0f1dd4f5abe35bbac23091b89612f Mon Sep 17 00:00:00 2001 From: Matt Jordan Date: Wed, 20 Mar 2024 15:47:29 -0700 Subject: [PATCH 25/56] Added some better printouts for removal stats at the end --- src/main.rs | 89 +++++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 69 insertions(+), 20 deletions(-) diff --git a/src/main.rs b/src/main.rs index b82d2b1..2d145cd 100644 --- a/src/main.rs +++ 
b/src/main.rs @@ -468,7 +468,7 @@ fn process_file( bloom_filter: &Arc, bff_args: &BffArgs, pbar_option: &Option>>, -) -> Result<(), io::Error> { +) -> Result<(usize, usize), io::Error> { // Setup input/output writers let input_file = OpenOptions::new() @@ -494,9 +494,14 @@ fn process_file( // Loop over lines and do BFF stuff let mut count = 0; let mut fully_skipped = 0; + let mut removed_items = 0; + let mut total_items = 0; for line in reader.lines() { count += 1; - let dedup_data = process_line(&line.unwrap(), &bloom_filter, &bff_args); + let (dedup_data, removed_line_items, total_line_items) = process_line(&line.unwrap(), &bloom_filter, &bff_args); + removed_items += removed_line_items; + total_items += total_line_items; + if dedup_data.get("text").unwrap().as_str().unwrap().is_empty() { fully_skipped += 1; } @@ -515,7 +520,7 @@ fn process_file( Some(pbar) => pbar.lock().unwrap().inc(1), None => (), } - Ok(()) + Ok((removed_items, total_items)) } @@ -528,7 +533,7 @@ async fn process_file_s3( bloom_filter: &Arc, bff_args: &BffArgs, pbar_option: &Option>>, -) -> Result<(), Error> { +) -> Result<(usize, usize), Error> { // // Phase 1a: Build s3 client @@ -565,9 +570,13 @@ async fn process_file_s3( // Phase 2: Loop over lines, process each line, and write it if not fully eradicated let mut count = 0; let mut fully_skipped = 0; + let mut removed_items = 0; + let mut total_items = 0; while let Some(line) = lines_iter.next_line().await? 
{ count += 1; - let dedup_data = process_line(&line.to_string(), &bloom_filter, &bff_args); + let (dedup_data, removed_line_items, total_line_items) = process_line(&line.to_string(), &bloom_filter, &bff_args); + removed_items += removed_line_items; + total_items += total_line_items; if dedup_data.get("text").unwrap().as_str().unwrap().is_empty() { fully_skipped += 1; } @@ -600,13 +609,16 @@ async fn process_file_s3( None => (), } - Ok(()) + Ok((removed_items, total_items)) } -fn process_line(line: &String, bloom_filter: &BloomFilter, bff_args: &BffArgs) -> serde_json::Value{ +fn process_line(line: &String, bloom_filter: &BloomFilter, bff_args: &BffArgs) -> (serde_json::Value, usize, usize){ let mut data: Value = serde_json::from_str(&line).unwrap(); + let mut total_items = 0; + let mut removed_items = 0; + let text = data["text"].as_str().unwrap(); let newlines = if bff_args.whole_document { @@ -625,6 +637,7 @@ fn process_line(line: &String, bloom_filter: &BloomFilter, bff_args: &BffArgs) - for paragraph_window in newlines.windows(2) { let paragraph = &text[paragraph_window[0]..paragraph_window[1]]; + total_items += 1; // calculate hashes for the paragraph let mut hashes: Vec> = Vec::new(); @@ -657,6 +670,7 @@ fn process_line(line: &String, bloom_filter: &BloomFilter, bff_args: &BffArgs) - contained_ngrams as f64 / number_of_ngrams as f64 > bff_args.filtering_threshold; if too_many_duplicate_ngrams { windows_to_remove.push(paragraph_window); + removed_items += 1; } else if !bff_args.no_update_bloom_filter { for ngram in hashes { bloom_filter.insert_hashes(&ngram); @@ -696,7 +710,7 @@ fn process_line(line: &String, bloom_filter: &BloomFilter, bff_args: &BffArgs) - map.retain(|key, _| allowed_fields.contains(&key.as_str())); } } - data + (data, removed_items, total_items) } @@ -805,6 +819,7 @@ async fn main() -> std::io::Result<()> { expected_ngram_count, fp_rate, human_bytes(bff_size as f64), num_hashers); }, } + Ok(()) } @@ -832,10 +847,11 @@ fn bff(inputs: &Vec, 
output_directory: &PathBuf, bff_args: &BffArgs) -> ).unwrap() ); let pbar = Arc::new(Mutex::new(pbar)); + println!("Completed setup phase in {:?} seconds", start_time.elapsed().as_secs()); + if !bff_args.no_progress { pbar.lock().unwrap().inc(0); // initializes pbar } - println!("Completed setup phase in {:?} seconds", start_time.elapsed().as_secs()); // LOOP PHASE (W/ Threadpool) @@ -845,13 +861,16 @@ fn bff(inputs: &Vec, output_directory: &PathBuf, bff_args: &BffArgs) -> bff_args.threads }; let loop_start_time = Instant::now(); + let total_items = Arc::new(Mutex::new(0)); + let removed_items = Arc::new(Mutex::new(0)); let threadpool = ThreadPool::new(threads); for input in all_inputs { let mut output = output_directory.clone(); output.push(input.file_name().unwrap()); let bloom_filter = bloom_filter.clone(); let bff_args = bff_args.clone(); - + let total_items = Arc::clone(&total_items); + let removed_items = Arc::clone(&removed_items); let pbar_option: Option>> = if bff_args.no_progress { None } else { @@ -862,7 +881,7 @@ fn bff(inputs: &Vec, output_directory: &PathBuf, bff_args: &BffArgs) -> if bff_args.no_progress { println!("Processing {input:?}..."); } - process_file( + let (removed_doc_items, total_doc_items) = process_file( &input, &output, &bloom_filter, @@ -870,6 +889,13 @@ fn bff(inputs: &Vec, output_directory: &PathBuf, bff_args: &BffArgs) -> &pbar_option, ) .unwrap(); + + let mut total_guard = total_items.lock().unwrap(); + *total_guard += total_doc_items; + + let mut removed_guard = removed_items.lock().unwrap(); + *removed_guard += removed_doc_items; + }); } threadpool.join(); @@ -885,6 +911,11 @@ fn bff(inputs: &Vec, output_directory: &PathBuf, bff_args: &BffArgs) -> println!("...Bloom filter written in {:?} seconds.", write_start_time.elapsed().as_secs()); } println!("Completed full BFF run in {:?} seconds", start_time.elapsed().as_secs()); + + let total_items = *total_items.lock().unwrap(); + let removed_items = *removed_items.lock().unwrap(); + 
println!("Stats: Saw {} items | Removed {} of them", + total_items, removed_items as f64 / total_items as f64); Ok(()) } @@ -915,13 +946,16 @@ async fn bff_remote(bucket: &String, input_dir: &String, output_dir: &String, bf ).unwrap() ); let pbar = Arc::new(Mutex::new(pbar)); + println!("Completed setup phase in {:?} seconds", start_time.elapsed().as_secs()); + if !bff_args.no_progress { pbar.lock().unwrap().inc(0); // initializes pbar } - println!("Completed setup phase in {:?} seconds", start_time.elapsed().as_secs()); let loop_start_time = Instant::now(); + let total_items = Arc::new(Mutex::new(0)); + let removed_items = Arc::new(Mutex::new(0)); let threads = if bff_args.threads == 0 { available_parallelism().unwrap().get() } else { @@ -933,6 +967,8 @@ async fn bff_remote(bucket: &String, input_dir: &String, output_dir: &String, bf let bloom_filter = bloom_filter.clone(); let bff_args = bff_args.clone(); let err_count: Arc> = Arc::clone(&err_count); + let total_items = Arc::clone(&total_items); + let removed_items = Arc::clone(&removed_items); let pbar_option: Option>> = if bff_args.no_progress { None } else { @@ -943,24 +979,32 @@ async fn bff_remote(bucket: &String, input_dir: &String, output_dir: &String, bf threadpool.execute(move || { - let rt = tokio::runtime::Runtime::new().unwrap(); - if let Err(err) = rt.block_on( + let rt = tokio::runtime::Runtime::new().unwrap(); + let result = rt.block_on( process_file_s3(&bucket, &input_path, &output_path, &bloom_filter, &bff_args, &pbar_option) - ) { + ); + match result { + Ok(outputs) => { + let (rem_doc_items, tot_doc_items) = outputs; + let mut total_guard = total_items.lock().unwrap(); + *total_guard += tot_doc_items; + let mut removed_guard = removed_items.lock().unwrap(); + *removed_guard += rem_doc_items; + } + Err(err) => { eprintln!("Error processing {}; {:?}", input_path, err); let mut count = err_count.lock().unwrap(); *count += 1; - } - - }); + } + } + }); - - + } threadpool.join(); println!("Completed 
filtering all files in {:?} seconds", @@ -975,6 +1019,11 @@ async fn bff_remote(bucket: &String, input_dir: &String, output_dir: &String, bf } println!("Error count is {}/{}", err_count.lock().unwrap(), num_files); println!("Completed full BFF run in {:?} seconds", start_time.elapsed().as_secs()); + + let total_items = *total_items.lock().unwrap(); + let removed_items = *removed_items.lock().unwrap(); + println!("Stats: Saw {} items | Removed {} of them", + total_items, removed_items as f64 / total_items as f64); Ok(()) } From 8be64f16ef811f2d07cb02db4edf141d81f548e6 Mon Sep 17 00:00:00 2001 From: Matt Jordan Date: Wed, 20 Mar 2024 17:45:49 -0700 Subject: [PATCH 26/56] Added retry support for bff-remote --- src/main.rs | 118 +++++++++++++++++++++++++++++----------------------- 1 file changed, 67 insertions(+), 51 deletions(-) diff --git a/src/main.rs b/src/main.rs index 2d145cd..d487ef0 100644 --- a/src/main.rs +++ b/src/main.rs @@ -170,7 +170,10 @@ enum Commands { output_dir: String, #[command(flatten)] - bff_args: BffArgs + bff_args: BffArgs, + + #[arg(long, default_value_t=3)] + num_retries: usize, }, Sysreq { @@ -534,7 +537,7 @@ async fn process_file_s3( bff_args: &BffArgs, pbar_option: &Option>>, ) -> Result<(usize, usize), Error> { - // + // Phase 1a: Build s3 client let region_provider = RegionProviderChain::default_provider(); @@ -758,7 +761,7 @@ fn extract_s3_basename(input_path: &str) -> &str { } -async fn gather_s3_io(bucket: &str, prefix: &str, output_dir: &str) -> Result>, Error> { +async fn gather_s3_io(bucket: &str, prefix: &str, output_dir: &str) -> Result, Error> { let region_provider = RegionProviderChain::default_provider(); let config = aws_config::defaults(BehaviorVersion::latest()) .region(region_provider) @@ -773,7 +776,7 @@ async fn gather_s3_io(bucket: &str, prefix: &str, output_dir: &str) -> Result> = Vec::new(); + let mut io_pairs: Vec<(String, String)> = Vec::new(); while let Some(result) = response.next().await { match result { 
Ok(output) => { @@ -784,7 +787,7 @@ async fn gather_s3_io(bucket: &str, prefix: &str, output_dir: &str) -> Result std::io::Result<()> { { bff(inputs, output_directory, &bff_args)?; }, - Commands::BffRemote {bucket, input_dir, output_dir, bff_args} => { - bff_remote(bucket, input_dir, output_dir, &bff_args).await?; + Commands::BffRemote {bucket, input_dir, output_dir, bff_args, num_retries} => { + bff_remote(bucket, input_dir, output_dir, &bff_args, num_retries).await?; } Commands::Sysreq {expected_ngram_count, fp_rate} => { let bff_size = compute_bloom_size(*fp_rate, *expected_ngram_count, false); @@ -920,7 +923,7 @@ fn bff(inputs: &Vec, output_directory: &PathBuf, bff_args: &BffArgs) -> } -async fn bff_remote(bucket: &String, input_dir: &String, output_dir: &String, bff_args: &BffArgs) -> std::io::Result<()> { +async fn bff_remote(bucket: &String, input_dir: &String, output_dir: &String, bff_args: &BffArgs, num_retries: &usize) -> std::io::Result<()> { /* General pseudocode: Setup: @@ -934,7 +937,7 @@ async fn bff_remote(bucket: &String, input_dir: &String, output_dir: &String, bf */ let start_time = Instant::now(); let bloom_filter = Arc::new(BloomFilter::from_args(bff_args)); - let io_pairs = gather_s3_io(bucket, input_dir, output_dir).await.unwrap(); + let mut io_pairs = gather_s3_io(bucket, input_dir, output_dir).await.unwrap(); let num_files = io_pairs.len(); @@ -962,51 +965,64 @@ async fn bff_remote(bucket: &String, input_dir: &String, output_dir: &String, bf bff_args.threads }; let threadpool = ThreadPool::new(threads); - for io_pair in &io_pairs { - let bucket = bucket.clone(); - let bloom_filter = bloom_filter.clone(); - let bff_args = bff_args.clone(); - let err_count: Arc> = Arc::clone(&err_count); - let total_items = Arc::clone(&total_items); - let removed_items = Arc::clone(&removed_items); - let pbar_option: Option>> = if bff_args.no_progress { - None - } else { - Some(pbar.clone()) - }; - let input_path = io_pair[0].clone(); - let output_path = 
io_pair[1].clone(); - - threadpool.execute(move || { - let rt = tokio::runtime::Runtime::new().unwrap(); - let result = rt.block_on( - process_file_s3(&bucket, - &input_path, - &output_path, - &bloom_filter, - &bff_args, - &pbar_option) - ); - match result { - Ok(outputs) => { - let (rem_doc_items, tot_doc_items) = outputs; - let mut total_guard = total_items.lock().unwrap(); - *total_guard += tot_doc_items; - let mut removed_guard = removed_items.lock().unwrap(); - *removed_guard += rem_doc_items; - } - Err(err) => { - eprintln!("Error processing {}; {:?}", input_path, err); - let mut count = err_count.lock().unwrap(); - *count += 1; - } - } - }); + for retry_count in 0..*num_retries { + let failed_io_pairs: Arc>> = Arc::new(Mutex::new(Vec::new())); + for io_pair in &io_pairs { + let num_retries = (*num_retries).clone(); + let retry_count = retry_count.clone(); + let bucket = bucket.clone(); + let bloom_filter = bloom_filter.clone(); + let bff_args = bff_args.clone(); + let failed_io_pairs = Arc::clone(&failed_io_pairs); + let err_count: Arc> = Arc::clone(&err_count); + let total_items = Arc::clone(&total_items); + let removed_items = Arc::clone(&removed_items); + let pbar_option: Option>> = if bff_args.no_progress { + None + } else { + Some(pbar.clone()) + }; + + let (input_path, output_path) = io_pair.clone(); + threadpool.execute(move || { + let rt = tokio::runtime::Runtime::new().unwrap(); + let result = rt.block_on( + process_file_s3(&bucket, + &input_path, + &output_path, + &bloom_filter, + &bff_args, + &pbar_option) + ); + match result { + Ok(outputs) => { + let (rem_doc_items, tot_doc_items) = outputs; + let mut total_guard = total_items.lock().unwrap(); + *total_guard += tot_doc_items; + let mut removed_guard = removed_items.lock().unwrap(); + *removed_guard += rem_doc_items; + } + Err(err) => { + eprintln!("Round {}/{}: Error processing {}; {:?}", retry_count+1, num_retries, input_path, err); + if retry_count < num_retries - 1 { + // in all but last 
round, push the failed pair to failed_io_pairs + let mut fail_guard = failed_io_pairs.lock().unwrap(); + fail_guard.push((input_path, output_path)); + } else { + // in last round, give up and mark this one as an error + let mut count = err_count.lock().unwrap(); + *count += 1; + } + + } + } + }); - + } + threadpool.join(); + io_pairs = failed_io_pairs.lock().unwrap().clone(); } - threadpool.join(); println!("Completed filtering all files in {:?} seconds", loop_start_time.elapsed().as_secs()); From bd7add8e4b50cc1e8e719a6981458d16cdbe84a3 Mon Sep 17 00:00:00 2001 From: Matt Jordan Date: Thu, 21 Mar 2024 12:21:50 -0700 Subject: [PATCH 27/56] Added better RT thread control? --- src/main.rs | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/main.rs b/src/main.rs index d487ef0..0bb8d62 100644 --- a/src/main.rs +++ b/src/main.rs @@ -537,8 +537,6 @@ async fn process_file_s3( bff_args: &BffArgs, pbar_option: &Option>>, ) -> Result<(usize, usize), Error> { - - // Phase 1a: Build s3 client let region_provider = RegionProviderChain::default_provider(); let config = aws_config::defaults(BehaviorVersion::latest()) @@ -986,7 +984,10 @@ async fn bff_remote(bucket: &String, input_dir: &String, output_dir: &String, bf let (input_path, output_path) = io_pair.clone(); threadpool.execute(move || { - let rt = tokio::runtime::Runtime::new().unwrap(); + let rt = tokio::runtime::Builder::new_current_thread() + .enable_all() + .build() + .unwrap(); let result = rt.block_on( process_file_s3(&bucket, &input_path, From 029ed652418492b070942fc9c1a0c985ed98eb17 Mon Sep 17 00:00:00 2001 From: Achal Dave Date: Thu, 21 Mar 2024 18:21:49 -0700 Subject: [PATCH 28/56] Retry within process_file_s3 with exp backoff --- src/main.rs | 134 +++++++++++++++++++++++++--------------------------- 1 file changed, 65 insertions(+), 69 deletions(-) diff --git a/src/main.rs b/src/main.rs index 6837ff1..3dd2822 100644 --- a/src/main.rs +++ b/src/main.rs @@ -38,6 +38,7 @@ use 
aws_sdk_s3::operation::get_object::GetObjectOutput; use tokio::io::{AsyncBufReadExt}; use tokio::io::BufReader as tBufReader; +use tokio::time::{Duration, sleep}; use async_compression::tokio::bufread::GzipDecoder as asyncGZ; @@ -531,7 +532,24 @@ fn process_file( } +async fn get_object_with_retry(client: &Client, bucket: &str, key: &str, num_retries: usize) -> Result { + let mut attempts = 0; + let base_delay = Duration::from_millis(100); // Starting delay of 100 ms + loop { + match client.get_object().bucket(bucket).key(key).send().await { + Ok(response) => return Ok(response), + Err(e) if attempts < num_retries => { + // Calculate delay for exponential backoff + let delay = base_delay * 2u32.pow(attempts as u32); + println!("Error reading from S3, retrying in {:?}... (Attempt {}/{})", delay, attempts + 1, num_retries); + sleep(delay).await; + attempts += 1; + } + Err(e) => return Err(e.into()), + } + } +} async fn process_file_s3( s3_bucket: &String, @@ -540,7 +558,9 @@ async fn process_file_s3( bloom_filter: &Arc, bff_args: &BffArgs, pbar_option: &Option>>, + num_retries: usize, ) -> Result<(usize, usize), Error> { + println!("Processing file s3"); // Phase 1a: Build s3 client let region_provider = RegionProviderChain::default_provider(); let config = aws_config::defaults(BehaviorVersion::latest()) @@ -549,17 +569,7 @@ async fn process_file_s3( .await; let client = Client::new(&config); - - // Phase 1b: read data into lines - // Note: this reads in a streaming sense (but we don't upload in streaming) - let object: GetObjectOutput = client - .get_object() - .bucket(s3_bucket) - .key(s3_input) - .send() - .await?; - - + let object = get_object_with_retry(&client, s3_bucket, s3_input, num_retries).await?; let body_stream = object.body.into_async_read(); let gz = asyncGZ::new(body_stream); let reader = tBufReader::with_capacity(1024 * 1024, gz); @@ -974,67 +984,53 @@ async fn bff_remote(bucket: &String, input_dir: &String, output_dir: &String, su bff_args.threads }; 
let threadpool = ThreadPool::new(threads); + for io_pair in &io_pairs { + let bucket = bucket.clone(); + let bloom_filter = bloom_filter.clone(); + let bff_args = bff_args.clone(); + let err_count: Arc> = Arc::clone(&err_count); + let total_items = Arc::clone(&total_items); + let removed_items = Arc::clone(&removed_items); + let pbar_option: Option>> = if bff_args.no_progress { + None + } else { + Some(pbar.clone()) + }; + let num_retries = num_retries.clone(); - for retry_count in 0..*num_retries { - let failed_io_pairs: Arc>> = Arc::new(Mutex::new(Vec::new())); - for io_pair in &io_pairs { - let num_retries = (*num_retries).clone(); - let retry_count = retry_count.clone(); - let bucket = bucket.clone(); - let bloom_filter = bloom_filter.clone(); - let bff_args = bff_args.clone(); - let failed_io_pairs = Arc::clone(&failed_io_pairs); - let err_count: Arc> = Arc::clone(&err_count); - let total_items = Arc::clone(&total_items); - let removed_items = Arc::clone(&removed_items); - let pbar_option: Option>> = if bff_args.no_progress { - None - } else { - Some(pbar.clone()) - }; - - let (input_path, output_path) = io_pair.clone(); - threadpool.execute(move || { - let rt = tokio::runtime::Builder::new_current_thread() - .enable_all() - .build() - .unwrap(); - let result = rt.block_on( - process_file_s3(&bucket, - &input_path, - &output_path, - &bloom_filter, - &bff_args, - &pbar_option) - ); - match result { - Ok(outputs) => { - let (rem_doc_items, tot_doc_items) = outputs; - let mut total_guard = total_items.lock().unwrap(); - *total_guard += tot_doc_items; - let mut removed_guard = removed_items.lock().unwrap(); - *removed_guard += rem_doc_items; - } - Err(err) => { - eprintln!("Round {}/{}: Error processing {}; {:?}", retry_count+1, num_retries, input_path, err); - if retry_count < num_retries - 1 { - // in all but last round, push the failed pair to failed_io_pairs - let mut fail_guard = failed_io_pairs.lock().unwrap(); - fail_guard.push((input_path, output_path)); - 
} else { - // in last round, give up and mark this one as an error - let mut count = err_count.lock().unwrap(); - *count += 1; - } - - } - } - }); + let (input_path, output_path) = io_pair.clone(); + threadpool.execute(move || { + let rt = tokio::runtime::Builder::new_current_thread() + .enable_all() + .build() + .unwrap(); + let result = rt.block_on( + process_file_s3(&bucket, + &input_path, + &output_path, + &bloom_filter, + &bff_args, + &pbar_option, + num_retries) + ); + match result { + Ok(outputs) => { + let (rem_doc_items, tot_doc_items) = outputs; + let mut total_guard = total_items.lock().unwrap(); + *total_guard += tot_doc_items; + let mut removed_guard = removed_items.lock().unwrap(); + *removed_guard += rem_doc_items; + } + Err(err) => { + eprintln!("Error processing {}; {:?}", input_path, err); + let mut count = err_count.lock().unwrap(); + *count += 1; + } + } + }); - } - threadpool.join(); - io_pairs = failed_io_pairs.lock().unwrap().clone(); } + threadpool.join(); println!("Completed filtering all files in {:?} seconds", loop_start_time.elapsed().as_secs()); From 5ba59f6b30b88d777a105018fcc0f0e38a6aab25 Mon Sep 17 00:00:00 2001 From: Achal Dave Date: Thu, 21 Mar 2024 18:24:30 -0700 Subject: [PATCH 29/56] Better prints --- src/main.rs | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/main.rs b/src/main.rs index 3dd2822..5509d02 100644 --- a/src/main.rs +++ b/src/main.rs @@ -542,11 +542,13 @@ async fn get_object_with_retry(client: &Client, bucket: &str, key: &str, num_ret Err(e) if attempts < num_retries => { // Calculate delay for exponential backoff let delay = base_delay * 2u32.pow(attempts as u32); - println!("Error reading from S3, retrying in {:?}... (Attempt {}/{})", delay, attempts + 1, num_retries); sleep(delay).await; attempts += 1; } - Err(e) => return Err(e.into()), + Err(e) => { + println!("Too many errors reading: {}. 
Giving up.", key); + return Err(e.into()), + } } } } @@ -560,7 +562,6 @@ async fn process_file_s3( pbar_option: &Option>>, num_retries: usize, ) -> Result<(usize, usize), Error> { - println!("Processing file s3"); // Phase 1a: Build s3 client let region_provider = RegionProviderChain::default_provider(); let config = aws_config::defaults(BehaviorVersion::latest()) From 92c395703de278f84d36278aa8928832fa73ade9 Mon Sep 17 00:00:00 2001 From: Achal Dave Date: Thu, 21 Mar 2024 18:24:58 -0700 Subject: [PATCH 30/56] semicolon --- src/main.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main.rs b/src/main.rs index 5509d02..c184da3 100644 --- a/src/main.rs +++ b/src/main.rs @@ -547,7 +547,7 @@ async fn get_object_with_retry(client: &Client, bucket: &str, key: &str, num_ret } Err(e) => { println!("Too many errors reading: {}. Giving up.", key); - return Err(e.into()), + return Err(e.into()); } } } From ce53e1c27438b1652b2b3896ad90926562607ea6 Mon Sep 17 00:00:00 2001 From: Achal Dave Date: Thu, 21 Mar 2024 18:35:30 -0700 Subject: [PATCH 31/56] Random delay --- src/main.rs | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/src/main.rs b/src/main.rs index c184da3..e7c5d8e 100644 --- a/src/main.rs +++ b/src/main.rs @@ -8,7 +8,7 @@ use flate2::Compression; use glob::glob; use human_bytes::human_bytes; use indicatif::{ProgressBar,ProgressStyle}; -use rand::Rng; +use rand::{thread_rng, Rng}; use serde_json::Value; use std::clone::Clone; use std::collections::VecDeque; @@ -238,7 +238,7 @@ impl BloomFilter { } fn new(size_in_bytes: usize, num_hashers: usize) -> Self { - let mut rng = rand::thread_rng(); + let mut rng = thread_rng(); let mut hash_builder_seeds = Vec::with_capacity(num_hashers); let mut hash_builders = Vec::with_capacity(num_hashers); for _ in 0..num_hashers { @@ -535,13 +535,16 @@ fn process_file( async fn get_object_with_retry(client: &Client, bucket: &str, key: &str, num_retries: usize) -> Result { let mut 
attempts = 0; let base_delay = Duration::from_millis(100); // Starting delay of 100 ms + let mut rng = thread_rng(); loop { match client.get_object().bucket(bucket).key(key).send().await { Ok(response) => return Ok(response), Err(e) if attempts < num_retries => { - // Calculate delay for exponential backoff - let delay = base_delay * 2u32.pow(attempts as u32); + // Calculate delay for exponential backoff, add some randomness so multiple threads don't access at the + // same time. + let random_delay = rng.gen_range(Duration::from_millis(0)..Duration::from_millis(200)); + let delay = base_delay * 2u32.pow(attempts as u32) + random_delay; sleep(delay).await; attempts += 1; } From 71bf7b26add6d4358b3e70eb7ae3925061d8bbfc Mon Sep 17 00:00:00 2001 From: Achal Dave Date: Thu, 21 Mar 2024 18:38:17 -0700 Subject: [PATCH 32/56] fixes --- src/main.rs | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/src/main.rs b/src/main.rs index e7c5d8e..d18b220 100644 --- a/src/main.rs +++ b/src/main.rs @@ -8,7 +8,7 @@ use flate2::Compression; use glob::glob; use human_bytes::human_bytes; use indicatif::{ProgressBar,ProgressStyle}; -use rand::{thread_rng, Rng}; +use rand::Rng; use serde_json::Value; use std::clone::Clone; use std::collections::VecDeque; @@ -238,7 +238,7 @@ impl BloomFilter { } fn new(size_in_bytes: usize, num_hashers: usize) -> Self { - let mut rng = thread_rng(); + let mut rng = rand::thread_rng(); let mut hash_builder_seeds = Vec::with_capacity(num_hashers); let mut hash_builders = Vec::with_capacity(num_hashers); for _ in 0..num_hashers { @@ -534,8 +534,10 @@ fn process_file( async fn get_object_with_retry(client: &Client, bucket: &str, key: &str, num_retries: usize) -> Result { let mut attempts = 0; - let base_delay = Duration::from_millis(100); // Starting delay of 100 ms - let mut rng = thread_rng(); + let base_delay = Duration::from_millis(100); + let max_delay = Duration::from_millis(2000); + + let mut rng = rand::thread_rng(); 
loop { match client.get_object().bucket(bucket).key(key).send().await { @@ -544,8 +546,11 @@ async fn get_object_with_retry(client: &Client, bucket: &str, key: &str, num_ret // Calculate delay for exponential backoff, add some randomness so multiple threads don't access at the // same time. let random_delay = rng.gen_range(Duration::from_millis(0)..Duration::from_millis(200)); - let delay = base_delay * 2u32.pow(attempts as u32) + random_delay; - sleep(delay).await; + let mut exponential_delay = base_delay * 2u32.pow(attempts as u32); + if exponential_delay > max_delay { + exponential_delay = max_delay; + } + sleep(exponential_delay + random_delay).await; attempts += 1; } Err(e) => { From e4fce7e726029057a134696c58d3d8de588e7698 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Fri, 22 Mar 2024 02:12:40 +0000 Subject: [PATCH 33/56] Add random delay before spawning threads --- src/main.rs | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/main.rs b/src/main.rs index d18b220..7a9bab5 100644 --- a/src/main.rs +++ b/src/main.rs @@ -545,7 +545,8 @@ async fn get_object_with_retry(client: &Client, bucket: &str, key: &str, num_ret Err(e) if attempts < num_retries => { // Calculate delay for exponential backoff, add some randomness so multiple threads don't access at the // same time. 
- let random_delay = rng.gen_range(Duration::from_millis(0)..Duration::from_millis(200)); + println!("Error {}/{}: {}", e, attempts, num_retries); + let random_delay = rng.gen_range(Duration::from_millis(0)..Duration::from_millis(1000)); let mut exponential_delay = base_delay * 2u32.pow(attempts as u32); if exponential_delay > max_delay { exponential_delay = max_delay; @@ -993,6 +994,7 @@ async fn bff_remote(bucket: &String, input_dir: &String, output_dir: &String, su bff_args.threads }; let threadpool = ThreadPool::new(threads); + let mut rng = rand::thread_rng(); for io_pair in &io_pairs { let bucket = bucket.clone(); let bloom_filter = bloom_filter.clone(); @@ -1037,7 +1039,9 @@ async fn bff_remote(bucket: &String, input_dir: &String, output_dir: &String, su } } }); - + // Wait a little before spawning the next processor. + let random_delay = rng.gen_range(Duration::from_millis(0)..Duration::from_millis(100)); + sleep(random_delay).await; } threadpool.join(); println!("Completed filtering all files in {:?} seconds", From 9b8b45c46237098bd4ead9707a6f2978ca3ab18e Mon Sep 17 00:00:00 2001 From: Matt Jordan Date: Mon, 25 Mar 2024 09:45:29 -0700 Subject: [PATCH 34/56] Freezing status for dedup ablations v3 --- Cargo.toml | 1 + src/main.rs | 30 +++++++++++++++++++++++++----- 2 files changed, 26 insertions(+), 5 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index b53831d..a0d618b 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -3,6 +3,7 @@ name = "bff" version = "0.1.0" edition = "2021" + # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html [dependencies] diff --git a/src/main.rs b/src/main.rs index 6837ff1..498cc78 100644 --- a/src/main.rs +++ b/src/main.rs @@ -34,15 +34,11 @@ use aws_config::BehaviorVersion; use aws_sdk_s3::{Client}; use aws_sdk_s3::primitives::ByteStream; use aws_sdk_s3::operation::get_object::GetObjectOutput; - - use tokio::io::{AsyncBufReadExt}; use tokio::io::BufReader as tBufReader; use 
async_compression::tokio::bufread::GzipDecoder as asyncGZ; - - /*======================================================= = Argument Struct = =======================================================*/ @@ -111,6 +107,9 @@ struct BffArgs{ #[arg(long, default_value_t = false)] annotate_attribute_only: bool, + #[arg(long, default_value_t = RemoveType::Paragraph, value_enum)] + remove_type: RemoveType, + #[arg(long, default_value_t = false)] whole_document: bool, @@ -190,6 +189,16 @@ enum Commands { } +#[derive(Debug, Clone, Eq, PartialEq, clap::ValueEnum)] +enum RemoveType { + // Types for what we check to see if is a duplicate + Paragraph, // Paragraph level only + Document, // Whole document only + Both, // Does paragraph first, but if enough of the ngrams are contained in the bff, removes the whole document + // NOTE: ^ will add some ngram data (OF TO-REMOVE ngrams) into the filter [other methods don't do this] +} + + /*=================================================== = Bloom Filter stuff = @@ -380,6 +389,7 @@ impl BloomFilter { true } + #[allow(dead_code)] // use in unit test fn contains(&self, s: &VecDeque<&str>) -> bool { let hashes = self.hashes(s); @@ -626,7 +636,8 @@ fn process_line(line: &String, bloom_filter: &BloomFilter, bff_args: &BffArgs) - let text = data["text"].as_str().unwrap(); - let newlines = if bff_args.whole_document { + + let newlines = if bff_args.remove_type == RemoveType::Document { vec![0, text.len()] } else { let mut newlines = Vec::new(); @@ -638,6 +649,8 @@ fn process_line(line: &String, bloom_filter: &BloomFilter, bff_args: &BffArgs) - newlines }; let mut windows_to_remove = Vec::new(); + + let mut total_ngrams = 0; let mut total_contained_ngrams = 0; for paragraph_window in newlines.windows(2) { @@ -655,6 +668,7 @@ fn process_line(line: &String, bloom_filter: &BloomFilter, bff_args: &BffArgs) - ngram.pop_front(); } } + // If the paragraph was too short, put in a shorter ngram, so we can dedupe short // paragraphs exactly. 
if hashes.is_empty() && ngram.len() >= bff_args.min_ngram_size { @@ -665,6 +679,7 @@ fn process_line(line: &String, bloom_filter: &BloomFilter, bff_args: &BffArgs) - .iter() .filter(|ngram| bloom_filter.contains_hashes(ngram)) .count(); + total_ngrams += hashes.len(); total_contained_ngrams += contained_ngrams; // calculate how many ngrams are in the bloom filter @@ -696,6 +711,11 @@ fn process_line(line: &String, bloom_filter: &BloomFilter, bff_args: &BffArgs) - last_end = paragraph_window[1]; } output_paragraphs.push_str(&text[last_end..]); + if bff_args.remove_type == RemoveType::Both && + (total_contained_ngrams as f64) / (total_ngrams as f64) > bff_args.filtering_threshold + { + output_paragraphs = String::new(); // If we found enough duplicates to remove whole document too + } data["text"] = Value::String(output_paragraphs); data["bff_contained_ngram_count_before_dedupe"] = serde_json::to_value(total_contained_ngrams).unwrap(); From fc9cfca298d8b1a90d3fdb2fac52e0401d1ad130 Mon Sep 17 00:00:00 2001 From: Matt Jordan Date: Mon, 25 Mar 2024 09:45:29 -0700 Subject: [PATCH 35/56] Freezing status for dedup ablations v3 --- Cargo.toml | 1 + src/main.rs | 30 +++++++++++++++++++++++++----- 2 files changed, 26 insertions(+), 5 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index b53831d..a0d618b 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -3,6 +3,7 @@ name = "bff" version = "0.1.0" edition = "2021" + # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html [dependencies] diff --git a/src/main.rs b/src/main.rs index 7a9bab5..20bc20a 100644 --- a/src/main.rs +++ b/src/main.rs @@ -34,16 +34,12 @@ use aws_config::BehaviorVersion; use aws_sdk_s3::{Client}; use aws_sdk_s3::primitives::ByteStream; use aws_sdk_s3::operation::get_object::GetObjectOutput; - - use tokio::io::{AsyncBufReadExt}; use tokio::io::BufReader as tBufReader; use tokio::time::{Duration, sleep}; use async_compression::tokio::bufread::GzipDecoder as asyncGZ; - - 
/*======================================================= = Argument Struct = =======================================================*/ @@ -112,6 +108,9 @@ struct BffArgs{ #[arg(long, default_value_t = false)] annotate_attribute_only: bool, + #[arg(long, default_value_t = RemoveType::Paragraph, value_enum)] + remove_type: RemoveType, + #[arg(long, default_value_t = false)] whole_document: bool, @@ -191,6 +190,16 @@ enum Commands { } +#[derive(Debug, Clone, Eq, PartialEq, clap::ValueEnum)] +enum RemoveType { + // Types for what we check to see if is a duplicate + Paragraph, // Paragraph level only + Document, // Whole document only + Both, // Does paragraph first, but if enough of the ngrams are contained in the bff, removes the whole document + // NOTE: ^ will add some ngram data (OF TO-REMOVE ngrams) into the filter [other methods don't do this] +} + + /*=================================================== = Bloom Filter stuff = @@ -381,6 +390,7 @@ impl BloomFilter { true } + #[allow(dead_code)] // use in unit test fn contains(&self, s: &VecDeque<&str>) -> bool { let hashes = self.hashes(s); @@ -646,7 +656,8 @@ fn process_line(line: &String, bloom_filter: &BloomFilter, bff_args: &BffArgs) - let text = data["text"].as_str().unwrap(); - let newlines = if bff_args.whole_document { + + let newlines = if bff_args.remove_type == RemoveType::Document { vec![0, text.len()] } else { let mut newlines = Vec::new(); @@ -658,6 +669,8 @@ fn process_line(line: &String, bloom_filter: &BloomFilter, bff_args: &BffArgs) - newlines }; let mut windows_to_remove = Vec::new(); + + let mut total_ngrams = 0; let mut total_contained_ngrams = 0; for paragraph_window in newlines.windows(2) { @@ -675,6 +688,7 @@ fn process_line(line: &String, bloom_filter: &BloomFilter, bff_args: &BffArgs) - ngram.pop_front(); } } + // If the paragraph was too short, put in a shorter ngram, so we can dedupe short // paragraphs exactly. 
if hashes.is_empty() && ngram.len() >= bff_args.min_ngram_size { @@ -685,6 +699,7 @@ fn process_line(line: &String, bloom_filter: &BloomFilter, bff_args: &BffArgs) - .iter() .filter(|ngram| bloom_filter.contains_hashes(ngram)) .count(); + total_ngrams += hashes.len(); total_contained_ngrams += contained_ngrams; // calculate how many ngrams are in the bloom filter @@ -716,6 +731,11 @@ fn process_line(line: &String, bloom_filter: &BloomFilter, bff_args: &BffArgs) - last_end = paragraph_window[1]; } output_paragraphs.push_str(&text[last_end..]); + if bff_args.remove_type == RemoveType::Both && + (total_contained_ngrams as f64) / (total_ngrams as f64) > bff_args.filtering_threshold + { + output_paragraphs = String::new(); // If we found enough duplicates to remove whole document too + } data["text"] = Value::String(output_paragraphs); data["bff_contained_ngram_count_before_dedupe"] = serde_json::to_value(total_contained_ngrams).unwrap(); From 68853d2ba8a47d20db5466538f2ec9ce5b1c08df Mon Sep 17 00:00:00 2001 From: Matt Jordan Date: Wed, 27 Mar 2024 11:08:08 -0700 Subject: [PATCH 36/56] Added i) parallel init for bits vector; ii) better output directory formatting, iii) better whitespace/empty document filtration --- Cargo.toml | 1 + src/main.rs | 29 ++++++++++++++++++++++------- 2 files changed, 23 insertions(+), 7 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index a0d618b..22b32f5 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -24,3 +24,4 @@ tokio = { version = "1.36.0", features = ["full"] } aws-config = "1.1.8" aws-sdk-s3 = "1.19.1" async-compression = {version ="0.4.6", features=["all"]} +rayon = "1.10.0" \ No newline at end of file diff --git a/src/main.rs b/src/main.rs index 498cc78..cafc8d3 100644 --- a/src/main.rs +++ b/src/main.rs @@ -12,13 +12,13 @@ use rand::Rng; use serde_json::Value; use std::clone::Clone; use std::collections::VecDeque; -use std::fs::{OpenOptions, remove_file}; +use std::fs::{OpenOptions, remove_file, create_dir_all}; use 
std::hash::{BuildHasher, Hash, Hasher}; use std::io; use std::io::{Cursor}; use std::io::{BufRead, BufReader, BufWriter, Write}; use std::mem::size_of; -use std::path::{PathBuf}; +use std::path::{PathBuf, Path}; use std::string::String; use std::sync::atomic::{AtomicU32, Ordering}; use std::sync::{Arc, Mutex}; @@ -37,6 +37,7 @@ use aws_sdk_s3::operation::get_object::GetObjectOutput; use tokio::io::{AsyncBufReadExt}; use tokio::io::BufReader as tBufReader; use async_compression::tokio::bufread::GzipDecoder as asyncGZ; +use rayon::prelude::*; /*======================================================= @@ -259,7 +260,7 @@ impl BloomFilter { let number_of_u32 = size_in_bytes / size_of::(); let bits = { - (0..number_of_u32).map(|_| AtomicU32::default()).collect() + (0..number_of_u32).into_par_iter().map(|_| AtomicU32::default()).collect() }; @@ -519,7 +520,7 @@ fn process_file( removed_items += removed_line_items; total_items += total_line_items; - if dedup_data.get("text").unwrap().as_str().unwrap().is_empty() { + if dedup_data.get("text").unwrap().as_str().unwrap().trim().is_empty() { fully_skipped += 1; } else { @@ -776,6 +777,18 @@ fn expand_dirs(paths: &[PathBuf]) -> Result> { Ok(files) } +fn create_dir_if_not_exists(path: &PathBuf) -> Result<(), std::io::Error> { + match create_dir_all(path) { + Ok(_) => Ok(()), + Err(err) => { + if err.kind() == std::io::ErrorKind::AlreadyExists { + Ok(()) + } else { + Err(err) + } + } + } +} fn extract_s3_basename(input_path: &str) -> &str { let parts: Vec<&str> = input_path.split('/').collect(); @@ -813,7 +826,7 @@ async fn gather_s3_io(bucket: &str, prefix: &str, output_dir: &str, subset: &Opt continue; } let basename = extract_s3_basename(&input_key); - let output_key = format!("{}{}", output_dir, basename).to_string(); + let output_key = Path::new(output_dir).join(basename).as_os_str().to_str().unwrap().to_string(); let io_pair: (String, String) = (String::from(input_key), String::from(&output_key)); io_pairs.push(io_pair); } 
@@ -869,6 +882,7 @@ fn bff(inputs: &Vec, output_directory: &PathBuf, bff_args: &BffArgs) -> */ // SETUP PHASE let start_time = Instant::now(); + create_dir_if_not_exists(output_directory).unwrap(); let bloom_filter = Arc::new(BloomFilter::from_args(bff_args)); let all_inputs = expand_dirs(inputs).unwrap(); let pbar = ProgressBar::new(all_inputs.len() as u64) @@ -896,8 +910,9 @@ fn bff(inputs: &Vec, output_directory: &PathBuf, bff_args: &BffArgs) -> let removed_items = Arc::new(Mutex::new(0)); let threadpool = ThreadPool::new(threads); for input in all_inputs { - let mut output = output_directory.clone(); - output.push(input.file_name().unwrap()); + //let mut output = output_directory.clone(); + let output = output_directory.clone().join(input.file_name().unwrap()); + //output.push(input.file_name().unwrap()); let bloom_filter = bloom_filter.clone(); let bff_args = bff_args.clone(); let total_items = Arc::clone(&total_items); From 5bf7f56f570751c7dcf25201dcee5adcaa426415 Mon Sep 17 00:00:00 2001 From: Matt Jordan Date: Thu, 28 Mar 2024 10:23:21 -0700 Subject: [PATCH 37/56] Shuffling s3 shards --- src/main.rs | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/main.rs b/src/main.rs index cafc8d3..457f07b 100644 --- a/src/main.rs +++ b/src/main.rs @@ -8,7 +8,8 @@ use flate2::Compression; use glob::glob; use human_bytes::human_bytes; use indicatif::{ProgressBar,ProgressStyle}; -use rand::Rng; +use rand::{Rng,thread_rng}; +use rand::seq::SliceRandom; use serde_json::Value; use std::clone::Clone; use std::collections::VecDeque; @@ -836,6 +837,9 @@ async fn gather_s3_io(bucket: &str, prefix: &str, output_dir: &str, subset: &Opt } } } + let mut rng = thread_rng(); + io_pairs.shuffle(&mut rng); + Ok(io_pairs) } From 1a776a5bddeef8ab8faeeada7dd138eaa792fe40 Mon Sep 17 00:00:00 2001 From: Achal Dave Date: Thu, 28 Mar 2024 10:44:30 -0700 Subject: [PATCH 38/56] Add global retries --- src/main.rs | 122 +++++++++++++++++++++++++++++++--------------------- 1 
file changed, 73 insertions(+), 49 deletions(-) diff --git a/src/main.rs b/src/main.rs index 4d17f4c..ade801e 100644 --- a/src/main.rs +++ b/src/main.rs @@ -178,8 +178,17 @@ enum Commands { #[command(flatten)] bff_args: BffArgs, + // Local number of retries; we try to load each file from s3 this many times. #[arg(long, default_value_t=3)] num_retries: usize, + + // Global number of retries; we do a full loop through all remaining files this many times. + // i.e., + // remaining = all_paths + // for i in num_retries: + // remaining = process_data(remaining) + #[arg(long, default_value_t=3)] + num_global_retries: usize, }, Sysreq { @@ -991,7 +1000,7 @@ fn bff(inputs: &Vec, output_directory: &PathBuf, bff_args: &BffArgs) -> -async fn bff_remote(bucket: &String, input_dir: &String, output_dir: &String, subset: &Option, bff_args: &BffArgs, num_retries: &usize) -> std::io::Result<()> { +async fn bff_remote(bucket: &String, input_dir: &String, output_dir: &String, subset: &Option, bff_args: &BffArgs, num_retries: &usize, num_global_retries: &usize) -> std::io::Result<()> { /* General pseudocode: Setup: @@ -1033,56 +1042,71 @@ async fn bff_remote(bucket: &String, input_dir: &String, output_dir: &String, su bff_args.threads }; let threadpool = ThreadPool::new(threads); - let mut rng = rand::thread_rng(); - for io_pair in &io_pairs { - let bucket = bucket.clone(); - let bloom_filter = bloom_filter.clone(); - let bff_args = bff_args.clone(); - let err_count: Arc> = Arc::clone(&err_count); - let total_items = Arc::clone(&total_items); - let removed_items = Arc::clone(&removed_items); - let pbar_option: Option>> = if bff_args.no_progress { - None - } else { - Some(pbar.clone()) - }; - let num_retries = num_retries.clone(); - let (input_path, output_path) = io_pair.clone(); - threadpool.execute(move || { - let rt = tokio::runtime::Builder::new_current_thread() - .enable_all() - .build() - .unwrap(); - let result = rt.block_on( - process_file_s3(&bucket, - &input_path, - 
&output_path, - &bloom_filter, - &bff_args, - &pbar_option, - num_retries) - ); - match result { - Ok(outputs) => { - let (rem_doc_items, tot_doc_items) = outputs; - let mut total_guard = total_items.lock().unwrap(); - *total_guard += tot_doc_items; - let mut removed_guard = removed_items.lock().unwrap(); - *removed_guard += rem_doc_items; - } - Err(err) => { - eprintln!("Error processing {}; {:?}", input_path, err); - let mut count = err_count.lock().unwrap(); - *count += 1; - } - } - }); - // Wait a little before spawning the next processor. - let random_delay = rng.gen_range(Duration::from_millis(0)..Duration::from_millis(100)); - sleep(random_delay).await; + for retry_count in 0..*num_global_retries { + let failed_io_pairs: Arc>> = Arc::new(Mutex::new(Vec::new())); + let mut rng = rand::thread_rng(); + for io_pair in &io_pairs { + let num_global_retries = (*num_global_retries).clone(); + let retry_count = retry_count.clone(); + let bucket = bucket.clone(); + let bloom_filter = bloom_filter.clone(); + let bff_args = bff_args.clone(); + let failed_io_pairs = Arc::clone(&failed_io_pairs); + let err_count: Arc> = Arc::clone(&err_count); + let total_items = Arc::clone(&total_items); + let removed_items = Arc::clone(&removed_items); + let pbar_option: Option>> = if bff_args.no_progress { + None + } else { + Some(pbar.clone()) + }; + + let (input_path, output_path) = io_pair.clone(); + threadpool.execute(move || { + let rt = tokio::runtime::Builder::new_current_thread() + .enable_all() + .build() + .unwrap(); + let result = rt.block_on( + process_file_s3(&bucket, + &input_path, + &output_path, + &bloom_filter, + &bff_args, + &pbar_option, + num_retries) + ); + match result { + Ok(outputs) => { + let (rem_doc_items, tot_doc_items) = outputs; + let mut total_guard = total_items.lock().unwrap(); + *total_guard += tot_doc_items; + let mut removed_guard = removed_items.lock().unwrap(); + *removed_guard += rem_doc_items; + } + Err(err) => { + eprintln!("Round {}/{}: Error 
processing {}; {:?}", retry_count+1, num_global_retries, input_path, err); + if retry_count < num_global_retries - 1 { + // in all but last round, push the failed pair to failed_io_pairs + let mut fail_guard = failed_io_pairs.lock().unwrap(); + fail_guard.push((input_path, output_path)); + } else { + // in last round, give up and mark this one as an error + let mut count = err_count.lock().unwrap(); + *count += 1; + } + + } + } + }); + // Wait a little before spawning the next processor. + let random_delay = rng.gen_range(Duration::from_millis(0)..Duration::from_millis(100)); + sleep(random_delay).await; + } + threadpool.join(); + io_pairs = failed_io_pairs.lock().unwrap().clone(); } - threadpool.join(); println!("Completed filtering all files in {:?} seconds", loop_start_time.elapsed().as_secs()); From 0c558b0559b7cf161a9f4c43376f52e53ae11b08 Mon Sep 17 00:00:00 2001 From: Achal Dave Date: Thu, 28 Mar 2024 11:19:34 -0700 Subject: [PATCH 39/56] bugfix --- src/main.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/main.rs b/src/main.rs index ade801e..66de7ca 100644 --- a/src/main.rs +++ b/src/main.rs @@ -886,8 +886,8 @@ async fn main() -> std::io::Result<()> { bff(inputs, output_directory, &bff_args)?; }, - Commands::BffRemote {bucket, input_dir, output_dir, subset, bff_args, num_retries} => { - bff_remote(bucket, input_dir, output_dir, subset, &bff_args, num_retries).await?; + Commands::BffRemote {bucket, input_dir, output_dir, subset, bff_args, num_retries, num_global_retries} => { + bff_remote(bucket, input_dir, output_dir, subset, &bff_args, num_retries, num_global_retries).await?; } Commands::Sysreq {expected_ngram_count, fp_rate} => { let bff_size = compute_bloom_size(*fp_rate, *expected_ngram_count, false); From ba8641ba90f21f24d091afa4b3d29eabdf8c6e46 Mon Sep 17 00:00:00 2001 From: Achal Dave Date: Thu, 28 Mar 2024 11:21:30 -0700 Subject: [PATCH 40/56] Bugfix --- src/main.rs | 1 + 1 file changed, 1 insertion(+) diff --git 
a/src/main.rs b/src/main.rs index 66de7ca..3a2f401 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1047,6 +1047,7 @@ async fn bff_remote(bucket: &String, input_dir: &String, output_dir: &String, su let failed_io_pairs: Arc>> = Arc::new(Mutex::new(Vec::new())); let mut rng = rand::thread_rng(); for io_pair in &io_pairs { + let num_retries = (*num_retries).clone(); let num_global_retries = (*num_global_retries).clone(); let retry_count = retry_count.clone(); let bucket = bucket.clone(); From de972eac355ae3a7c45bea0338ae3f3e1e27e08a Mon Sep 17 00:00:00 2001 From: Achal Dave Date: Thu, 28 Mar 2024 22:11:28 -0700 Subject: [PATCH 41/56] Support offset --- src/main.rs | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/src/main.rs b/src/main.rs index 3a2f401..f1054ef 100644 --- a/src/main.rs +++ b/src/main.rs @@ -175,6 +175,9 @@ enum Commands { #[arg(long)] subset: Option, + #[arg(long, default_value_t=0)] + offset: usize, + #[command(flatten)] bff_args: BffArgs, @@ -827,7 +830,7 @@ fn extract_s3_basename(input_path: &str) -> &str { -async fn gather_s3_io(bucket: &str, prefix: &str, output_dir: &str, subset: &Option) -> Result, Error> { +async fn gather_s3_io(bucket: &str, prefix: &str, output_dir: &str, subset: &Option, offset: usize) -> Result, Error> { let region_provider = RegionProviderChain::default_provider(); let config = aws_config::defaults(BehaviorVersion::latest()) .region(region_provider) @@ -842,11 +845,17 @@ async fn gather_s3_io(bucket: &str, prefix: &str, output_dir: &str, subset: &Opt .into_paginator() .send(); + let mut skipped = 0; let mut io_pairs: Vec<(String, String)> = Vec::new(); 'outer: while let Some(result) = response.next().await { match result { Ok(output) => { for object in output.contents() { + if skipped < offset { + // Skip files until the offset is reached + skipped += 1; + continue; + } if subset.is_some() && io_pairs.len() >= subset.unwrap() { // Saw enough data for subset, skip break 'outer; @@ -886,8 
+895,8 @@ async fn main() -> std::io::Result<()> { bff(inputs, output_directory, &bff_args)?; }, - Commands::BffRemote {bucket, input_dir, output_dir, subset, bff_args, num_retries, num_global_retries} => { - bff_remote(bucket, input_dir, output_dir, subset, &bff_args, num_retries, num_global_retries).await?; + Commands::BffRemote {bucket, input_dir, output_dir, subset, bff_args, num_retries, num_global_retries, offset} => { + bff_remote(bucket, input_dir, output_dir, subset, &bff_args, num_retries, num_global_retries, offset).await?; } Commands::Sysreq {expected_ngram_count, fp_rate} => { let bff_size = compute_bloom_size(*fp_rate, *expected_ngram_count, false); @@ -1000,7 +1009,7 @@ fn bff(inputs: &Vec, output_directory: &PathBuf, bff_args: &BffArgs) -> -async fn bff_remote(bucket: &String, input_dir: &String, output_dir: &String, subset: &Option, bff_args: &BffArgs, num_retries: &usize, num_global_retries: &usize) -> std::io::Result<()> { +async fn bff_remote(bucket: &String, input_dir: &String, output_dir: &String, subset: &Option, bff_args: &BffArgs, num_retries: &usize, num_global_retries: &usize, offset: &usize) -> std::io::Result<()> { /* General pseudocode: Setup: @@ -1014,7 +1023,7 @@ async fn bff_remote(bucket: &String, input_dir: &String, output_dir: &String, su */ let start_time = Instant::now(); let bloom_filter = Arc::new(BloomFilter::from_args(bff_args)); - let mut io_pairs = gather_s3_io(bucket, input_dir, output_dir, subset).await.unwrap(); + let mut io_pairs = gather_s3_io(bucket, input_dir, output_dir, subset, *offset).await.unwrap(); let num_files = io_pairs.len(); From 01934574bae567f913bbb1d08534d82e52b244c5 Mon Sep 17 00:00:00 2001 From: Matt Jordan Date: Fri, 29 Mar 2024 11:29:34 -0700 Subject: [PATCH 42/56] Printed number of input files --- src/main.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main.rs b/src/main.rs index 457f07b..e148b05 100644 --- a/src/main.rs +++ b/src/main.rs @@ -986,7 +986,7 @@ async fn 
bff_remote(bucket: &String, input_dir: &String, output_dir: &String, su let start_time = Instant::now(); let bloom_filter = Arc::new(BloomFilter::from_args(bff_args)); let mut io_pairs = gather_s3_io(bucket, input_dir, output_dir, subset).await.unwrap(); - + println!("Collected {} input files...", io_pairs.len()); let num_files = io_pairs.len(); let err_count = Arc::new(Mutex::new(0)); From 18cedb597ccda2667449fe5c3890c0668ac74934 Mon Sep 17 00:00:00 2001 From: Matt Jordan Date: Fri, 29 Mar 2024 12:03:38 -0700 Subject: [PATCH 43/56] Added sharding control for BFF, printed out filter sparsity at the end --- src/main.rs | 61 +++++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 54 insertions(+), 7 deletions(-) mode change 100644 => 100755 src/main.rs diff --git a/src/main.rs b/src/main.rs old mode 100644 new mode 100755 index bda7ec1..038820b --- a/src/main.rs +++ b/src/main.rs @@ -125,6 +125,13 @@ struct BffArgs{ #[arg(long, short = 't', default_value_t = 0)] threads: usize, + + #[arg(long, default_value_t=0)] + shard_num: usize, + + #[arg(long, default_value_t=1)] + total_shards: usize, + } @@ -260,6 +267,17 @@ impl BloomFilter { self.bits.len() * size_of::() } + fn calculate_sparsity(&self) -> f64 { + let set_bits:usize = self.bits.par_iter() + .map(|atomic| { + let value = atomic.load(std::sync::atomic::Ordering::Relaxed); + value.count_ones() as usize + }) + .sum(); + let total_bits = self.size_in_bytes() * 8; + return (set_bits as f64) / (total_bits as f64); + } + fn new(size_in_bytes: usize, num_hashers: usize) -> Self { let mut rng = rand::thread_rng(); let mut hash_builder_seeds = Vec::with_capacity(num_hashers); @@ -830,7 +848,8 @@ fn extract_s3_basename(input_path: &str) -> &str { -async fn gather_s3_io(bucket: &str, prefix: &str, output_dir: &str, subset: &Option, offset: usize) -> Result, Error> { +async fn gather_s3_io(bucket: &str, prefix: &str, output_dir: &str, subset: &Option, offset: usize, + shard_num: usize, total_shards: usize) 
-> Result, Error> { let region_provider = RegionProviderChain::default_provider(); let config = aws_config::defaults(BehaviorVersion::latest()) .region(region_provider) @@ -875,10 +894,19 @@ async fn gather_s3_io(bucket: &str, prefix: &str, output_dir: &str, subset: &Opt } } } + // select shard before we shuffle + let mut shard: Vec<(String, String)> = Vec::new(); + let mut idx = shard_num; + while idx < io_pairs.len() { + shard.push(io_pairs[idx].clone()); + idx += total_shards; + } + + // Then shuffle let mut rng = thread_rng(); - io_pairs.shuffle(&mut rng); + shard.shuffle(&mut rng); - Ok(io_pairs) + Ok(shard) } @@ -927,7 +955,22 @@ fn bff(inputs: &Vec, output_directory: &PathBuf, bff_args: &BffArgs) -> create_dir_if_not_exists(output_directory).unwrap(); let bloom_filter = Arc::new(BloomFilter::from_args(bff_args)); let all_inputs = expand_dirs(inputs).unwrap(); - let pbar = ProgressBar::new(all_inputs.len() as u64) + + // Select shard and then shuffle + let mut shard: Vec = Vec::new(); + let mut idx = bff_args.shard_num; + while idx < all_inputs.len() { + shard.push(all_inputs[idx].clone()); + idx += bff_args.total_shards; + } + // Then shuffle + let mut rng = thread_rng(); + shard.shuffle(&mut rng); + + + + + let pbar = ProgressBar::new(shard.len() as u64) .with_style( ProgressStyle::with_template( "Files {human_pos}/{human_len} [{elapsed_precise}/{duration_precise}] [{wide_bar:.cyan/blue}]", @@ -951,7 +994,7 @@ fn bff(inputs: &Vec, output_directory: &PathBuf, bff_args: &BffArgs) -> let total_items = Arc::new(Mutex::new(0)); let removed_items = Arc::new(Mutex::new(0)); let threadpool = ThreadPool::new(threads); - for input in all_inputs { + for input in shard { //let mut output = output_directory.clone(); let output = output_directory.clone().join(input.file_name().unwrap()); //output.push(input.file_name().unwrap()); @@ -998,6 +1041,8 @@ fn bff(inputs: &Vec, output_directory: &PathBuf, bff_args: &BffArgs) -> 
bloom_filter.write_to_file(&bff_args.bloom_filter_file).unwrap(); println!("...Bloom filter written in {:?} seconds.", write_start_time.elapsed().as_secs()); } + println!("After running, BFF sparsity was {:?}", bloom_filter.calculate_sparsity()); + println!("Completed full BFF run in {:?} seconds", start_time.elapsed().as_secs()); let total_items = *total_items.lock().unwrap(); @@ -1024,7 +1069,8 @@ async fn bff_remote(bucket: &String, input_dir: &String, output_dir: &String, su let start_time = Instant::now(); let bloom_filter = Arc::new(BloomFilter::from_args(bff_args)); - let mut io_pairs = gather_s3_io(bucket, input_dir, output_dir, subset, *offset).await.unwrap(); + let mut io_pairs = gather_s3_io(bucket, input_dir, output_dir, subset, *offset, + bff_args.shard_num, bff_args.total_shards).await.unwrap(); println!("Collected {} input files...", io_pairs.len()); let num_files = io_pairs.len(); @@ -1129,8 +1175,9 @@ async fn bff_remote(bucket: &String, input_dir: &String, output_dir: &String, su println!("...Bloom filter written in {:?} seconds.", write_start_time.elapsed().as_secs()); } println!("Error count is {}/{}", err_count.lock().unwrap(), num_files); - println!("Completed full BFF run in {:?} seconds", start_time.elapsed().as_secs()); + println!("After running, BFF sparsity was {:?}", bloom_filter.calculate_sparsity()); + println!("Completed full BFF run in {:?} seconds", start_time.elapsed().as_secs()); let total_items = *total_items.lock().unwrap(); let removed_items = *removed_items.lock().unwrap(); println!("Stats: Saw {} items | Removed {} of them", From cf0947cbcc5ae9fc8c570b00c88159c2d7a64ba7 Mon Sep 17 00:00:00 2001 From: Matt Jordan Date: Mon, 8 Apr 2024 10:36:39 -0700 Subject: [PATCH 44/56] Allowing text in annotate mode --- src/main.rs | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/main.rs b/src/main.rs index 038820b..23536f7 100755 --- a/src/main.rs +++ b/src/main.rs @@ -685,7 +685,6 @@ fn process_line(line: &String, 
bloom_filter: &BloomFilter, bff_args: &BffArgs) - let mut data: Value = serde_json::from_str(&line).unwrap(); let mut total_items = 0; let mut removed_items = 0; - let text = data["text"].as_str().unwrap(); @@ -704,7 +703,6 @@ fn process_line(line: &String, bloom_filter: &BloomFilter, bff_args: &BffArgs) - let mut total_ngrams = 0; let mut total_contained_ngrams = 0; - for paragraph_window in newlines.windows(2) { let paragraph = &text[paragraph_window[0]..paragraph_window[1]]; total_items += 1; @@ -780,6 +778,7 @@ fn process_line(line: &String, bloom_filter: &BloomFilter, bff_args: &BffArgs) - "bff_contained_ngram_count", "id", "source", + "text" ]; // Iterate through the keys of the JSON object and remove any field that is not in the allowed_fields list From 358b6b7d16951641c94290bf96412f82aaeff28e Mon Sep 17 00:00:00 2001 From: Matt Jordan Date: Mon, 8 Apr 2024 10:37:38 -0700 Subject: [PATCH 45/56] idk, cargo needed updating --- Cargo.toml | 0 1 file changed, 0 insertions(+), 0 deletions(-) mode change 100644 => 100755 Cargo.toml diff --git a/Cargo.toml b/Cargo.toml old mode 100644 new mode 100755 From 222b50e23fa8cb1952283c522a926cb8bfcb77b8 Mon Sep 17 00:00:00 2001 From: Matt Jordan Date: Mon, 8 Apr 2024 10:46:21 -0700 Subject: [PATCH 46/56] Added scripts/ to gitignore --- .gitignore | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) mode change 100644 => 100755 .gitignore diff --git a/.gitignore b/.gitignore old mode 100644 new mode 100755 index 12c78f2..19dcc8e --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,4 @@ /target/ +/scripts/ /.idea/ -/*.iml \ No newline at end of file +/*.iml From 084df4c49d41cbf3ab2a60eaed649bb31d3ded3a Mon Sep 17 00:00:00 2001 From: Matt Jordan Date: Mon, 8 Apr 2024 11:57:29 -0700 Subject: [PATCH 47/56] Update README.md --- README.md | 108 ++++++++++++++---------------------------------------- 1 file changed, 28 insertions(+), 80 deletions(-) diff --git a/README.md b/README.md index bf6060b..5501c99 100644 --- a/README.md +++ 
b/README.md @@ -2,6 +2,7 @@ BFF === The big friendly filter 😁 +(originally written by Dirk @ AI2, updated by me) Getting started --------------- @@ -12,90 +13,37 @@ Getting started 2. Run `cargo build --release`. It places the binary at `target/release/bff`. 3. Run `./target/release/bff --help` to see the available options. + Examples -------- - -### Deduplicating a file against itself - -This is how you deduplicate a file against itself: -```bash -target/release/bff \ - --bloom-filter-file filter.bff \ - --bloom-filter-size 274877906944 \ - --expected-ngram-count 1000000000 \ - --output-directory deduped/ \ - input.json.gz -``` - -This creates the filter at `filter.bff`, with a size of 256 GB. -This size should be a little smaller than the amount of main memory you have. -It calculates the optimal setup for the filter based on the expected number of ngrams. -Getting that number right is very important. -If in doubt, guess high. -It's safer to guess a higher number than a lower number. -The filter will be created in memory, and only written to disk at the end of the job. - -### Deduplicating multiple files - -To get a lot of speed out of `bff`, you have to process multiple files at once: -```bash -target/release/bff \ - --bloom-filter-file filter.bff \ - --bloom-filter-size 274877906944 \ - --expected-ngram-count 1000000000 \ - --output-directory deduped/ \ - *.json.gz # Can also pass a directory containing .json.gz files here -``` - -Each input file will run in its own thread, and the filter will be shared between them. -In the end, as before the filter will be written to disk. - -### Automatically choosing filter size -To automatically compute the size of the filter, you can instead specify a false-positive rate. `bff` will create a bloom filter to attain that false positive rate, up to 90% of the system RAM. 
Note that false positive rate is per token-ngram, so the chance of a whole paragraph/document being marked as a false-positive will actually be quite less than the specified `fp-rate`: - -```bash -target/release/bff \ - --bloom-filter-file filter.bff \ - --fp-rate 0.01 \ - --expected-ngram-count 1000000000 \ - --output-directory deduped/ \ - input_dir/ +There are three modes `bff` (local input -> local output), `bff-remote` (S3 input -> S3 output), and `sysreq` (for assessing system requirements). We always need an input, output, false positive rate, and expected number of ngrams. But then there are some optional hyperparameters: + +- `--min-ngram-size`: In paragraph/both mode, we ignore any paragraphs shorter than this. Defaults to 5. +- `--max-ngram-size`: The "working width" of shinglings of ngrams: e.g., for long paragraphs/documents, we check membership over ngrams of this size. Defaults to 13. +- `--filtering-threshold`: If at least this fraction of ngrams is present, we remove the entire paragraph/document. Defaults to 0.8 + +And some REMOTE ONLY arguments: +- `--shard-num`: For large numbers of files, sharding is helpful. This selects some subset of the files. Defaults to 0 +- `--total-shards`: Dictates how many shards we have. Defaults to 1. 
+
+### Deduplicating local files:
+For files that exist locally, say a directory `to_be_deduped/`, we can output deduplicated versions of these files in `has_been_deduped/` like:
+```cargo run --release bff \
+ --inputs to_be_deduped \
+ --output-directory has_been_deduped \
+ --expected-ngram-count 12345678 \
+ --fp-rate 0.01
```
-
-### Pre-load the filter
-
-You can stick ngrams into the filter ahead of time, for example if you want to decontaminate your dataset:
-```bash
-target/release/bff \
- --bloom-filter-file decontaminating_filter.bff \
- --bloom-filter-size 274877906944 \
- --expected-ngram-count 1000000000 \
- --output-directory deduped/ \
- --filtering-threshold 1.0 \
- my_test_set.json.gz
+### Deduplicating remote files
+For files that exist on S3, say with the prefix `s3://my-bucket/to_be_deduped/`, we can output deduplicated versions of these files in `s3://my-bucket/has_been_deduped` like:
+```cargo run --release bff-remote \
+--bucket my-bucket \
+--input-dir to_be_deduped \
+--output-dir has_been_deduped \
+--expected-ngram-count 12345678 \
+--fp-rate 0.01
```
-This will copy the output unchanged to the `deduped/` directory, but it will also produce a filter that you can use afterwards.
-It is important that you still take a good guess at the ngram count you expect to see when you do the actual
-deduplication.
-The parameters of the bloom filter are baked in when you first create the file, so you have to guess right the
-first time.
-
-### Only decontaminate
-
-If you only want to decontaminate, but not deduplicate against itself, you can do that by using the filter
-you just created in the previous step:
-```bash
-target/release/bff \
- --bloom-filter-file decontaminating_filter.bff \
- --bloom-filter-size 274877906944 \
- --expected-ngram-count 1000000000 \
- --output-directory deduped/ \
- --update-bloom-filter false \
- *.json.gz
-```
+There are also some options to preload or save the bloom filter itself, but you can check the code for those.
-If you are using the filter this way, you can use the number of ngrams in the decontamination set for the -`--expected-ngram-count` parameter. -Since this is usually much smaller, it might make the filter run faster. From 68ddf506ceb9f6ee52a8715e3d97055e6ac867e0 Mon Sep 17 00:00:00 2001 From: Matt Jordan Date: Mon, 8 Apr 2024 12:01:07 -0700 Subject: [PATCH 48/56] Cleaned things up a bit --- bff_v0.py | 169 ---------------------------------- scripts/create_size_groups.py | 60 ------------ scripts/s3_script.sh | 99 -------------------- 3 files changed, 328 deletions(-) delete mode 100644 bff_v0.py delete mode 100644 scripts/create_size_groups.py delete mode 100644 scripts/s3_script.sh diff --git a/bff_v0.py b/bff_v0.py deleted file mode 100644 index a37da9d..0000000 --- a/bff_v0.py +++ /dev/null @@ -1,169 +0,0 @@ -""" Quick'n'dirty mapping of bff for python before I can make it pure-rust -How this works: (using ray) -- map to DOWNLOAD all files and store on some local location -- use this local mapping to feed to bff.rs -- map to UPLOAD all files to some output location -*Yes I know this is clunky and requires 2x dataset size in local storage - -ASSUMES .jsonl.gz files, with 'text' field and FLAT file structure on s3 -i.e. - s3://bucket/input_dir/ - file_00.jsonl.gz - file_01.jsonl.gz - . - . - . - file_N.jsonl.gz - # AND NO SUBDIRECTORIES! 
- -TODOS: -- Add default tokenizer/counter to get good filter/ngram sizes automatically -- Make this pure rust - + Only need |num_threads| * |file_size| *2 + |filter_size| memory - + Sadly I code rust like a preschooler, so I'm working on this =( -- Add a progress bar vs printing filenames w/in rust code -""" - - -import argparse -import ray -import boto3 -from pathlib import Path -import os -from tqdm.auto import tqdm -import glob -import subprocess - - -RUST_CMD = os.path.join(os.path.dirname(__file__), 'target/release/bff') - -# ================================================= -# = Ray/S3 Utils = -# ================================================= - -def to_iterator(obj_ids, batch_size=100): - while obj_ids: - done, obj_ids = ray.wait(obj_ids, num_returns=min(batch_size, len(obj_ids))) - for d in done: - yield ray.get(d) - - -def block_until_complete(ret): - """Use this when we want to efficiently wait until an iterable - of ray object ids finishes - """ - - for _ in tqdm(to_iterator(ret), total=len(ret)): - pass - ret = ray.get(ret) - - -def copy_to_s3(local_obj, remote_dir, disable_tqdm=True): - if remote_dir == None: - return - remote_obj = os.path.join(remote_dir, os.path.basename(local_obj)).replace("s3://", "") - bucket, key = remote_obj.split("/", 1) - s3 = boto3.client("s3") - local_obj_size = os.path.getsize(local_obj) - with tqdm(total=local_obj_size, unit="B", unit_scale=True, desc=local_obj, disable=disable_tqdm) as pbar: - s3.upload_file(Bucket=bucket, Filename=local_obj, Key=key, Callback=lambda b: pbar.update(b)) - -@ray.remote -def RAY_copy_to_s3(local_obj, remote_dir, disable_tqdm=True): - copy_to_s3(local_obj, remote_dir, disable_tqdm=disable_tqdm) - - -def copy_from_s3(remote_obj, local_dir, disable_tqdm=True): - bucket, key = remote_obj.replace("s3://", "").split("/", 1) - s3 = boto3.client("s3") - remote_obj_size = s3.head_object(Bucket=bucket, Key=key)["ContentLength"] - target = os.path.join(local_dir, os.path.basename(key)) - with 
tqdm(total=remote_obj_size, unit="B", unit_scale=True, desc=remote_obj, disable=disable_tqdm) as pbar: - s3.download_file(Bucket=bucket, Key=key, Filename=target, Callback=lambda b: pbar.update(b)) - -@ray.remote -def RAY_copy_from_s3(remote_obj, local_dir, disable_tqdm=True): - copy_from_s3(remote_obj, local_dir, disable_tqdm=disable_tqdm) - - -def list_s3_keys(prefix, suffix='.jsonl.gz'): - """ Lists full s3 uri's of all keys that match both the prefix and suffix. - if Suffix==None => no filtering on suffx - """ - s3 = boto3.resource("s3") - bucket_name, path_within_bucket = prefix.replace("s3://", "").split("/", 1) - bucket = s3.Bucket(bucket_name) - - s3_uris = [] - for x in bucket.objects.filter(Prefix=path_within_bucket): - if suffix == None or x.key.endswith(suffix): - s3_uris.append(os.path.join('s3://%s' % bucket_name, x.key)) - return s3_uris - - - -# ================================================ -# = Main block = -# ================================================ - - -def run_bff(tmp_dir, input_dir, output_dir, filter_size, expected_ngram_count): - # Subprocess DIY python<->rust bindings. 
Might be better to use maturin, but this is a bit simpler - cmd = '%s --bloom-filter-file %s --bloom-filter-size %s --expected-ngram-count %s --output-directory %s %s' % ( - RUST_CMD, - os.path.join(tmp_dir, 'filter.bff'), - filter_size, - expected_ngram_count, - os.path.join(output_dir, ''), - os.path.join(input_dir, '*') - ) - return_code = subprocess.run(cmd, shell=True).returncode - assert return_code == 0 - - -def main(s3_input_dir, s3_output_dir, filter_size, - expected_ngram_count, tmp_storage_dir, suffix='.jsonl.gz'): - # Step 0: initialize ray and args and local storage directories - ray.init(ignore_reinit_error=True) - - input_dir = os.path.join(tmp_storage_dir, 'input') - output_dir = os.path.join(tmp_storage_dir, 'output') - for p in input_dir, output_dir: - Path(p).mkdir(parents=True, exist_ok=True) - - # step 1: collect and download s3 files to disk - print("Collecting and downloading s3 files...") - s3_uris = list_s3_keys(s3_input_dir, suffix=suffix)[:10] - download_refs = [RAY_copy_from_s3.remote(uri, input_dir) for uri in s3_uris] - block_until_complete(download_refs) - - # Step 2: Run BFF - print("Running BFF on %s local files..." 
% len(s3_uris)) - run_bff(tmp_storage_dir, input_dir, output_dir, filter_size, expected_ngram_count) - - - # Step 3: upload output files to S3 - print("Uploading filtered files...") - output_files = glob.glob(os.path.join(output_dir, '*%s' % suffix if suffix != None else '*')) - upload_refs = [RAY_copy_to_s3.remote(f, s3_output_dir) for f in output_files] - block_until_complete(upload_refs) - - -if __name__ == '__main__': - parser = argparse.ArgumentParser() - - # S3 adapter args - parser.add_argument('--s3-input-dir', type=str) - parser.add_argument('--s3-output-dir', type=str) - - # Args passed to rust function - parser.add_argument('--filter-size', type=int) - parser.add_argument('--expected-ngram-count', type=int) - parser.add_argument('--tmp-storage-dir', type=str, default='/tmp/bff') - - args = parser.parse_args() - main(s3_input_dir=args.s3_input_dir, - s3_output_dir=args.s3_output_dir, - filter_size=args.filter_size, - expected_ngram_count=args.expected_ngram_count, - tmp_storage_dir=args.tmp_storage_dir) diff --git a/scripts/create_size_groups.py b/scripts/create_size_groups.py deleted file mode 100644 index d40f51a..0000000 --- a/scripts/create_size_groups.py +++ /dev/null @@ -1,60 +0,0 @@ -""" -Simple adapter script that takes in the outputs of an -`aws s3 ls ...` command (that's been piped to an outfile) -and a max-file-size and groups these into files -""" - - -import argparse -import re - - -def create_groups(input_lines, group_size_in_bytes, re_suffix): - groups = [] - cur_group, cur_group_size = [], 0 - - parse_line = lambda line: line.strip().split(' ')[-2:]# outputs (size_in_bytes:string) - for line in input_lines: - next_line_bytes, next_line_file = parse_line(line) - if re.search(re_suffix, next_line_file) == None: continue - next_line_bytes = int(next_line_bytes) - if cur_group_size + next_line_bytes > group_size_in_bytes: # make new group - if len(cur_group) > 0: - groups.append(cur_group) - cur_group_size = next_line_bytes - cur_group = 
[next_line_file] - else: - cur_group_size += next_line_bytes - cur_group.append(next_line_file) - - groups.append(cur_group) - return groups - - -def write_groupfile(groups, output_file): - """ Modify this to make it easy to read in rust""" - with open(output_file, 'w') as f: - for group in groups: - f.write(','.join(group) + '\n') - - -def main(input_file, output_file, group_size_in_bytes, re_suffix): - input_lines = open(input_file, 'r').readlines() - groups = create_groups(input_lines, group_size_in_bytes, re_suffix) - write_groupfile(groups, output_file) - - -if __name__ == '__main__': - - parser = argparse.ArgumentParser() - parser.add_argument('--input', type=str, required=True) - parser.add_argument('--output', type=str, required=True) - parser.add_argument('--groupsize', type=int, required=True) - parser.add_argument('--suffix', type=str, default=r'\.jsonl?\.gz$') - - args = parser.parse_args() - main(input_file=args.input, - output_file=args.output, - group_size_in_bytes=args.groupsize, - re_suffix=args.suffix) - diff --git a/scripts/s3_script.sh b/scripts/s3_script.sh deleted file mode 100644 index 88c1768..0000000 --- a/scripts/s3_script.sh +++ /dev/null @@ -1,99 +0,0 @@ -#!/bin/bash - -#Mostly GPT-written bash script to do the following: -#1. Download all jsonl.gz files from s3-src to stash-dir -#2. Compute arguments to feed to bff. Defaults: -# a. filter-size: (Main memory) * 0.90 = filter size -# b. expected-ngram-count: use wimbd to compute -#3. Run BFF -#4. 
Upload back to S3 -# - - -expected_ngrams="-1" -# Parse the named arguments -while [[ $# -gt 0 ]]; do - key="$1" - - case $key in - -i|--s3-input) - s3_input_dir="$2" - shift - shift - ;; - -t|--temp-dir) - temp_dir="$2" - shift - shift - ;; - -o|--s3-output) - s3_output_dir="$2" - shift - shift - ;; - -fp|--fp-rate) - fp_rate="$2" - shift - shift - ;; - --tok|--tokens) - exepected_ngrams="$2" - shift - shift - ;; - - *) - echo "Unknown option: $key" - exit 1 - ;; - esac -done - -input_files="$temp_dir/input" -output_files="$temp_dir/output" -mkdir -p $input_files -mkdir -p $output_files - -# ======================================== -# = Download from S3 = -# ======================================== -aws s3 cp $s3_input_dir $input_files --recursive - - - -# ======================================================= -# = Compute stats to give to script = -# ======================================================= - -# Use wimbd to get total tokens -echo "PRENGR, $expected_ngrams" -if [ "$expected_ngrams" == "-1" ]; then - echo "IN THIS BLOCK" - wimbd_stats=$(wimbd stats $input_files/*.jsonl.gz) - total_tokens=$(echo "$wimbd_stats" | grep "^total tokens" | sed 's/^total tokens: //' | tr -d ',') - total_documents=$(echo "$wimbd_stats" | grep "^total documents" | sed 's/^total documents: //' | tr -d ',') - expected_ngrams=$(( total_tokens-total_documents )) -fi - -echo "BFF ARGS" -echo "FP RATE $fp_rate" -echo "NGRAMS $expected_ngrams" - - -# ====================================================== -# = Actually run bff = -# ====================================================== -rm -f filter.bff # Always rebuilds the filter froms scratch -target/release/bff --bloom-filter-file filter.bff --expected-ngram-count $expected_ngrams --fp-rate $fp_rate --output-directory $output_files --no-save-bloom-filter $input_files - -# ================================================== -# = And then upload back to S3 = -# ================================================== -aws s3 cp 
$output_files $s3_output_dir --recursive - - - - - - - From 333e7d13a1826945e3035e39a0942226665c690c Mon Sep 17 00:00:00 2001 From: Matt Jordan Date: Mon, 8 Apr 2024 12:36:25 -0700 Subject: [PATCH 49/56] Made bloom-filter-file not mandatory --- src/main.rs | 49 +++++++++++++++++++++++++++++++++---------------- 1 file changed, 33 insertions(+), 16 deletions(-) diff --git a/src/main.rs b/src/main.rs index 23536f7..806c8f0 100755 --- a/src/main.rs +++ b/src/main.rs @@ -78,8 +78,8 @@ struct BffArgs{ */ // Bloom filter kwargs - #[arg(required = true, long)] - bloom_filter_file: PathBuf, + #[arg(long)] + bloom_filter_file: Option, #[arg(required = true, long)] expected_ngram_count: usize, @@ -441,10 +441,12 @@ impl BloomFilter { */ let mut bloom_filter_size = bff_args.bloom_filter_size; - let bloom_filter = if bff_args.bloom_filter_file.exists() { - println!("Loading bloom filter from {:?}...", bff_args.bloom_filter_file); - BloomFilter::from_file(&bff_args.bloom_filter_file).unwrap() - } else { + let bloom_filter = match &bff_args.bloom_filter_file { + Some(path) if path.exists() => { + println!("Loading bloom filter from {:?}...", path); + BloomFilter::from_file(&path).unwrap() + } + _ => { println!("Creating new bloom filter..."); if bff_args.bloom_filter_size == 0 { bloom_filter_size = compute_bloom_size(bff_args.fp_rate, bff_args.expected_ngram_count, true); @@ -454,8 +456,10 @@ impl BloomFilter { bff_args.expected_ngram_count, ); BloomFilter::new(bloom_filter_size, num_hashers) + } }; + println!("Bloom filter has size {} | FP Rate {:?}", human_bytes(bloom_filter.size_in_bytes() as f64), @@ -1034,12 +1038,19 @@ fn bff(inputs: &Vec, output_directory: &PathBuf, bff_args: &BffArgs) -> // FINALIZE PHASE - if (!bff_args.no_update_bloom_filter) && (!bff_args.no_save_bloom_filter) { - let write_start_time = Instant::now(); - println!("Writing bloom filter to {:?}...", bff_args.bloom_filter_file); - bloom_filter.write_to_file(&bff_args.bloom_filter_file).unwrap(); - 
println!("...Bloom filter written in {:?} seconds.", write_start_time.elapsed().as_secs()); + match &bff_args.bloom_filter_file { + Some(path) => { + if (!bff_args.no_update_bloom_filter) && (!bff_args.no_save_bloom_filter) { + let write_start_time = Instant::now(); + println!("Writing bloom filter to {:?}...", path); + bloom_filter.write_to_file(&path).unwrap(); + println!("...Bloom filter written in {:?} seconds.", write_start_time.elapsed().as_secs()); + } + } + _ => {} } + + println!("After running, BFF sparsity was {:?}", bloom_filter.calculate_sparsity()); println!("Completed full BFF run in {:?} seconds", start_time.elapsed().as_secs()); @@ -1167,12 +1178,18 @@ async fn bff_remote(bucket: &String, input_dir: &String, output_dir: &String, su loop_start_time.elapsed().as_secs()); // FINALIZE PHASE - if (!bff_args.no_update_bloom_filter) && (!bff_args.no_save_bloom_filter) { - let write_start_time = Instant::now(); - println!("Writing bloom filter to {:?}...", bff_args.bloom_filter_file); - bloom_filter.write_to_file(&bff_args.bloom_filter_file).unwrap(); - println!("...Bloom filter written in {:?} seconds.", write_start_time.elapsed().as_secs()); + match &bff_args.bloom_filter_file { + Some(path) => { + if (!bff_args.no_update_bloom_filter) && (!bff_args.no_save_bloom_filter) { + let write_start_time = Instant::now(); + println!("Writing bloom filter to {:?}...", path); + bloom_filter.write_to_file(&path).unwrap(); + println!("...Bloom filter written in {:?} seconds.", write_start_time.elapsed().as_secs()); + } + } + _ => {} } + println!("Error count is {}/{}", err_count.lock().unwrap(), num_files); println!("After running, BFF sparsity was {:?}", bloom_filter.calculate_sparsity()); From b7f0228deca2ff20f6cf8b7fbc3677492471d963 Mon Sep 17 00:00:00 2001 From: Achal Dave Date: Sun, 14 Apr 2024 14:54:38 -0700 Subject: [PATCH 50/56] Read all lines before doing dedup --- src/main.rs | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/main.rs 
b/src/main.rs index 806c8f0..6ce5b3f 100755 --- a/src/main.rs +++ b/src/main.rs @@ -630,6 +630,10 @@ async fn process_file_s3( let gz = asyncGZ::new(body_stream); let reader = tBufReader::with_capacity(1024 * 1024, gz); let mut lines_iter = reader.lines(); + let mut all_lines = Vec::new(); + while let Some(line) = lines_iter.next_line().await? { + all_lines.push(line); + } // Phase 1c: Setup output buffer to upload->s3 eventually... // TODO: Make output writer streaming too? @@ -643,7 +647,8 @@ async fn process_file_s3( let mut fully_skipped = 0; let mut removed_items = 0; let mut total_items = 0; - while let Some(line) = lines_iter.next_line().await? { + // while let Some(line) = lines_iter.next_line().await? { + for line in all_lines { count += 1; let (dedup_data, removed_line_items, total_line_items) = process_line(&line.to_string(), &bloom_filter, &bff_args); removed_items += removed_line_items; From 703cd3b8508a08a8ab44251556a9af47672f43cc Mon Sep 17 00:00:00 2001 From: Achal Dave Date: Sun, 14 Apr 2024 15:04:52 -0700 Subject: [PATCH 51/56] Ignore creds --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 19dcc8e..17059b2 100755 --- a/.gitignore +++ b/.gitignore @@ -2,3 +2,4 @@ /scripts/ /.idea/ /*.iml +scripts_achal/credentials From 826040131b6bd55e0ad64d6b41a7f60b05f0b264 Mon Sep 17 00:00:00 2001 From: Achal Dave Date: Sun, 14 Apr 2024 15:39:44 -0700 Subject: [PATCH 52/56] Print loop progress every so often --- src/main.rs | 26 +++++++++++++++++++++++--- 1 file changed, 23 insertions(+), 3 deletions(-) diff --git a/src/main.rs b/src/main.rs index 6ce5b3f..52cfc49 100755 --- a/src/main.rs +++ b/src/main.rs @@ -571,12 +571,24 @@ fn process_file( match pbar_option { - Some(pbar) => pbar.lock().unwrap().inc(1), + Some(pbar) => { + let pb = pbar.lock().unwrap(); + pb.inc(1); + if pb.position() < 10 || pb.position() % 100 == 0 { + println!("Log Progress: {}/{} - {} elapsed, ETA {}", + pb.position(), pb.length().unwrap(), + 
format_duration(pb.elapsed()), + format_duration(pb.eta())); + } + } None => (), } Ok((removed_items, total_items)) } +fn format_duration(dur: Duration) -> String { + format!("{:02}:{:02}:{:02}", dur.as_secs() / 3600, (dur.as_secs() % 3600) / 60, dur.as_secs() % 60) +} async fn get_object_with_retry(client: &Client, bucket: &str, key: &str, num_retries: usize) -> Result { let mut attempts = 0; @@ -681,10 +693,18 @@ async fn process_file_s3( .await?; } match pbar_option { - Some(pbar) => pbar.lock().unwrap().inc(1), + Some(pbar) => { + let pb = pbar.lock().unwrap(); + pb.inc(1); + if pb.position() < 10 || pb.position() % 100 == 0 { + println!("Log Progress: {}/{} - {} elapsed, ETA {}", + pb.position(), pb.length().unwrap(), + format_duration(pb.elapsed()), + format_duration(pb.eta())); + } + } None => (), } - Ok((removed_items, total_items)) } From f5eeea84b1f0d98fca003f3324ab88d04bc3736f Mon Sep 17 00:00:00 2001 From: Matt Jordan Date: Mon, 15 Apr 2024 11:27:47 -0700 Subject: [PATCH 53/56] Folded in no-stream for bff-remote --- src/main.rs | 33 ++++++++------------------------- 1 file changed, 8 insertions(+), 25 deletions(-) diff --git a/src/main.rs b/src/main.rs index 52cfc49..b3ca9a3 100755 --- a/src/main.rs +++ b/src/main.rs @@ -131,7 +131,6 @@ struct BffArgs{ #[arg(long, default_value_t=1)] total_shards: usize, - } @@ -153,7 +152,6 @@ enum Commands { + bucket + input_dir + output_dir - + subset */ #[clap(arg_required_else_help = true)] @@ -179,12 +177,6 @@ enum Commands { #[arg(required=true, long)] output_dir: String, - #[arg(long)] - subset: Option, - - #[arg(long, default_value_t=0)] - offset: usize, - #[command(flatten)] bff_args: BffArgs, @@ -659,7 +651,6 @@ async fn process_file_s3( let mut fully_skipped = 0; let mut removed_items = 0; let mut total_items = 0; - // while let Some(line) = lines_iter.next_line().await? 
{ for line in all_lines { count += 1; let (dedup_data, removed_line_items, total_line_items) = process_line(&line.to_string(), &bloom_filter, &bff_args); @@ -876,7 +867,7 @@ fn extract_s3_basename(input_path: &str) -> &str { -async fn gather_s3_io(bucket: &str, prefix: &str, output_dir: &str, subset: &Option, offset: usize, +async fn gather_s3_io(bucket: &str, prefix: &str, output_dir: &str, shard_num: usize, total_shards: usize) -> Result, Error> { let region_provider = RegionProviderChain::default_provider(); let config = aws_config::defaults(BehaviorVersion::latest()) @@ -892,21 +883,11 @@ async fn gather_s3_io(bucket: &str, prefix: &str, output_dir: &str, subset: &Opt .into_paginator() .send(); - let mut skipped = 0; let mut io_pairs: Vec<(String, String)> = Vec::new(); - 'outer: while let Some(result) = response.next().await { + while let Some(result) = response.next().await { match result { Ok(output) => { for object in output.contents() { - if skipped < offset { - // Skip files until the offset is reached - skipped += 1; - continue; - } - if subset.is_some() && io_pairs.len() >= subset.unwrap() { - // Saw enough data for subset, skip - break 'outer; - } let input_key = object.key().unwrap(); if !(input_key.ends_with(".jsonl.gz") || input_key.ends_with(".json.gz")) { continue; @@ -948,11 +929,13 @@ async fn main() -> std::io::Result<()> { match &args.command { Commands::Bff {inputs, output_directory, bff_args} => { + assert!(bff_args.shard_num < bff_args.total_shards, "Shard num must be <= total shards"); bff(inputs, output_directory, &bff_args)?; }, - Commands::BffRemote {bucket, input_dir, output_dir, subset, bff_args, num_retries, num_global_retries, offset} => { - bff_remote(bucket, input_dir, output_dir, subset, &bff_args, num_retries, num_global_retries, offset).await?; + Commands::BffRemote {bucket, input_dir, output_dir, bff_args, num_retries, num_global_retries} => { + assert!(bff_args.shard_num < bff_args.total_shards, "Shard num must be <= total 
shards"); + bff_remote(bucket, input_dir, output_dir, &bff_args, num_retries, num_global_retries).await?; } Commands::Sysreq {expected_ngram_count, fp_rate} => { let bff_size = compute_bloom_size(*fp_rate, *expected_ngram_count, false); @@ -1089,7 +1072,7 @@ fn bff(inputs: &Vec, output_directory: &PathBuf, bff_args: &BffArgs) -> -async fn bff_remote(bucket: &String, input_dir: &String, output_dir: &String, subset: &Option, bff_args: &BffArgs, num_retries: &usize, num_global_retries: &usize, offset: &usize) -> std::io::Result<()> { +async fn bff_remote(bucket: &String, input_dir: &String, output_dir: &String, bff_args: &BffArgs, num_retries: &usize, num_global_retries: &usize) -> std::io::Result<()> { /* General pseudocode: Setup: @@ -1104,7 +1087,7 @@ async fn bff_remote(bucket: &String, input_dir: &String, output_dir: &String, su let start_time = Instant::now(); let bloom_filter = Arc::new(BloomFilter::from_args(bff_args)); - let mut io_pairs = gather_s3_io(bucket, input_dir, output_dir, subset, *offset, + let mut io_pairs = gather_s3_io(bucket, input_dir, output_dir, bff_args.shard_num, bff_args.total_shards).await.unwrap(); println!("Collected {} input files...", io_pairs.len()); From 2d037922e08bca253ccda4137ee92fc3eef2026f Mon Sep 17 00:00:00 2001 From: Matt Jordan Date: Mon, 15 Apr 2024 12:25:15 -0700 Subject: [PATCH 54/56] Added s5cmd wrapper --- scripts/s5cmd_wrapper.sh | 116 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 116 insertions(+) create mode 100755 scripts/s5cmd_wrapper.sh diff --git a/scripts/s5cmd_wrapper.sh b/scripts/s5cmd_wrapper.sh new file mode 100755 index 0000000..8e2251c --- /dev/null +++ b/scripts/s5cmd_wrapper.sh @@ -0,0 +1,116 @@ +#!/bin/bash + + +# Default args +fp_rate="0.01" +filtering_threshold="0.8" +min_ngram_size="20" +max_ngram_size="20" +remove_type="both" +shard_num="0" +total_shards="1" + + +# Parsing the args + +while [[ $# -gt 0 ]]; do + case $1 in + --s3-input=*) # REQUIRED + s3_input="${1#*=}" + shift + ;; + 
--s3-output=*) # REQUIRED + s3_output="${1#*=}" + shift + ;; + --local-dir=*) #REQUIRED + local_dir="${1#*=}" + shift + ;; + --expected-ngram-count=*) # REQUIRED + expected_ngram_count="${1#*=}" + shift + ;; + --fp-rate=*) # default 0.01 + fp_rate="${1#*=}" + shift + ;; + --min-ngram-size=*) # default 20 + s3_output="${1#*=}" + shift + ;; + --max-ngramsize=*) # default 20 + max_ngram_size="${1#*=}" + shift + ;; + --filtering-threshold=*) # default 0.8 + filtering_threshold="${1#*=}" + shift + ;; + --remove-type=*) # default both + remove_type="${1#*=}" + shift + ;; + --shard-num=*) # default 0 + shard_num="${1#*=}" + shift + ;; + --total-shards=*) # default 1 + total_shards="${1#*=}" + shift + ;; + *) + echo "Unknown argument: $1" + shift + ;; + esac +done + +# Run some asserts that the up/down is specified correctly +if [[ -z $s3_input ]]; then + echo "Error: --s3-input is required and cannot be empty." + exit 1 +fi + +if [[ -z $s3_output ]]; then + echo "Error: --s3-output is required and cannot be empty." + exit 1 +fi + +if [[ -z $local_dir ]]; then + echo "Error: --local-dir is required and cannot be empty." + exit 1 +fi + + +# Actually do the thing: +input_dir=${local_dir%/}/input/ +output_dir=${local_dir%/}/output/ + +mkdir -p $input_dir +mkdir -p $output_dir + +echo "Downloading S3 files to local..." +s5cmd cp --show-progress "${s3_input%/}/*" $input_dir + + +echo "Running BFF..." +cargo run --release bff \ +--inputs $input_dir \ +--output-directory $output_dir \ +--expected-ngram-count $expected_ngram_count \ +--fp-rate $fp_rate \ +--min-ngram-size $min_ngram_size \ +--max-ngram-size $max_ngram_size \ +--filtering-threshold $filtering_threshold \ +--remove-type $remove_type \ +--shard-num $shard_num \ +--total-shards $total_shards \ + + +echo "Uploading dedup'ed data to S3..." 
+s5cmd cp --show-progress $output_dir $s3_output + + + + From 7e686d4498a26b32f986a2e706e699d51e485f40 Mon Sep 17 00:00:00 2001 From: Matt Jordan Date: Mon, 15 Apr 2024 14:28:57 -0700 Subject: [PATCH 55/56] oops --- scripts/s5cmd_wrapper.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/s5cmd_wrapper.sh b/scripts/s5cmd_wrapper.sh index 8e2251c..b4efa84 100755 --- a/scripts/s5cmd_wrapper.sh +++ b/scripts/s5cmd_wrapper.sh @@ -36,7 +36,7 @@ while [[ $# -gt 0 ]]; do shift ;; --min-ngram-size=*) # default 20 - s3_output="${1#*=}" + min_ngram_size="${1#*=}" shift ;; --max-ngramsize=*) # default 20 From b4267cf857a76d74297efe0dbaa45f6c2978b6ea Mon Sep 17 00:00:00 2001 From: Matt Jordan Date: Wed, 17 Apr 2024 11:33:00 -0700 Subject: [PATCH 56/56] Ready for PR; @dirk, it's time to d-d-duel --- src/main.rs | 950 ++++++++++++++++------------------------------------ 1 file changed, 284 insertions(+), 666 deletions(-) diff --git a/src/main.rs b/src/main.rs index b3ca9a3..903ea16 100755 --- a/src/main.rs +++ b/src/main.rs @@ -1,51 +1,36 @@ use ahash::RandomState; -use anyhow::{anyhow, Result, Error}; +use anyhow::{anyhow, Result}; use byteorder::{LittleEndian, NativeEndian, ReadBytesExt, WriteBytesExt}; -use clap::{Args, Parser, Subcommand}; -use flate2::read::{MultiGzDecoder}; +use clap::{Parser, Subcommand}; +use flate2::read::MultiGzDecoder; use flate2::write::GzEncoder; use flate2::Compression; -use glob::glob; -use human_bytes::human_bytes; -use indicatif::{ProgressBar,ProgressStyle}; -use rand::{Rng,thread_rng}; +use rand::{Rng, thread_rng}; use rand::seq::SliceRandom; use serde_json::Value; use std::clone::Clone; use std::collections::VecDeque; -use std::fs::{OpenOptions, remove_file, create_dir_all}; use std::hash::{BuildHasher, Hash, Hasher}; use std::io; -use std::io::{Cursor}; use std::io::{BufRead, BufReader, BufWriter, Write}; use std::mem::size_of; -use std::path::{PathBuf, Path}; -use std::string::String; +use std::path::{PathBuf}; use 
std::sync::atomic::{AtomicU32, Ordering}; -use std::sync::{Arc, Mutex}; -use std::time::{Instant}; use std::thread::available_parallelism; -use sysinfo::{ - System, -}; use threadpool::ThreadPool; use unicode_segmentation::UnicodeSegmentation; -use aws_config::meta::region::RegionProviderChain; -use aws_config::BehaviorVersion; -use aws_sdk_s3::{Client}; -use aws_sdk_s3::primitives::ByteStream; -use aws_sdk_s3::operation::get_object::GetObjectOutput; -use tokio::io::{AsyncBufReadExt}; -use tokio::io::BufReader as tBufReader; -use tokio::time::{Duration, sleep}; -use async_compression::tokio::bufread::GzipDecoder as asyncGZ; use rayon::prelude::*; +use sysinfo::{ + System, +}; +use glob::glob; +use human_bytes::human_bytes; +use std::fs::{OpenOptions, remove_file, create_dir_all}; +use std::sync::{Arc, Mutex}; +use indicatif::{ProgressBar,ProgressStyle}; +use std::time::{Instant}; -/*======================================================= -= Argument Struct = -=======================================================*/ - #[derive(Parser)] #[clap(author, version, about, long_about = None)] struct ArgParser { @@ -53,170 +38,134 @@ struct ArgParser { command: Commands, } - -#[derive(Debug, Clone, Args)] -struct BffArgs{ - /* - -- BLOOM FILTER KWARGS - + bloom_filter_location: where we save/load the bloom filter - + expected_ngram_count: how many ngrams we're expecting - + fp_rate: false positive (per ngram) we're expecting - -- BLOOM FILTER HYPERPARAMS - + min_ngram_size (default: 5), smallest ngram size to consider - + max_ngram_size (default: 13), largest ngram size to consider - + filtering_threshold (default 0.80), threshold used to determine if text is duplicate - - -- BLOOM FILTER OVERRIDE KWARGS: - + bloom_filter_size (default: 0), if >0 we force the filter to have this size - + no_update_bloom_filter (default: false), if true, we never update the bloom filter - + no_save_bloom_filter (default: false), if true, we don't save the bloom filter at the end - + 
annotate_only (default: false), if true we leave text intact but annotate with which spans are duplicates - + whole_document (default: false), if true, we dedup across the whole document (spanning pargaraphs) - + whole_paragraph (default: false), if true, we don't match ngrams but rather whole paragraphs - + no_progress (default: false), if true, we don't display a progress bar, instead printing out files as we handle them - + threads: (default: 0), if > 0, we force use of this many threads, o/w it's automatically computed - */ - - // Bloom filter kwargs - #[arg(long)] - bloom_filter_file: Option, - - #[arg(required = true, long)] - expected_ngram_count: usize, - - #[arg(required = true, long)] - fp_rate: f64, - - // Bloom filter hyperparams - #[arg(long, default_value_t = 5)] - min_ngram_size: usize, - - #[arg(long, default_value_t = 13)] - max_ngram_size: usize, - - #[arg(long, default_value_t = 0.80)] - filtering_threshold: f64, - - // Bloom filter override args - #[arg(long, default_value_t=0)] - bloom_filter_size: usize, - - #[arg(long, default_value_t = false)] - no_update_bloom_filter: bool, - - #[arg(long, default_value_t = false)] - no_save_bloom_filter: bool, - - #[arg(long, default_value_t = false)] - annotate_attribute_only: bool, - - #[arg(long, default_value_t = RemoveType::Paragraph, value_enum)] - remove_type: RemoveType, - - #[arg(long, default_value_t = false)] - whole_document: bool, - - #[arg(long, default_value_t = false)] - whole_paragraphs: bool, - - #[arg(long, default_value_t = false)] - no_progress: bool, - - #[arg(long, short = 't', default_value_t = 0)] - threads: usize, - - - #[arg(long, default_value_t=0)] - shard_num: usize, - - #[arg(long, default_value_t=1)] - total_shards: usize, -} - - - #[derive(Subcommand, Debug)] enum Commands { - /* Two commands here: - - `bff` is for LOCAL files (local in -> local out) - - `bff_remote` is for S3 files (S3 in -> S3 out) - Where each takes default arguments of: - - - And then subcommand 
arguments - -- bff: - + inputs: file or files (directories okay) of gzip compressed newline-delimited JSON files with a 'text' field - + output_directory: where the deduplicated files get loaded to - - -- bff_remote: - + bucket - + input_dir - + output_dir - */ - #[clap(arg_required_else_help = true)] Bff { - // subcommand arguments + /// (List of) directories or files that are jsonl.gz files #[arg(required=true, long)] inputs: Vec, + /// Output directory where the deduplicated files will end up. + /// These will have the same basename as the inputs, so it is up to you to ensure no collisions here! #[arg(required=true, long)] output_directory: PathBuf, - #[command(flatten)] - bff_args: BffArgs, - }, - - BffRemote { - #[arg(required=true, long)] - bucket: String, + /// If specified, tries to load the bloom filter from this file, and will save once complete. + /// If unspecified, will not save the bloom filter at the end + #[arg(long)] + bloom_filter_file: Option, + /// The number of expected ngrams. This is used to calculate the optimal number of hashers. + /// If the filter already exists, this parameter is ignored. #[arg(required=true, long)] - input_dir: String, + expected_ngram_count: usize, + /// The desired false positive rate + /// Note that this is a per-ngram FP rate, and not a per-paragraph rate #[arg(required=true, long)] - output_dir: String, - - #[command(flatten)] - bff_args: BffArgs, - - // Local number of retries; we try to load each file from s3 this many times. - #[arg(long, default_value_t=3)] - num_retries: usize, - - // Global number of retries; we do a full loop through all remaining files this many times. - // i.e., - // remaining = all_paths - // for i in num_retries: - // remaining = process_data(remaining) - #[arg(long, default_value_t=3)] - num_global_retries: usize, + fp_rate: f64, + + /// The smallest ngram size to consider. Paragraphs that have fewer than this number of tokens + /// are not deduplicated and always kept. 
These ngrams are never added to the bloom filter. + /// Note that this value only matters if the paragraph has fewer tokens than the max ngram size. + #[arg(long, default_value_t = 20)] + min_ngram_size: usize, + + /// The largest ngram size to consider. Paragraphs are deduplicated based on the number of + /// ngrams of this size that are already present in the bloom filter. + #[arg(long, default_value_t = 20)] + max_ngram_size: usize, + + /// If this fraction of ngrams of the max ngram size are already present in the bloom filter, + /// the paragraph is considered a duplicate and is discarded. + /// Set this to 0 to never produce any output. This is useful when you want to prime the filter + /// with some content that should be considered duplicates, without deduplicating that content + /// itself. + #[arg(long, default_value_t = 0.80)] + filtering_threshold: f64, + + /// Which "BFF mode" we're in. We have options of 'paragraph', 'document', 'both' + /// indicating we remove individual paragraphs/documents if duplicated + /// The logic for "both" mode is a bit subtle. See comments below + #[arg(long, default_value_t = RemoveType::Paragraph, value_enum)] + remove_type: RemoveType, + + /// Whether or not to update the bloom filter. If this is true, the filter is not updated, but + /// the input is still deduplicated based on the filter. Default is false. + #[arg(long, default_value_t = false)] + no_update_bloom_filter: bool, + + /// If this is true, we keep the input intact, but we add an annotation to each document that + /// explains which spans from the text would have been deleted. + #[arg(long, default_value_t = false)] + annotate: bool, + + /// The number of threads to use for processing. + /// If this is 0, the number of threads is automatically determined. 
+ #[arg(long, short = 't', default_value_t = 0)] + threads: usize, + + /// If this flag is present, we will never save a bloom filter to disk + #[arg(long, default_value_t = false)] + no_save_bloom_filter: bool, + + + /// Turn this flag on if we don't want to use a progress bar + /// Helpful when running through ssh and wanting to check progress via logs and not a terminal + #[arg(long, default_value_t = false)] + no_progress_bar: bool, + + /// For virtual "sharding", this param and the next one subselect files to deduplicate together + /// Defaults to no virtual sharding + #[arg(long, default_value_t=0)] + shard_num: usize, + + #[arg(long, default_value_t=1)] + total_shards: usize, }, Sysreq { + /// Handy tool to help guess RAM requirements for a given pool of data #[arg(required=true, long)] expected_ngram_count: usize, #[arg(required=true, long)] fp_rate: f64 }, - + } - #[derive(Debug, Clone, Eq, PartialEq, clap::ValueEnum)] enum RemoveType { - // Types for what we check to see if is a duplicate - Paragraph, // Paragraph level only - Document, // Whole document only - Both, // Does paragraph first, but if enough of the ngrams are contained in the bff, removes the whole document - // NOTE: ^ will add some ngram data (OF TO-REMOVE ngrams) into the filter [other methods don't do this] + /// Types for what we check to see if is a duplicate + + ///Checks each paragraph of size >=min_ngram_size if it is duplicated. 
If so, it gets removed + Paragraph, + + /// Checks if enough of the ngrams of size ==max_ngram_size (or just one ngram if tokens in range [min_ngram_size, max_ngram_size]) + /// and if enough are present in filter, the whole document gets removed + Document, + + /// Does paragraph removal, BUT if enough of the paragraph ngram checks are contained, removes the whole document + Both, } +fn tokenize(s: &str) -> impl Iterator { + s.split_word_bounds().filter(|w| { + for c in w.chars() { + if !c.is_whitespace() { + return true; + } + } + false + }) +} + -/*=================================================== -= Bloom Filter stuff = -===================================================*/ +/*============================================================= += Bloom Filter stuff = +==============================================================*/ struct BloomFilter { bits: Vec, @@ -295,6 +244,8 @@ impl BloomFilter { } } + + fn from_file(path: &PathBuf) -> io::Result { let mut file = OpenOptions::new() .read(true) @@ -410,11 +361,9 @@ impl BloomFilter { return false; } } - true } - #[allow(dead_code)] // use in unit test fn contains(&self, s: &VecDeque<&str>) -> bool { let hashes = self.hashes(s); @@ -422,8 +371,8 @@ impl BloomFilter { } - fn from_args(bff_args: &BffArgs) -> Self { - /* Uses a BFFArgs object to build a bloom filter + fn from_args(bloom_filter_file: Option, expected_ngram_count: usize, fp_rate: f64,) -> Self { + /* Uses the CLI args to build a bloom filter Logic: - Check if file exists, if so, just load it and return - Get size: @@ -431,35 +380,30 @@ impl BloomFilter { + otherwise, compute based on ngrams + fp rate - Return */ - let mut bloom_filter_size = bff_args.bloom_filter_size; - let bloom_filter = match &bff_args.bloom_filter_file { + let bloom_filter = match &bloom_filter_file { Some(path) if path.exists() => { println!("Loading bloom filter from {:?}...", path); BloomFilter::from_file(&path).unwrap() } _ => { println!("Creating new bloom filter..."); - if 
bff_args.bloom_filter_size == 0 { - bloom_filter_size = compute_bloom_size(bff_args.fp_rate, bff_args.expected_ngram_count, true); - } + let bloom_filter_size = compute_bloom_size(fp_rate, expected_ngram_count, true); let num_hashers = BloomFilter::optimal_number_of_hashers( bloom_filter_size, - bff_args.expected_ngram_count, + expected_ngram_count, ); BloomFilter::new(bloom_filter_size, num_hashers) } }; - - println!("Bloom filter has size {} | FP Rate {:?}", human_bytes(bloom_filter.size_in_bytes() as f64), - bloom_filter.my_prob_of_false_positive(bff_args.expected_ngram_count)); + bloom_filter.my_prob_of_false_positive(expected_ngram_count)); bloom_filter } -} +} @@ -507,13 +451,21 @@ fn compute_bloom_size(fp_rate: f64, expected_ngram_count: usize, limit_to_sys: b } -#[allow(clippy::too_many_arguments)] + + + +#[allow(clippy::too_many_arguments)] // TODO : abstract parameters into a struct fn process_file( input_file: &PathBuf, - output_file_path: &PathBuf, + output_file: &PathBuf, bloom_filter: &Arc, - bff_args: &BffArgs, - pbar_option: &Option>>, + max_ngram_size: usize, + min_ngram_size: usize, + remove_type: &RemoveType, + filtering_threshold: f64, + no_update_bloom_filter: bool, + annotate: bool, + pbar_option: &Option>>, ) -> Result<(usize, usize), io::Error> { // Setup input/output writers @@ -524,32 +476,33 @@ fn process_file( .open(input_file)?; let reader = BufReader::with_capacity(1024 * 1024, MultiGzDecoder::new(input_file)); - + let output_file_pathbuf = output_file; let output_file = OpenOptions::new() .read(false) .write(true) .create(true) .truncate(true) - .open(output_file_path)?; + .open(output_file)?; let mut writer = BufWriter::with_capacity( 1024 * 1024, GzEncoder::new(output_file, Compression::default()), ); - - // Loop over lines and do BFF stuff + // Loop over lines and do bff stuff let mut count = 0; let mut fully_skipped = 0; - let mut removed_items = 0; - let mut total_items = 0; + let mut removed_text_bytes = 0; + let mut 
total_text_bytes = 0; for line in reader.lines() { count += 1; - let (dedup_data, removed_line_items, total_line_items) = process_line(&line.unwrap(), &bloom_filter, &bff_args); - removed_items += removed_line_items; - total_items += total_line_items; - + let (dedup_data, removed_line_bytes, total_line_bytes) = process_line(&line.unwrap(), &bloom_filter, + min_ngram_size, max_ngram_size, + remove_type, filtering_threshold, + no_update_bloom_filter, annotate); + removed_text_bytes += removed_line_bytes; + total_text_bytes += total_line_bytes; if dedup_data.get("text").unwrap().as_str().unwrap().trim().is_empty() { - fully_skipped += 1; + fully_skipped += 1 } else { serde_json::to_writer(&mut writer, &dedup_data)?; @@ -558,157 +511,45 @@ fn process_file( } if count == fully_skipped { - remove_file(output_file_path)?; + remove_file(output_file_pathbuf)?; } - - match pbar_option { Some(pbar) => { let pb = pbar.lock().unwrap(); pb.inc(1); - if pb.position() < 10 || pb.position() % 100 == 0 { - println!("Log Progress: {}/{} - {} elapsed, ETA {}", - pb.position(), pb.length().unwrap(), - format_duration(pb.elapsed()), - format_duration(pb.eta())); - } } None => (), } - Ok((removed_items, total_items)) -} - -fn format_duration(dur: Duration) -> String { - format!("{:02}:{:02}:{:02}", dur.as_secs() / 3600, (dur.as_secs() % 3600) / 60, dur.as_secs() % 60) -} - -async fn get_object_with_retry(client: &Client, bucket: &str, key: &str, num_retries: usize) -> Result { - let mut attempts = 0; - let base_delay = Duration::from_millis(100); - let max_delay = Duration::from_millis(2000); - - let mut rng = rand::thread_rng(); - - loop { - match client.get_object().bucket(bucket).key(key).send().await { - Ok(response) => return Ok(response), - Err(e) if attempts < num_retries => { - // Calculate delay for exponential backoff, add some randomness so multiple threads don't access at the - // same time. 
- println!("Error {}/{}: {}", e, attempts, num_retries); - let random_delay = rng.gen_range(Duration::from_millis(0)..Duration::from_millis(1000)); - let mut exponential_delay = base_delay * 2u32.pow(attempts as u32); - if exponential_delay > max_delay { - exponential_delay = max_delay; - } - sleep(exponential_delay + random_delay).await; - attempts += 1; - } - Err(e) => { - println!("Too many errors reading: {}. Giving up.", key); - return Err(e.into()); - } - } - } + Ok((removed_text_bytes, total_text_bytes)) } -async fn process_file_s3( - s3_bucket: &String, - s3_input: &String, - s3_output: &String, - bloom_filter: &Arc, - bff_args: &BffArgs, - pbar_option: &Option>>, - num_retries: usize, -) -> Result<(usize, usize), Error> { - // Phase 1a: Build s3 client - let region_provider = RegionProviderChain::default_provider(); - let config = aws_config::defaults(BehaviorVersion::latest()) - .region(region_provider) - .load() - .await; - let client = Client::new(&config); - - let object = get_object_with_retry(&client, s3_bucket, s3_input, num_retries).await?; - let body_stream = object.body.into_async_read(); - let gz = asyncGZ::new(body_stream); - let reader = tBufReader::with_capacity(1024 * 1024, gz); - let mut lines_iter = reader.lines(); - let mut all_lines = Vec::new(); - while let Some(line) = lines_iter.next_line().await? { - all_lines.push(line); - } - - // Phase 1c: Setup output buffer to upload->s3 eventually... - // TODO: Make output writer streaming too? 
- let mut output_data = Vec::new(); - let encoder = GzEncoder::new(Cursor::new(&mut output_data), Compression::default()); - let mut buf_writer = BufWriter::with_capacity(1024 * 1024, encoder); - - // Phase 2: Loop over lines, process each line, and write it if not fully eradicated - let mut count = 0; - let mut fully_skipped = 0; - let mut removed_items = 0; - let mut total_items = 0; - for line in all_lines { - count += 1; - let (dedup_data, removed_line_items, total_line_items) = process_line(&line.to_string(), &bloom_filter, &bff_args); - removed_items += removed_line_items; - total_items += total_line_items; - if dedup_data.get("text").unwrap().as_str().unwrap().is_empty() { - fully_skipped += 1; - } - else { - serde_json::to_writer(&mut buf_writer, &dedup_data)?; - buf_writer.write_all(b"\n")?; - } - - } - // println!("Number of lines in {:?} is {}", s3_input, count); - - - // Phase 3: to finalize, write to s3 if there's something to write - buf_writer.flush()?; - let encoder = buf_writer.into_inner().expect("Failed to get encoder"); - encoder.finish().unwrap(); - - if fully_skipped < count { - let bytes_to_upload = ByteStream::from(output_data); - client - .put_object() - .bucket(s3_bucket) - .key(s3_output) - .body(bytes_to_upload) - .send() - .await?; - } - match pbar_option { - Some(pbar) => { - let pb = pbar.lock().unwrap(); - pb.inc(1); - if pb.position() < 10 || pb.position() % 100 == 0 { - println!("Log Progress: {}/{} - {} elapsed, ETA {}", - pb.position(), pb.length().unwrap(), - format_duration(pb.elapsed()), - format_duration(pb.eta())); - } - } - None => (), - } - Ok((removed_items, total_items)) -} +fn process_line(line: &String, bloom_filter: &BloomFilter, min_ngram_size: usize, max_ngram_size: usize, + remove_type: &RemoveType, filtering_threshold: f64, no_update_bloom_filter: bool, annotate: bool) -> + (serde_json::Value, usize, usize) { + // Main BFF logic: processes a single json document + // Does the following (handling the {paragraph, 
document, both} cases) + // 1. Breaks document into units (paragraph/both -> paragraph; document -> full text) + // 2. For each unit, tokenize and + // a. if num_tokens < min_ngram_size: do nothing, leave this unit intact + // b. if num_tokens >= max_ngram_size: break unit into ngram-shingling of max_ngram_size + // c. else, full unit is treated as one ngram + // 3. Check containment of each ngram in bloom filter. + // a. If > filtering_threshold contained, mark unit for deletion + // 4. If unit survives step 3, add all ngrams into bloom filter + // 5. BOTH-mode ONLY: If total_contained_ngrams * threshold >= total_ngrams, omit the WHOLE document + // Outputs are (output_json, total_removed_bytes, total_input_bytes) + // If annotate is turned on, nothing gets removed, text is left intact, but byte-windows-removed -fn process_line(line: &String, bloom_filter: &BloomFilter, bff_args: &BffArgs) -> (serde_json::Value, usize, usize){ let mut data: Value = serde_json::from_str(&line).unwrap(); - let mut total_items = 0; - let mut removed_items = 0; + let mut total_bytes = 0; + let mut removed_bytes = 0; let text = data["text"].as_str().unwrap(); - - let newlines = if bff_args.remove_type == RemoveType::Document { + // Step 1: Break text into "units" + let newlines = if *remove_type == RemoveType::Document { vec![0, text.len()] } else { let mut newlines = Vec::new(); @@ -721,58 +562,63 @@ fn process_line(line: &String, bloom_filter: &BloomFilter, bff_args: &BffArgs) - }; let mut windows_to_remove = Vec::new(); + let mut total_ngrams = 0; let mut total_contained_ngrams = 0; for paragraph_window in newlines.windows(2) { let paragraph = &text[paragraph_window[0]..paragraph_window[1]]; - total_items += 1; + total_bytes += paragraph.len(); + - // calculate hashes for the paragraph + // Step 2: Tokenize and chunk into ngram shinglings, hash each one for the bff let mut hashes: Vec> = Vec::new(); - let mut ngram: VecDeque<&str> = VecDeque::with_capacity(bff_args.max_ngram_size); 
+ let mut ngram: VecDeque<&str> = VecDeque::with_capacity(max_ngram_size); for token in tokenize(paragraph) { ngram.push_back(token); - // If not hashing whole paragraphs, add ngrams to the bloom filter as they reach max size - if !bff_args.whole_paragraphs && ngram.len() >= bff_args.max_ngram_size { + if ngram.len() >= max_ngram_size { // Step 2b: ngram shingling long enough hashes.push(bloom_filter.hashes(&ngram)); ngram.pop_front(); } } - - // If the paragraph was too short, put in a shorter ngram, so we can dedupe short - // paragraphs exactly. - if hashes.is_empty() && ngram.len() >= bff_args.min_ngram_size { + // Step 2c: unit is short, but not TOO SHORT + if hashes.is_empty() && ngram.len() >= min_ngram_size { hashes.push(bloom_filter.hashes(&ngram)); } + + // Step 3: check containment of ngrams let contained_ngrams = hashes .iter() .filter(|ngram| bloom_filter.contains_hashes(ngram)) .count(); total_ngrams += hashes.len(); - total_contained_ngrams += contained_ngrams; - - // calculate how many ngrams are in the bloom filter - let number_of_ngrams = hashes.len(); - - // produce output - let too_many_duplicate_ngrams = - contained_ngrams as f64 / number_of_ngrams as f64 > bff_args.filtering_threshold; - if too_many_duplicate_ngrams { + total_contained_ngrams += contained_ngrams; + let number_of_ngrams = hashes.len() as f64; + //windows_to_remove.ansoteuhoausenh(); + let should_remove = contained_ngrams as f64 / number_of_ngrams > filtering_threshold; + if should_remove { windows_to_remove.push(paragraph_window); - removed_items += 1; - } else if !bff_args.no_update_bloom_filter { + removed_bytes += paragraph.len(); + } else if !no_update_bloom_filter { + // Step 4: add all ngrams to the bloom filter if we don't remove it for ngram in hashes { bloom_filter.insert_hashes(&ngram); } } } - // if annotate_attribute_only or annotate_only, add the annotation to the json - if bff_args.annotate_attribute_only { + // Step 5: Handle the both case + let temp_window = 
vec![0, text.len()]; + if *remove_type == RemoveType::Both && + (total_contained_ngrams as f64) / (total_ngrams as f64) > filtering_threshold { + windows_to_remove.clear(); + windows_to_remove.push(&temp_window); + } + + // Format outputs: + if annotate { data["bff_duplicate_spans"] = serde_json::to_value(windows_to_remove).unwrap(); - data["bff_contained_ngram_count"] = - serde_json::to_value(total_contained_ngrams).unwrap(); + data["bff_contained_ngram_count"] = serde_json::to_value(total_contained_ngrams).unwrap(); } else { let mut output_paragraphs = String::new(); let mut last_end = 0; @@ -781,46 +627,10 @@ fn process_line(line: &String, bloom_filter: &BloomFilter, bff_args: &BffArgs) - last_end = paragraph_window[1]; } output_paragraphs.push_str(&text[last_end..]); - if bff_args.remove_type == RemoveType::Both && - (total_contained_ngrams as f64) / (total_ngrams as f64) > bff_args.filtering_threshold - { - output_paragraphs = String::new(); // If we found enough duplicates to remove whole document too - } data["text"] = Value::String(output_paragraphs); - data["bff_contained_ngram_count_before_dedupe"] = - serde_json::to_value(total_contained_ngrams).unwrap(); } - if bff_args.annotate_attribute_only { - // Allowed fields - let allowed_fields = [ - "bff_duplicate_spans", - "bff_contained_ngram_count", - "id", - "source", - "text" - ]; - - // Iterate through the keys of the JSON object and remove any field that is not in the allowed_fields list - if let Value::Object(ref mut map) = data { - map.retain(|key, _| allowed_fields.contains(&key.as_str())); - } - } - (data, removed_items, total_items) -} - - - - -fn tokenize(s: &str) -> impl Iterator { - s.split_word_bounds().filter(|w| { - for c in w.chars() { - if !c.is_whitespace() { - return true; - } - } - false - }) + (data, removed_bytes, total_bytes) } @@ -860,127 +670,74 @@ fn create_dir_if_not_exists(path: &PathBuf) -> Result<(), std::io::Error> { } } -fn extract_s3_basename(input_path: &str) -> &str { - let 
parts: Vec<&str> = input_path.split('/').collect(); - parts.last().unwrap() -} - - - -async fn gather_s3_io(bucket: &str, prefix: &str, output_dir: &str, - shard_num: usize, total_shards: usize) -> Result, Error> { - let region_provider = RegionProviderChain::default_provider(); - let config = aws_config::defaults(BehaviorVersion::latest()) - .region(region_provider) - .load() - .await; - let client = Client::new(&config); - - let mut response = client - .list_objects_v2() - .bucket(bucket.to_owned()) - .prefix(prefix.to_owned()) - .into_paginator() - .send(); - - let mut io_pairs: Vec<(String, String)> = Vec::new(); - while let Some(result) = response.next().await { - match result { - Ok(output) => { - for object in output.contents() { - let input_key = object.key().unwrap(); - if !(input_key.ends_with(".jsonl.gz") || input_key.ends_with(".json.gz")) { - continue; - } - let basename = extract_s3_basename(&input_key); - let output_key = Path::new(output_dir).join(basename).as_os_str().to_str().unwrap().to_string(); - let io_pair: (String, String) = (String::from(input_key), String::from(&output_key)); - io_pairs.push(io_pair); - } - } - Err(err) => { - eprintln!("{err:?}") - } - } - } - // select shard before we shuffle - let mut shard: Vec<(String, String)> = Vec::new(); - let mut idx = shard_num; - while idx < io_pairs.len() { - shard.push(io_pairs[idx].clone()); - idx += total_shards; - } - - // Then shuffle - let mut rng = thread_rng(); - shard.shuffle(&mut rng); - - Ok(shard) -} - /*============================================================= = Main Function = =============================================================*/ -#[tokio::main] -async fn main() -> std::io::Result<()> { + + + +fn main() -> Result<()> { let args = ArgParser::parse(); match &args.command { - Commands::Bff {inputs, output_directory, bff_args} => - { - assert!(bff_args.shard_num < bff_args.total_shards, "Shard num must be <= total shards"); - bff(inputs, output_directory, &bff_args)?; + 
Commands::Bff {inputs, output_directory, bloom_filter_file, expected_ngram_count, + fp_rate, min_ngram_size, max_ngram_size, filtering_threshold, + remove_type, no_update_bloom_filter, annotate, + threads, no_save_bloom_filter, no_progress_bar, shard_num, total_shards} => + { + assert!(shard_num < total_shards, "Shard num must be < total shards"); + bff(inputs, output_directory, bloom_filter_file, expected_ngram_count, + fp_rate, min_ngram_size, max_ngram_size, filtering_threshold, + remove_type, no_update_bloom_filter, annotate, + threads, no_save_bloom_filter, no_progress_bar, shard_num, total_shards)?; }, - - Commands::BffRemote {bucket, input_dir, output_dir, bff_args, num_retries, num_global_retries} => { - assert!(bff_args.shard_num < bff_args.total_shards, "Shard num must be <= total shards"); - bff_remote(bucket, input_dir, output_dir, &bff_args, num_retries, num_global_retries).await?; - } Commands::Sysreq {expected_ngram_count, fp_rate} => { let bff_size = compute_bloom_size(*fp_rate, *expected_ngram_count, false); let num_hashers = BloomFilter::optimal_number_of_hashers(bff_size, *expected_ngram_count); println!("To handle {} tokens with fp rate {}, you'd need a filter of size {} and {} hashers", - expected_ngram_count, fp_rate, human_bytes(bff_size as f64), num_hashers); + expected_ngram_count, fp_rate, human_bytes(bff_size as f64), num_hashers); }, - } + } Ok(()) } -fn bff(inputs: &Vec, output_directory: &PathBuf, bff_args: &BffArgs) -> std::io::Result<()> { - /* - General pseudocode: - Setup: - - Build/setup the bloom filter - - Expand all the inputs - - Setup progress bar - Main loop: - - loop over all files and process them - Finalize: - - Write bff if needed - */ - // SETUP PHASE + +fn bff(inputs: &Vec, output_directory: &PathBuf, bloom_filter_file: &Option, + expected_ngram_count: &usize, fp_rate: &f64, min_ngram_size: &usize, max_ngram_size: &usize, + filtering_threshold: &f64, remove_type: &RemoveType, no_update_bloom_filter: &bool, + annotate: 
&bool, threads: &usize, no_save_bloom_filter: &bool, + no_progress_bar: &bool, shard_num: &usize, total_shards: &usize) -> Result<()> { + + + // SETUP PHASE: + // Set up {output_location, filter, inputs, threading, progress bar} let start_time = Instant::now(); - create_dir_if_not_exists(output_directory).unwrap(); - let bloom_filter = Arc::new(BloomFilter::from_args(bff_args)); - let all_inputs = expand_dirs(inputs).unwrap(); + create_dir_if_not_exists(output_directory).unwrap(); + let bloom_filter = Arc::new(BloomFilter::from_args(bloom_filter_file.clone(), *expected_ngram_count, *fp_rate)); + - // Select shard and then shuffle + // Setup input files + let all_inputs = expand_dirs(inputs).unwrap(); let mut shard: Vec = Vec::new(); - let mut idx = bff_args.shard_num; + let mut idx = *shard_num; while idx < all_inputs.len() { shard.push(all_inputs[idx].clone()); - idx += bff_args.total_shards; + idx += total_shards; } - // Then shuffle let mut rng = thread_rng(); shard.shuffle(&mut rng); + // Setup threads + let threads = if *threads == 0 { + available_parallelism().unwrap().get() + } else { + *threads + }; - - + // Setup progress bar let pbar = ProgressBar::new(shard.len() as u64) .with_style( ProgressStyle::with_template( @@ -988,225 +745,86 @@ fn bff(inputs: &Vec, output_directory: &PathBuf, bff_args: &BffArgs) -> ).unwrap() ); let pbar = Arc::new(Mutex::new(pbar)); + if !no_progress_bar { + pbar.lock().unwrap().inc(0); // Makes pbar show up with 0/N files complete + } println!("Completed setup phase in {:?} seconds", start_time.elapsed().as_secs()); - if !bff_args.no_progress { - pbar.lock().unwrap().inc(0); // initializes pbar - } - - // LOOP PHASE (W/ Threadpool) - let threads = if bff_args.threads == 0 { - available_parallelism().unwrap().get() - } else { - bff_args.threads - }; + // LOOP PHASE(using threadpool) let loop_start_time = Instant::now(); - let total_items = Arc::new(Mutex::new(0)); - let removed_items = Arc::new(Mutex::new(0)); + let total_bytes 
= Arc::new(Mutex::new(0)); + let removed_bytes = Arc::new(Mutex::new(0)); let threadpool = ThreadPool::new(threads); for input in shard { - //let mut output = output_directory.clone(); let output = output_directory.clone().join(input.file_name().unwrap()); - //output.push(input.file_name().unwrap()); let bloom_filter = bloom_filter.clone(); - let bff_args = bff_args.clone(); - let total_items = Arc::clone(&total_items); - let removed_items = Arc::clone(&removed_items); - let pbar_option: Option>> = if bff_args.no_progress { + let pbar_option: Option>> = if *no_progress_bar { None } else { Some(pbar.clone()) }; + let min_ngram_size = min_ngram_size.clone(); + let max_ngram_size = max_ngram_size.clone(); + let filtering_threshold = filtering_threshold.clone(); + let remove_type = remove_type.clone(); + let no_update_bloom_filter = no_update_bloom_filter.clone(); + let annotate = annotate.clone(); + let no_progress_bar = no_progress_bar.clone(); + let total_bytes = Arc::clone(&total_bytes); + let removed_bytes = Arc::clone(&removed_bytes); + threadpool.execute(move || { - if bff_args.no_progress { + if no_progress_bar { println!("Processing {input:?}..."); } - let (removed_doc_items, total_doc_items) = process_file( + let (removed_doc_bytes, total_doc_bytes) = process_file( &input, &output, &bloom_filter, - &bff_args, - &pbar_option, - ) - .unwrap(); - - let mut total_guard = total_items.lock().unwrap(); - *total_guard += total_doc_items; - - let mut removed_guard = removed_items.lock().unwrap(); - *removed_guard += removed_doc_items; - + max_ngram_size, + min_ngram_size, + &remove_type, + filtering_threshold.clone(), + no_update_bloom_filter.clone(), + annotate.clone(), + &pbar_option + ).unwrap(); + + let mut total_guard = total_bytes.lock().unwrap(); + *total_guard += total_doc_bytes; + + let mut removed_guard = removed_bytes.lock().unwrap(); + *removed_guard += removed_doc_bytes; }); } - threadpool.join(); + threadpool.join(); println!("Completed filtering all 
files in {:?} seconds", - loop_start_time.elapsed().as_secs()); - - - // FINALIZE PHASE - match &bff_args.bloom_filter_file { - Some(path) => { - if (!bff_args.no_update_bloom_filter) && (!bff_args.no_save_bloom_filter) { - let write_start_time = Instant::now(); - println!("Writing bloom filter to {:?}...", path); - bloom_filter.write_to_file(&path).unwrap(); - println!("...Bloom filter written in {:?} seconds.", write_start_time.elapsed().as_secs()); - } - } - _ => {} - } - - - println!("After running, BFF sparsity was {:?}", bloom_filter.calculate_sparsity()); - - println!("Completed full BFF run in {:?} seconds", start_time.elapsed().as_secs()); + loop_start_time.elapsed().as_secs()); - let total_items = *total_items.lock().unwrap(); - let removed_items = *removed_items.lock().unwrap(); - println!("Stats: Saw {} items | Removed {} of them", - total_items, removed_items as f64 / total_items as f64); - Ok(()) -} - - - -async fn bff_remote(bucket: &String, input_dir: &String, output_dir: &String, bff_args: &BffArgs, num_retries: &usize, num_global_retries: &usize) -> std::io::Result<()> { - /* - General pseudocode: - Setup: - - Build/setup the bloom filter - - Setup thing to read s3_io - - Setup progress bar - Main loop: - - loop over all files and process them - Finalize: - - Write bff if needed - */ - let start_time = Instant::now(); - let bloom_filter = Arc::new(BloomFilter::from_args(bff_args)); - - let mut io_pairs = gather_s3_io(bucket, input_dir, output_dir, - bff_args.shard_num, bff_args.total_shards).await.unwrap(); - println!("Collected {} input files...", io_pairs.len()); - - let num_files = io_pairs.len(); - let err_count = Arc::new(Mutex::new(0)); - let pbar = ProgressBar::new(num_files as u64) - .with_style( - ProgressStyle::with_template( - "Files {human_pos}/{human_len} [{elapsed_precise}/{duration_precise}] [{wide_bar:.cyan/blue}]", - ).unwrap() - ); - let pbar = Arc::new(Mutex::new(pbar)); - println!("Completed setup phase in {:?} seconds", 
start_time.elapsed().as_secs()); - - if !bff_args.no_progress { - pbar.lock().unwrap().inc(0); // initializes pbar - } - - - let loop_start_time = Instant::now(); - let total_items = Arc::new(Mutex::new(0)); - let removed_items = Arc::new(Mutex::new(0)); - let threads = if bff_args.threads == 0 { - available_parallelism().unwrap().get() - } else { - bff_args.threads - }; - let threadpool = ThreadPool::new(threads); - - for retry_count in 0..*num_global_retries { - let failed_io_pairs: Arc>> = Arc::new(Mutex::new(Vec::new())); - let mut rng = rand::thread_rng(); - for io_pair in &io_pairs { - let num_retries = (*num_retries).clone(); - let num_global_retries = (*num_global_retries).clone(); - let retry_count = retry_count.clone(); - let bucket = bucket.clone(); - let bloom_filter = bloom_filter.clone(); - let bff_args = bff_args.clone(); - let failed_io_pairs = Arc::clone(&failed_io_pairs); - let err_count: Arc> = Arc::clone(&err_count); - let total_items = Arc::clone(&total_items); - let removed_items = Arc::clone(&removed_items); - let pbar_option: Option>> = if bff_args.no_progress { - None - } else { - Some(pbar.clone()) - }; - - let (input_path, output_path) = io_pair.clone(); - threadpool.execute(move || { - let rt = tokio::runtime::Builder::new_current_thread() - .enable_all() - .build() - .unwrap(); - let result = rt.block_on( - process_file_s3(&bucket, - &input_path, - &output_path, - &bloom_filter, - &bff_args, - &pbar_option, - num_retries) - ); - match result { - Ok(outputs) => { - let (rem_doc_items, tot_doc_items) = outputs; - let mut total_guard = total_items.lock().unwrap(); - *total_guard += tot_doc_items; - let mut removed_guard = removed_items.lock().unwrap(); - *removed_guard += rem_doc_items; - } - Err(err) => { - eprintln!("Round {}/{}: Error processing {}; {:?}", retry_count+1, num_global_retries, input_path, err); - if retry_count < num_global_retries - 1 { - // in all but last round, push the failed pair to failed_io_pairs - let mut 
fail_guard = failed_io_pairs.lock().unwrap(); - fail_guard.push((input_path, output_path)); - } else { - // in last round, give up and mark this one as an error - let mut count = err_count.lock().unwrap(); - *count += 1; - } - - } - } - }); - // Wait a little before spawning the next processor. - let random_delay = rng.gen_range(Duration::from_millis(0)..Duration::from_millis(100)); - sleep(random_delay).await; - } - threadpool.join(); - io_pairs = failed_io_pairs.lock().unwrap().clone(); - } - println!("Completed filtering all files in {:?} seconds", - loop_start_time.elapsed().as_secs()); // FINALIZE PHASE - match &bff_args.bloom_filter_file { + // Save bloom filter + match &bloom_filter_file { Some(path) => { - if (!bff_args.no_update_bloom_filter) && (!bff_args.no_save_bloom_filter) { + if !no_update_bloom_filter && !no_save_bloom_filter { let write_start_time = Instant::now(); println!("Writing bloom filter to {:?}...", path); bloom_filter.write_to_file(&path).unwrap(); - println!("...Bloom filter written in {:?} seconds.", write_start_time.elapsed().as_secs()); + println!("...Bloom filter written in {:?} seconds.", write_start_time.elapsed().as_secs()); } + } _ => {} } - - println!("Error count is {}/{}", err_count.lock().unwrap(), num_files); + // Print out summary println!("After running, BFF sparsity was {:?}", bloom_filter.calculate_sparsity()); - println!("Completed full BFF run in {:?} seconds", start_time.elapsed().as_secs()); - let total_items = *total_items.lock().unwrap(); - let removed_items = *removed_items.lock().unwrap(); - println!("Stats: Saw {} items | Removed {} of them", - total_items, removed_items as f64 / total_items as f64); - Ok(()) -} - + let total_bytes = *total_bytes.lock().unwrap(); + let removed_bytes = *removed_bytes.lock().unwrap(); + println!("Stats: Saw {} of text | Removed {} of them", + human_bytes(total_bytes as f64), removed_bytes as f64 / total_bytes as f64); + Ok(()) +} \ No newline at end of file