|
68 | 68 | //! | 1,000,000 | 0.00001 | 131,072 | 4,096 | |
69 | 69 | //! | 1,000,000 | 0.000001 | 262,144 | 8,192 | |
70 | 70 | //! |
| 71 | +//! # Structure: Filter → Blocks → Words → Bits |
| 72 | +//! |
| 73 | +//! An SBBF is an array of **blocks**. Each block is 256 bits (32 bytes), |
| 74 | +//! divided into eight 32-bit **words**. A word is just a `u32` — an array of |
| 75 | +//! 32 individual bits that can each be "set" (1) or "not set" (0). |
| 76 | +//! |
| 77 | +//! ```text |
| 78 | +//! Sbbf (the whole filter) |
| 79 | +//! ┌──────────┬──────────┬──────────┬─── ─── ──┬──────────┐ |
| 80 | +//! │ Block 0 │ Block 1 │ Block 2 │ ... │ Block N-1│ |
| 81 | +//! └──────────┴──────────┴──────────┴─── ─── ──┴──────────┘ |
| 82 | +//! │ |
| 83 | +//! ▼ |
| 84 | +//! One Block = 256 bits = 8 words |
| 85 | +//! ┌────────┬────────┬────────┬────────┬────────┬────────┬────────┬────────┐ |
| 86 | +//! │ word 0 │ word 1 │ word 2 │ word 3 │ word 4 │ word 5 │ word 6 │ word 7 │ |
| 87 | +//! │ (u32) │ (u32) │ (u32) │ (u32) │ (u32) │ (u32) │ (u32) │ (u32) │ |
| 88 | +//! └────────┴────────┴────────┴────────┴────────┴────────┴────────┴────────┘ |
| 89 | +//! │ |
| 90 | +//! ▼ |
| 91 | +//! One Word = 32 individual bits |
| 92 | +//! ┌─┬─┬─┬─┬─┬─┬─┬─┬─┬─┬─┬─┬─┬─┬─┬─┬─┬─┬─┬─┬─┬─┬─┬─┬─┬─┬─┬─┬─┬─┬─┬─┐ |
| 93 | +//! │0│0│1│0│0│0│0│0│0│0│0│0│0│0│0│0│0│0│0│0│0│0│0│0│0│0│0│0│0│0│0│0│ ← bit 29 is set |
| 94 | +//! └─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┴─┘ |
| 95 | +//! ``` |
| 96 | +//! |
| 97 | +//! **Inserting** a value hashes it to a 64-bit number, then: |
| 98 | +//! 1. The upper 32 bits pick which **block** (via `Sbbf::hash_to_block_index`). |
| 99 | +//! 2. The lower 32 bits pick one bit position in each of the 8 **words** (via `Block::mask`). |
| 100 | +//! So each insert sets exactly **8 bits** (one per word) in a single block. |
| 101 | +//! |
| 102 | +//! **Checking** does the same two steps and returns `true` only if all 8 bits |
| 103 | +//! are already set — meaning the value was *probably* inserted (or is a false |
| 104 | +//! positive). |
| 105 | +//! |
71 | 106 | //! # Bloom Filter Folding |
72 | 107 | //! |
73 | | -//! When the NDV is not known ahead of time, bloom filters support a **folding mode** that |
74 | | -//! eliminates the need to guess NDV upfront. See [`Sbbf::fold_to_target_fpp`] for details |
75 | | -//! on the algorithm and its mathematical basis. |
| 108 | +//! After inserting all values into a bloom filter, it can be "folded" to minimize its size. |
| 109 | +//! See [`Sbbf::fold_to_target_fpp`] for details on the algorithm and its mathematical basis. |
76 | 110 | //! |
77 | 111 | //! [parquet-bf-spec]: https://github.com/apache/parquet-format/blob/master/BloomFilter.md |
78 | 112 | //! [sbbf-paper]: https://arxiv.org/pdf/2101.01719 |
@@ -120,8 +154,22 @@ pub struct BloomFilterHeader { |
120 | 154 | } |
121 | 155 | ); |
122 | 156 |
|
123 | | -/// Each block is 256 bits, broken up into eight contiguous "words", each consisting of 32 bits. |
124 | | -/// Each word is thought of as an array of bits; each bit is either "set" or "not set". |
| 157 | +/// A single 256-bit block, the basic unit of the Split Block Bloom Filter. |
| 158 | +/// |
| 159 | +/// A block is eight contiguous 32-bit **words** (`[u32; 8]`). |
| 160 | +/// Each word is an independent bit-array of 32 positions: |
| 161 | +/// |
| 162 | +/// ```text |
| 163 | +/// Block (256 bits total) |
| 164 | +/// ┌────────┬────────┬────────┬────────┬────────┬────────┬────────┬────────┐ |
| 165 | +/// │ word 0 │ word 1 │ word 2 │ word 3 │ word 4 │ word 5 │ word 6 │ word 7 │ |
| 166 | +/// │ 32 bits│ 32 bits│ 32 bits│ 32 bits│ 32 bits│ 32 bits│ 32 bits│ 32 bits│ |
| 167 | +/// └────────┴────────┴────────┴────────┴────────┴────────┴────────┴────────┘ |
| 168 | +/// ``` |
| 169 | +/// |
| 170 | +/// When a value is inserted, [`Block::mask`] picks one bit in each word |
| 171 | +/// (8 bits total), and those bits are OR'd in. When checking, we verify |
| 172 | +/// all 8 bits are set. |
125 | 173 | #[derive(Debug, Copy, Clone)] |
126 | 174 | #[repr(transparent)] |
127 | 175 | struct Block([u32; 8]); |
@@ -561,11 +609,12 @@ impl Sbbf { |
561 | 609 | /// Folding **never introduces false negatives**. Every bit that was set in the original |
562 | 610 | /// filter remains set in the folded filter (via bitwise OR). The only effect is a controlled |
563 | 611 | /// increase in FPP as set bits from different blocks are merged together. |
| 612 | + /// This was originally proven in [Sailhan & Stehr 2012] for standard bloom filters and is empirically |
| 613 | + /// demonstrated for SBBFs in Lemma 1 and Lemma 2 of the tests. |
564 | 614 | /// |
565 | 615 | /// ## References |
566 | 616 | /// |
567 | | - /// - Sailhan, F. & Stehr, M-O. "Folding and Unfolding Bloom Filters", |
568 | | - /// IEEE iThings 2012. <https://doi.org/10.1109/GreenCom.2012.16> |
| 617 | + /// [Sailhan & Stehr 2012]: https://doi.org/10.1109/GreenCom.2012.16 |
569 | 618 | pub fn fold_to_target_fpp(&mut self, target_fpp: f64) { |
570 | 619 | let num_folds = self.num_folds_for_target_fpp(target_fpp); |
571 | 620 | if num_folds > 0 { |
|
0 commit comments