6868//! | 1,000,000 | 0.00001 | 131,072 | 4,096 |
6969//! | 1,000,000 | 0.000001 | 262,144 | 8,192 |
7070//!
71+ //! # Bloom Filter Folding
72+ //!
73+ //! When the NDV is not known ahead of time, bloom filters support a **folding mode** that
74+ //! eliminates the need to guess NDV upfront. See [`Sbbf::fold_to_target_fpp`] for details
75+ //! on the algorithm and its mathematical basis.
76+ //!
7177//! [parquet-bf-spec]: https://github.com/apache/parquet-format/blob/master/BloomFilter.md
7278//! [sbbf-paper]: https://arxiv.org/pdf/2101.01719
7379//! [bf-formulae]: http://tfk.mit.edu/pdf/bloom.pdf
@@ -191,7 +197,21 @@ impl std::ops::IndexMut<usize> for Block {
191197 }
192198}
193199
194- /// A split block Bloom filter.
200+ /// A split block Bloom filter (SBBF).
201+ ///
202+ /// An SBBF partitions its bit space into fixed-size 256-bit (32-byte) blocks, each fitting in a
203+ /// single CPU cache line. Each block contains eight 32-bit words, aligned with SIMD lanes for
204+ /// parallel bit manipulation. When checking membership, only one block is accessed per query,
205+ /// so a lookup incurs at most one cache miss instead of up to one per probed bit.
206+ ///
207+ /// ## Two sizing modes
208+ ///
209+ /// - **Fixed-size mode**: Created via [`Sbbf::new_with_ndv_fpp`] when the number of distinct
210+ /// values (NDV) is known. The filter is sized exactly for the given NDV and FPP.
211+ ///
212+ /// - **Folding mode**: Created via [`Sbbf::new_with_num_of_bytes`] at a conservatively large
213+ /// size, then compacted after all values are inserted by calling [`Sbbf::fold_to_target_fpp`].
214+ /// This eliminates the need to know NDV upfront.
195215///
196216/// The creation of this structure is based on the [`crate::file::properties::BloomFilterProperties`]
197217/// struct set via [`crate::file::properties::WriterProperties`] and is thus hidden by default.
@@ -436,10 +456,35 @@ impl Sbbf {
436456 self . 0 . len ( )
437457 }
438458
439- /// Fold the bloom filter once by merging adjacent block pairs via bitwise OR,
440- /// halving the filter size. Block[2i] and Block[2i+1] are merged into a single block
441- /// at position [i] in the folded filter. This preserves correctness because
442- /// `hash_to_block_index` maps to `floor(original_index / 2)` when `num_blocks` is halved.
459+ /// Fold the bloom filter once, halving its size by merging adjacent block pairs.
460+ ///
461+ /// This implements an elementary folding operation for Split Block Bloom
462+ /// Filters<sup>[1]</sup>. Each pair of adjacent blocks is combined via bitwise OR:
463+ ///
464+ /// ```text
465+ /// folded[i] = blocks[2*i] | blocks[2*i + 1] for 0 <= i < num_blocks/2
466+ /// ```
467+ ///
468+ /// ## Why adjacent pairs (not halves)?
469+ ///
470+ /// Standard Bloom filter folding merges the two halves (`B[i] | B[i + m/2]`) because
471+ /// standard filters use modular hashing: `index = h(x) mod m`. Halving the modulus to
472+ /// `m/2` sends the old indices `i` and `i + m/2` (for `i < m/2`) to the same position `i`.
473+ ///
474+ /// SBBFs use **multiplicative** hashing for block selection:
475+ ///
476+ /// ```text
477+ /// block_index = ((hash >> 32) * num_blocks) >> 32
478+ /// ```
479+ ///
480+ /// When an even `num_blocks` (`N`) is halved, the new index is `floor(original_index / 2)`:
481+ /// with `h = hash >> 32`, `(h * (N/2)) >> 32 == ((h * N) >> 32) >> 1`. Therefore blocks `2i`
482+ /// and `2i+1` (not `i` and `i + N/2`) map to the same position `i` in the folded filter.
483+ ///
484+ /// ## References
485+ ///
486+ /// 1. Sailhan, F. & Stehr, M-O. "Folding and Unfolding Bloom Filters",
487+ /// IEEE iThings 2012. <https://doi.org/10.1109/GreenCom.2012.16>
443488 ///
444489 /// # Panics
445490 ///
@@ -458,9 +503,16 @@ impl Sbbf {
458503
459504 /// Estimate the FPP that would result from folding once, without mutating the filter.
460505 ///
461- /// SBBF checks are per-block: a query hashes to one block, then checks 8 bits within it.
462- /// The FPP is therefore the average of per-block FPPs, not a function of global fill.
463- /// For each merged block pair (2i, 2i+1), we compute `(block_fill)^8` and average.
506+ /// Unlike standard Bloom filters where FPP depends on the global fill ratio, SBBF
507+ /// membership checks are **per-block**: a query hashes to exactly one block, then checks
508+ /// `k=8` bits within that block. The FPP is therefore the **average of per-block FPPs**:
509+ ///
510+ /// ```text
511+ /// FPP = (1 / num_blocks) * sum_i (set_bits_in_block_i / 256)^8
512+ /// ```
513+ ///
514+ /// To project the FPP after a fold, we simulate the merge of each adjacent pair `(2i, 2i+1)`
515+ /// by computing `(block[2i] | block[2i+1]).count_ones()` without actually mutating the filter.
464516 fn estimated_fpp_after_fold ( & self ) -> f64 {
465517 let half = self . 0 . len ( ) / 2 ;
466518 let mut total_fpp = 0.0 ;
@@ -475,15 +527,74 @@ impl Sbbf {
475527 total_fpp / half as f64
476528 }
477529
478- /// Fold the bloom filter down until reaching the target false positive probability.
530+ /// Fold the bloom filter down to the smallest size that still meets the target FPP.
531+ ///
532+ /// Repeatedly halves the filter by merging adjacent block pairs (see [`Self::fold_once`]),
533+ /// stopping when the next fold would cause the estimated FPP to exceed `target_fpp`, or
534+ /// when the filter reaches the minimum size of 1 block (32 bytes).
535+ ///
536+ /// ## Background
537+ ///
538+ /// Bloom filter folding is a technique for dynamically resizing filters after
539+ /// construction. For a standard Bloom filter of size `m` (a power of two), an element
540+ /// hashed to index `i = h(x) mod m` would map to `i' = h(x) mod (m/2)` in a filter
541+ /// half the size. Since `m` is a power of two, the fold is a bitwise OR of the upper half
542+ /// onto the lower half: `B_folded[j] = B[j] | B[j + m/2]`.
543+ ///
544+ /// ## Adaptation for SBBF
545+ ///
546+ /// SBBFs use multiplicative hashing for block selection rather than modular arithmetic:
547+ ///
548+ /// ```text
549+ /// block_index = ((hash >> 32) * num_blocks) >> 32
550+ /// ```
551+ ///
552+ /// When `num_blocks` is halved, the new index becomes `floor(original_index / 2)`, so
553+ /// blocks `2i` and `2i+1` (not `i` and `i+N/2`) map to the same position. The fold
554+ /// therefore merges **adjacent** pairs:
555+ ///
556+ /// ```text
557+ /// folded[i] = blocks[2*i] | blocks[2*i + 1]
558+ /// ```
559+ ///
560+ /// ## FPP estimation
561+ ///
562+ /// SBBF membership checks are per-block (`k=8` bit checks within one 256-bit block), so
563+ /// the FPP is the average of per-block false positive probabilities:
564+ ///
565+ /// ```text
566+ /// FPP = (1 / num_blocks) * sum_i (set_bits_in_block_i / 256)^8
567+ /// ```
568+ ///
569+ /// Before each fold, we project the post-fold FPP by simulating the block merges. Folding
570+ /// stops when the next fold would exceed the target.
571+ ///
572+ /// ## Correctness
573+ ///
574+ /// Folding **never introduces false negatives**. Every bit that was set in the original
575+ /// filter remains set in the folded filter (via bitwise OR). The only effect is a controlled
576+ /// increase in FPP as set bits from different blocks are merged together.
577+ ///
578+ /// ## Typical usage
579+ ///
580+ /// ```text
581+ /// // 1. Allocate large (worst-case NDV = max row group rows)
582+ /// let mut sbbf = Sbbf::new_with_num_of_bytes(1_048_576); // 1 MiB
583+ ///
584+ /// // 2. Insert all values during column writing
585+ /// for value in column_values {
586+ /// sbbf.insert(&value);
587+ /// }
588+ ///
589+ /// // 3. Fold down to target FPP before serializing
590+ /// sbbf.fold_to_target_fpp(0.05);
591+ /// // Filter is now optimally sized for the actual data
592+ /// ```
479593 ///
480- /// Repeatedly halves the filter by OR-ing the upper half into the lower half, stopping
481- /// when the next fold would cause the estimated FPP to exceed `target_fpp`, or when the
482- /// filter reaches the minimum size of 1 block (32 bytes).
594+ /// ## References
483595 ///
484- /// This is useful when a bloom filter was allocated conservatively large and needs to be
485- /// compacted after all values have been inserted. Folding preserves all set bits, so there
486- /// are never false negatives — only a controlled increase in false positive probability.
596+ /// - Sailhan, F. & Stehr, M-O. "Folding and Unfolding Bloom Filters",
597+ /// IEEE iThings 2012. <https://doi.org/10.1109/GreenCom.2012.16>
487598 pub fn fold_to_target_fpp ( & mut self , target_fpp : f64 ) {
488599 while self . 0 . len ( ) >= 2 {
489600 if self . estimated_fpp_after_fold ( ) > target_fpp {
0 commit comments