@@ -897,4 +897,197 @@ mod tests {
897897 ) ;
898898 }
899899 }
900+
/// Folding an SBBF produces exactly the same bits as building a fresh, smaller SBBF.
///
/// If the same values are inserted into an N-block filter which is then folded to N/2
/// blocks, the result is bit-for-bit identical to inserting those values directly into
/// an N/2-block filter. The fold neither loses information nor scrambles anything; it is
/// as if the right size had been known all along. Checked here for N = 64, 256 and 1024.
///
/// The argument rests on two lemmas:
///
/// 1. Halving the filter halves every block index, rounding down: a hash that lands in
///    block `i` of the big filter lands in block `i / 2` of the half-size filter, which
///    is the block the fold is expected to merge it into. This is the identity
///    `floor(x / 2) == floor(floor(x) / 2)`.
/// 2. The bit pattern set within a block is derived from the lower 32 bits of the hash
///    only (`Block::mask(h as u32)`), so it does not depend on the filter size.
///
/// Combined: every hash sets the same bits in the same destination block whether the
/// filter is folded or built fresh, therefore the two filters are bit-identical.
#[test]
fn test_sbbf_folded_equals_fresh() {
    let hashes: Vec<_> = (0..5000)
        .map(|i| hash_as_bytes(format!("elem_{i}").as_str()))
        .collect();

    for num_blocks in [64, 256, 1024] {
        let half = num_blocks / 2;

        // Original (large) filter.
        let mut original = Sbbf::new_with_num_of_bytes(num_blocks * 32);
        assert_eq!(
            original.num_blocks(),
            num_blocks,
            "original filter was not created with the requested block count"
        );
        for &h in &hashes {
            original.insert_hash(h);
        }

        // Empty half-size filter, used only to ask where a hash would land in a filter
        // of N/2 blocks. It does not change per hash, so it is built once per size.
        let half_sized = Sbbf(vec![Block::ZERO; half]);

        for &h in &hashes {
            // Step 1: the element's block in the original filter.
            let orig_idx = original.hash_to_block_index(h);
            assert!(
                orig_idx < num_blocks,
                "block index {orig_idx} out of range for {num_blocks} blocks"
            );

            // Step 2 (lemma 1): its block in an N/2 filter is the original index halved.
            let fresh_idx = half_sized.hash_to_block_index(h);
            assert_eq!(
                fresh_idx,
                orig_idx / 2,
                "hash {h:#x}: half-size index differs from halved original index \
                 (N={num_blocks})"
            );

            // Step 3 (lemma 2): the size-independent mask of this hash is fully present
            // in the block it was inserted into.
            let mask = Block::mask(h as u32);
            let block = &original.0[orig_idx];
            for (w, (&word, &bit)) in block.0.iter().zip(mask.0.iter()).enumerate() {
                assert_ne!(
                    word & bit,
                    0,
                    "hash {h:#x}: mask bit of word {w} missing from block {orig_idx}"
                );
            }
        }

        // Step 4: fold the original and compare it block by block with a filter built
        // from scratch at the smaller size. `original` is not needed afterwards, so it
        // is moved rather than cloned.
        let mut folded = original;
        folded.fold_once();
        assert_eq!(folded.num_blocks(), half);

        let mut fresh = Sbbf::new_with_num_of_bytes(half * 32);
        for &h in &hashes {
            fresh.insert_hash(h);
        }
        assert_eq!(fresh.num_blocks(), half);

        for (j, (folded_block, fresh_block)) in folded.0.iter().zip(fresh.0.iter()).enumerate() {
            assert_eq!(
                folded_block.0, fresh_block.0,
                "Step 4 failed: block {j} differs (N={num_blocks}→{half})"
            );
        }
    }
}
982+
/// Multi-step folding stays bit-identical to a fresh filter at every size.
///
/// Applying the single-fold result of `test_sbbf_folded_equals_fresh` inductively,
/// folding `k` times from `N` blocks produces a filter that is bit-identical to a fresh
/// filter of `N / 2^k` blocks. This test folds a 512-block filter all the way down to a
/// single block and compares against a freshly built filter after every fold.
#[test]
fn test_multi_step_fold() {
    let values: Vec<String> = (0..3000).map(|i| format!("x_{i}")).collect();

    let mut filter = Sbbf::new_with_num_of_bytes(512 * 32);
    for v in &values {
        filter.insert(v.as_str());
    }

    for expected_blocks in [256, 128, 64, 32, 16, 8, 4, 2, 1] {
        filter.fold_once();
        assert_eq!(
            filter.num_blocks(),
            expected_blocks,
            "fold did not halve the filter to {expected_blocks} blocks"
        );

        let mut fresh = Sbbf::new_with_num_of_bytes(expected_blocks * 32);
        for v in &values {
            fresh.insert(v.as_str());
        }
        // `zip` stops at the shorter input, so pin the lengths first; otherwise a size
        // mismatch would silently skip blocks instead of failing.
        assert_eq!(
            fresh.num_blocks(),
            expected_blocks,
            "fresh filter was not created with {expected_blocks} blocks"
        );

        for (j, (folded_block, fresh_block)) in filter.0.iter().zip(fresh.0.iter()).enumerate() {
            assert_eq!(
                folded_block.0, fresh_block.0,
                "block {j} differs between folded and fresh filter at {expected_blocks} blocks"
            );
        }
    }
}
1008+
/// The FPP estimator's overestimation must not make `fold_to_target_fpp` stop at an
/// oversized filter.
///
/// For each `(ndv, target_fpp)` pair, a 128 KiB filter is filled with `ndv` distinct
/// values and folded to the target FPP. Its final size is compared against the size
/// chosen by `Sbbf::new_with_ndv_fpp` for the same parameters. The check is strict: the
/// two sizes must be exactly equal, not merely close.
#[test]
fn test_fold_size_vs_optimal_fixed_size() {
    for (ndv, target_fpp) in [
        (1000_u64, 0.05),
        (1000, 0.01),
        (5000, 0.05),
        (5000, 0.01),
        (10000, 0.05),
    ] {
        let mut folded = Sbbf::new_with_num_of_bytes(128 * 1024); // 128 KiB
        for i in 0..ndv {
            folded.insert(format!("d_{i}").as_str());
        }
        folded.fold_to_target_fpp(target_fpp);
        let folded_bytes = folded.num_blocks() * 32;

        let optimal = Sbbf::new_with_ndv_fpp(ndv, target_fpp).unwrap();
        let optimal_bytes = optimal.num_blocks() * 32;

        // Compare the integer byte counts directly rather than a floating-point ratio:
        // the check is exact and a failure reports both sizes.
        assert_eq!(
            folded_bytes, optimal_bytes,
            "folded size differs from optimal size (ndv={ndv}, target_fpp={target_fpp})"
        );
    }
}
1040+
/// A folded SBBF has the same empirical false-positive count as a fresh filter of the
/// same size.
///
/// This connects the bit-identity shown by `test_sbbf_folded_equals_fresh` to the
/// false-positive rate: if the bits are identical, every lookup must give the same
/// answer on both filters.
///
/// The rate is measured empirically by probing both filters with values that were never
/// inserted and counting how many are reported as present.
#[test]
fn test_folded_fpp_matches_fresh_fpp() {
    let ndv = 2000;
    let num_probes = 50_000;

    let inserted: Vec<String> = (0..ndv).map(|i| format!("ins_{i}")).collect();
    // Probe values use a different prefix, so none of them was inserted.
    let probes: Vec<String> = (0..num_probes).map(|i| format!("probe_{i}")).collect();

    // Number of probe values a filter reports as present.
    let count_false_positives = |filter: &Sbbf| {
        probes
            .iter()
            .filter(|probe| filter.check(probe.as_str()))
            .count()
    };

    // Start from a 512-block filter and fold it down one step at a time.
    let mut folded = Sbbf::new_with_num_of_bytes(512 * 32);
    inserted.iter().for_each(|v| folded.insert(v.as_str()));

    // Compare the false-positive counts at every fold level.
    for expected_blocks in [256, 128, 64, 32, 16, 8, 4, 2, 1] {
        folded.fold_once();
        assert_eq!(folded.num_blocks(), expected_blocks);

        // Reference filter built from scratch at the same size with the same values.
        let mut fresh = Sbbf::new_with_num_of_bytes(expected_blocks * 32);
        inserted.iter().for_each(|v| fresh.insert(v.as_str()));

        // Bit-identical filters must agree on every probe, so the counts are equal.
        let folded_fp = count_false_positives(&folded);
        let fresh_fp = count_false_positives(&fresh);
        assert_eq!(folded_fp, fresh_fp);
    }
}
9001093}
0 commit comments