Skip to content

Commit 507e739

Browse files
authored
Add tests proving this works
2 parents 4c4e4ea + a6a232b commit 507e739

File tree

1 file changed

+193
-0
lines changed
  • parquet/src/bloom_filter

1 file changed

+193
-0
lines changed

parquet/src/bloom_filter/mod.rs

Lines changed: 193 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -897,4 +897,197 @@ mod tests {
897897
);
898898
}
899899
}
900+
901+
/*
902+
Ok, so the following is trying to prove in simple terms that folding an SBBF and
903+
building a fresh smaller SBBF from scratch produces the exact same bits
904+
905+
If you insert the same values into a 512-block filter and fold it to 256 blocks,
906+
you get a bit-for-bit identical result to just inserting those values into a
907+
256-block filter directly. The fold doesn't lose information or scramble anything,
908+
it's like you had known the right size all along
909+
910+
This works because of the 2 lemmas:
911+
1. when you halve the filter, each hash's block index divides cleanly by 2
912+
so the hash that went to block `i` in the big filter goes to block `i/2` in the small one
913+
which is exactly where the fold puts it
914+
> this is trivial since floor(x/2) == floor(floor(x) / 2) is a basic math fact
915+
916+
2. the bit pattern set _within_ a block depends only on the lower 32 bits of the hash,
917+
which doesn't change with filter size. So the same bits get set regardless!
918+
> structurally trivial, mask() takes a u32 and uses only the SALT constants.
919+
920+
921+
When you combine it together, every hash sets the same bits in the same destination block
922+
whether you fold or build fresh. Therefore the filters are bit-identical
923+
*/
924+
/// Checks, for filters of 64, 256 and 1024 blocks, that folding once yields the
/// same blocks as inserting the same 5000 hashes into a fresh half-size filter.
/// Per hash it also asserts the two facts the equality rests on: the half-size
/// block index equals the original index divided by two, and the hash's mask
/// overlaps the element's block in the original filter.
#[test]
925+
fn test_sbbf_folded_equals_fresh() {
926+
// hash every value once up front so each filter below is fed identical hashes
let values = (0..5000).map(|i| format!("elem_{i}")).collect::<Vec<_>>();
927+
let hashes = values
928+
.iter()
929+
.map(|v| hash_as_bytes(v.as_str()))
930+
.collect::<Vec<_>>();
931+
932+
for num_blocks in [64, 256, 1024] {
933+
let half = num_blocks / 2;
934+
935+
// original filter: sized as num_blocks * 32 bytes, which the assertion
// right below ties to exactly `num_blocks` blocks
936+
let mut original = Sbbf::new_with_num_of_bytes(num_blocks * 32);
937+
assert_eq!(original.num_blocks(), num_blocks);
938+
for &h in &hashes {
939+
original.insert_hash(h);
940+
}
941+
942+
for &h in hashes.iter() {
943+
// the mask is derived from the hash cast to u32 only, never from the filter size
let mask = Block::mask(h as u32);
944+
945+
// step 1: element's block in original
946+
let orig_idx = original.hash_to_block_index(h);
947+
assert!(orig_idx < num_blocks);
948+
949+
// step 2 (lemma 1): destination in N/2 filter
// a throwaway `half`-block filter of Block::ZERO is built only to ask it for the index
950+
let fresh_idx = {
951+
let tmp = Sbbf(vec![Block::ZERO; half]);
952+
tmp.hash_to_block_index(h)
953+
};
954+
955+
// the index the element is expected to land on after one fold
let folded_idx = orig_idx / 2;
956+
assert_eq!(fresh_idx, folded_idx,);
957+
958+
// step 3 (lemma 2): each of the 8 mask words shares at least one set bit
// with the element's block in the original filter
959+
for w in 0..8 {
960+
assert_ne!(original.0[orig_idx].0[w] & mask.0[w], 0,);
961+
}
962+
}
963+
964+
// step 4: verify the actual blocks match after a real fold
// (fold a clone so `original` itself is not modified)
965+
let mut folded = original.clone();
966+
folded.fold_once();
967+
assert_eq!(folded.num_blocks(), half);
968+
969+
// reference filter: half the size, populated from scratch with the same hashes
let mut fresh = Sbbf::new_with_num_of_bytes(half * 32);
970+
for &h in &hashes {
971+
fresh.insert_hash(h);
972+
}
973+
974+
for j in 0..half {
975+
assert_eq!(
976+
folded.0[j].0, fresh.0[j].0,
977+
"Step 4 failed: block {j} differs (N={num_blocks}→{half})"
978+
);
979+
}
980+
}
981+
}
982+
983+
/// show multi-step folding.
984+
///
985+
/// You can apply the above inductively: folding k times from N blocks produces a filter bit-identical to a fresh N/2^k filter
///
/// The same filter is folded repeatedly, and after every fold it is compared
/// block by block against a filter of the new size built from the same values.
986+
#[test]
987+
fn test_multi_step_fold() {
988+
let values = (0..3000).map(|i| format!("x_{i}")).collect::<Vec<_>>();
989+
990+
// starting size is 512 * 32 bytes; the first fold below is asserted to leave 256 blocks
let mut filter = Sbbf::new_with_num_of_bytes(512 * 32);
991+
for v in &values {
992+
filter.insert(v.as_str());
993+
}
994+
995+
// `filter` is folded in place on every iteration, so the folds accumulate
for expected_blocks in [256, 128, 64, 32, 16, 8, 4, 2, 1] {
996+
filter.fold_once();
997+
assert_eq!(filter.num_blocks(), expected_blocks);
998+
999+
// reference filter of the current size, rebuilt from scratch each round
let mut fresh = Sbbf::new_with_num_of_bytes(expected_blocks * 32);
1000+
for v in &values {
1001+
fresh.insert(v.as_str());
1002+
}
1003+
// zip pairs blocks only up to the shorter of the two filters
for (fb, rb) in filter.0.iter().zip(fresh.0.iter()) {
1004+
assert_eq!(fb.0, rb.0,);
1005+
}
1006+
}
1007+
}
1008+
1009+
/// test that the fpp estimator's overestimation doesn't cause fold_to_target_fpp
1010+
/// to produce significantly oversized filters
1011+
///
1012+
/// compare the final size after folding against the theoretical optimal size
///
/// NOTE(review): the assertion at the end requires the two sizes to be exactly
/// equal (ratio == 1.0), which is stricter than "not significantly oversized".
1013+
#[test]
1014+
fn test_fold_size_vs_optimal_fixed_size() {
1015+
// (number of distinct values inserted, target false-positive probability)
for (ndv, target_fpp) in [
1016+
(1000, 0.05),
1017+
(1000, 0.01),
1018+
(5000, 0.05),
1019+
(5000, 0.01),
1020+
(10000, 0.05),
1021+
] {
1022+
let values = (0..ndv).map(|i| format!("d_{i}")).collect::<Vec<_>>();
1023+
1024+
// start from a fixed 128KB filter and let fold_to_target_fpp shrink it
let mut folded = Sbbf::new_with_num_of_bytes(128 * 1024); // 128KB
1025+
for v in &values {
1026+
folded.insert(v.as_str());
1027+
}
1028+
folded.fold_to_target_fpp(target_fpp);
1029+
1030+
// block count converted to bytes using the same 32-bytes-per-block factor as the other tests
let folded_bytes = folded.num_blocks() * 32;
1031+
1032+
// reference size: the filter new_with_ndv_fpp builds for this ndv and target fpp
let optimal = Sbbf::new_with_ndv_fpp(ndv as u64, target_fpp).unwrap();
1033+
let optimal_bytes = optimal.num_blocks() * 32;
1034+
1035+
let ratio = folded_bytes as f64 / optimal_bytes as f64;
1036+
1037+
// passes only when folded_bytes and optimal_bytes are the same number
assert_eq!(ratio, 1.0);
1038+
}
1039+
}
1040+
1041+
/// verify that a folded sbbf has the same empirical fpp as a fresh filter of the same size
1042+
/// this bridges the bit-identity proof above with the FPP guarantee from the folding paper
1043+
/// since the bits are identical, the false-positive rate must be too
1044+
///
1045+
/// we measure fpp empirically by probing with values that were never inserted
1046+
/// and counting how many are incorrectly marked as present
///
/// the counts (not rates) are compared, and they must be equal at every fold level
1047+
#[test]
1048+
fn test_folded_fpp_matches_fresh_fpp() {
1049+
let ndv = 2000;
1050+
let num_probes = 50_000;
1051+
let inserted = (0..ndv)
1052+
.map(|i| format!("ins_{i}"))
1053+
.collect::<Vec<String>>();
1054+
1055+
// probe values that were NOT inserted (different prefix guarantees no overlap)
1056+
let probes = (0..num_probes)
1057+
.map(|i| format!("probe_{i}"))
1058+
.collect::<Vec<String>>();
1059+
1060+
// build a large filter and fold it down several times
1061+
let mut folded = Sbbf::new_with_num_of_bytes(512 * 32); // 512 blocks
1062+
for v in &inserted {
1063+
folded.insert(v.as_str());
1064+
}
1065+
1066+
// check FPP at each fold level
// `folded` is folded in place, so each iteration halves the previous result
1067+
for expected_blocks in [256, 128, 64, 32, 16, 8, 4, 2, 1] {
1068+
folded.fold_once();
1069+
assert_eq!(folded.num_blocks(), expected_blocks);
1070+
1071+
// build a fresh filter of the same size with the same values
1072+
let mut fresh = Sbbf::new_with_num_of_bytes(expected_blocks * 32);
1073+
for v in &inserted {
1074+
fresh.insert(v.as_str());
1075+
}
1076+
1077+
// measure empirical FPP on both
// every probe was never inserted, so each positive `check` is a false positive
1078+
let mut folded_fp = 0u64;
1079+
let mut fresh_fp = 0u64;
1080+
for p in &probes {
1081+
if folded.check(p.as_str()) {
1082+
folded_fp += 1;
1083+
}
1084+
if fresh.check(p.as_str()) {
1085+
fresh_fp += 1;
1086+
}
1087+
}
1088+
1089+
// bit-identity means these must be exactly equal
1090+
assert_eq!(folded_fp, fresh_fp);
1091+
}
1092+
}
9001093
}

0 commit comments

Comments
 (0)