@@ -54,8 +54,7 @@ pub const DEFAULT_COLUMN_INDEX_TRUNCATE_LENGTH: Option<usize> = Some(64);
5454/// Default value for [`BloomFilterProperties::fpp`]
5555pub const DEFAULT_BLOOM_FILTER_FPP : f64 = 0.05 ;
5656/// Default value for [`BloomFilterProperties::ndv`]
57- #[ deprecated( note = "NDV is now optional; bloom filters use folding mode by default" ) ]
58- pub const DEFAULT_BLOOM_FILTER_NDV : u64 = 1_000_000_u64 ;
57+ pub const DEFAULT_BLOOM_FILTER_NDV : u64 = DEFAULT_MAX_ROW_GROUP_ROW_COUNT as u64 ;
5958/// Default value for [`WriterProperties::statistics_truncate_length`]
6059pub const DEFAULT_STATISTICS_TRUNCATE_LENGTH : Option < usize > = Some ( 64 ) ;
6160/// Default value for [`WriterProperties::offset_index_disabled`]
@@ -997,11 +996,13 @@ impl WriterPropertiesBuilder {
997996 self
998997 }
999998
1000- /// Sets default number of distinct values (ndv) for bloom filter for all columns.
999+ /// Sets default maximum expected number of distinct values (ndv) for bloom filter
1000+ /// for all columns (defaults to [`DEFAULT_BLOOM_FILTER_NDV`]).
10011001 ///
1002- /// When set, this activates fixed-size mode: the bloom filter is sized exactly for
1003- /// the given NDV at the configured FPP, with no folding. When not set (default),
1004- /// the bloom filter uses folding mode instead.
1002+ /// The bloom filter is initially sized for this many distinct values at the
1003+ /// configured FPP, then folded down after all values are inserted to achieve
1004+ /// optimal size. A good heuristic is to set this to the expected number of rows
1005+ /// in the row group.
10051006 ///
10061007 /// Implicitly enables bloom writing, as if [`set_bloom_filter_enabled`] had
10071008 /// been called.
@@ -1012,26 +1013,6 @@ impl WriterPropertiesBuilder {
10121013 self
10131014 }
10141015
1015- /// Sets the default maximum initial allocation size in bytes for bloom filter folding mode
1016- /// for all columns.
1017- ///
1018- /// When bloom filters use folding mode (no explicit NDV), this controls the initial
1019- /// allocation size. The filter will be folded down at flush time to meet the target FPP.
1020- /// If not set, the initial size is derived from `max_row_group_row_count` and `fpp`.
1021- ///
1022- /// The value will be rounded up to the next power of two, bounded by
1023- /// [`BITSET_MIN_LENGTH`](crate::bloom_filter::BITSET_MIN_LENGTH) and
1024- /// [`BITSET_MAX_LENGTH`](crate::bloom_filter::BITSET_MAX_LENGTH).
1025- ///
1026- /// Implicitly enables bloom writing, as if [`set_bloom_filter_enabled`] had been called.
1027- ///
1028- /// [`set_bloom_filter_enabled`]: Self::set_bloom_filter_enabled
1029- pub fn set_bloom_filter_max_bytes ( mut self , value : usize ) -> Self {
1030- self . default_column_properties
1031- . set_bloom_filter_max_bytes ( value) ;
1032- self
1033- }
1034-
10351016 // ----------------------------------------------------------------------
10361017 // Setters for a specific column
10371018
@@ -1137,15 +1118,6 @@ impl WriterPropertiesBuilder {
11371118 self . get_mut_props ( col) . set_bloom_filter_ndv ( value) ;
11381119 self
11391120 }
1140-
1141- /// Sets the maximum initial allocation size in bytes for bloom filter folding mode
1142- /// for a specific column.
1143- ///
1144- /// Takes precedence over [`Self::set_bloom_filter_max_bytes`].
1145- pub fn set_column_bloom_filter_max_bytes ( mut self , col : ColumnPath , value : usize ) -> Self {
1146- self . get_mut_props ( col) . set_bloom_filter_max_bytes ( value) ;
1147- self
1148- }
11491121}
11501122
11511123impl From < WriterProperties > for WriterPropertiesBuilder {
@@ -1225,15 +1197,12 @@ impl Default for EnabledStatistics {
12251197
12261198/// Controls the bloom filter to be computed by the writer.
12271199///
1228- /// Two modes are supported:
1200+ /// The bloom filter is initially sized for `ndv` distinct values at the given `fpp`, then
1201+ /// automatically folded down after all values are inserted to achieve optimal size while
1202+ /// maintaining the target `fpp`. See [`Sbbf::fold_to_target_fpp`] for details on the
1203+ /// folding algorithm.
12291204///
1230- /// - **Fixed-size mode**: When `ndv` is set to `Some(n)`, the bloom filter is sized based on `ndv`
1231- /// and `fpp` at allocation time. This is the legacy behavior.
1232- ///
1233- /// - **Folding mode** (default): When `ndv` is `None`, a conservatively large bloom filter is
1234- /// allocated (sized for worst-case NDV = max row group rows, or `max_bytes` if set), then
1235- /// folded down at flush time to meet the target `fpp`. This eliminates the need to guess NDV
1236- /// upfront and produces optimally-sized filters automatically.
1205+ /// [`Sbbf::fold_to_target_fpp`]: crate::bloom_filter::Sbbf::fold_to_target_fpp
12371206#[ derive( Debug , Clone , PartialEq ) ]
12381207pub struct BloomFilterProperties {
12391208 /// False positive probability. This should always be between 0 and 1 (exclusive). Defaults to [`DEFAULT_BLOOM_FILTER_FPP`].
@@ -1242,32 +1211,28 @@ pub struct BloomFilterProperties {
12421211 ///
12431212 /// The bloom filter data structure is a trade-off between disk and memory space versus fpp, the
12441213 /// smaller the fpp, the more memory and disk space is required, thus setting it to a reasonable value
1245- /// e.g. 0.1, 0.01, or 0.001 is recommended.
1214+ /// e.g. 0.1, 0.05, or 0.001 is recommended.
1215+ ///
1216+ /// This value also serves as the target FPP for bloom filter folding: after all values
1217+ /// are inserted, the filter is folded down to the smallest size that still meets this FPP.
12461218 pub fpp : f64 ,
1247- /// Number of distinct values. When set to `Some(n)`, the bloom filter is sized exactly for
1248- /// `n` distinct values at the given `fpp` (fixed-size mode). When `None` (default), the
1249- /// filter uses folding mode instead.
1219+ /// Maximum expected number of distinct values. When `None` (default), the bloom filter
1220+ /// is sized based on the configured `max_row_group_row_count` at runtime.
12501221 ///
12511222 /// You should set this value by calling [`WriterPropertiesBuilder::set_bloom_filter_ndv`].
12521223 ///
1253- /// Usage of bloom filter is most beneficial for columns with large cardinality, so a good heuristic
1254- /// is to set ndv to the number of rows. However, it can reduce disk size if you know in advance a smaller
1255- /// number of distinct values.
1224+ /// The bloom filter is initially sized for this many distinct values at the given `fpp`,
1225+ /// then folded down after insertion to achieve optimal size. A good heuristic is to set
1226+ /// this to the expected number of rows in the row group. If fewer distinct values are
1227+ /// actually written, the filter will be automatically compacted via folding.
12561228 pub ndv : Option < u64 > ,
1257- /// Maximum initial allocation size in bytes for folding mode. When `None` (default), the
1258- /// initial size is derived from `max_row_group_row_count` and `fpp`. Only used when `ndv`
1259- /// is `None`.
1260- ///
1261- /// You should set this value by calling [`WriterPropertiesBuilder::set_bloom_filter_max_bytes`].
1262- pub max_bytes : Option < usize > ,
12631229}
12641230
12651231impl Default for BloomFilterProperties {
12661232 fn default ( ) -> Self {
12671233 BloomFilterProperties {
12681234 fpp : DEFAULT_BLOOM_FILTER_FPP ,
12691235 ndv : None ,
1270- max_bytes : None ,
12711236 }
12721237 }
12731238}
@@ -1364,22 +1329,14 @@ impl ColumnProperties {
13641329 . fpp = value;
13651330 }
13661331
1367- /// Sets the number of distinct (unique) values for bloom filter for this column, and implicitly
1368- /// enables bloom filter if not previously enabled. This activates fixed-size mode (no folding) .
1332+ /// Sets the maximum expected number of distinct (unique) values for bloom filter for this
1333+ /// column, and implicitly enables bloom filter if not previously enabled.
13691334 fn set_bloom_filter_ndv ( & mut self , value : u64 ) {
13701335 self . bloom_filter_properties
13711336 . get_or_insert_with ( Default :: default)
13721337 . ndv = Some ( value) ;
13731338 }
13741339
1375- /// Sets the maximum initial allocation size in bytes for bloom filter folding mode, and
1376- /// implicitly enables bloom filter if not previously enabled.
1377- fn set_bloom_filter_max_bytes ( & mut self , value : usize ) {
1378- self . bloom_filter_properties
1379- . get_or_insert_with ( Default :: default)
1380- . max_bytes = Some ( value) ;
1381- }
1382-
13831340 /// Returns optional encoding for this column.
13841341 fn encoding ( & self ) -> Option < Encoding > {
13851342 self . encoding
@@ -1723,7 +1680,6 @@ mod tests {
17231680 Some ( & BloomFilterProperties {
17241681 fpp: 0.1 ,
17251682 ndv: Some ( 100 ) ,
1726- max_bytes: None ,
17271683 } )
17281684 ) ;
17291685 }
@@ -1762,7 +1718,6 @@ mod tests {
17621718 Some ( & BloomFilterProperties {
17631719 fpp: DEFAULT_BLOOM_FILTER_FPP ,
17641720 ndv: None ,
1765- max_bytes: None ,
17661721 } )
17671722 ) ;
17681723 }
@@ -1806,7 +1761,6 @@ mod tests {
18061761 Some ( & BloomFilterProperties {
18071762 fpp: DEFAULT_BLOOM_FILTER_FPP ,
18081763 ndv: Some ( 100 ) ,
1809- max_bytes: None ,
18101764 } )
18111765 ) ;
18121766 assert_eq ! (
@@ -1817,7 +1771,6 @@ mod tests {
18171771 Some ( & BloomFilterProperties {
18181772 fpp: 0.1 ,
18191773 ndv: None ,
1820- max_bytes: None ,
18211774 } )
18221775 ) ;
18231776 }
0 commit comments