@@ -53,7 +53,13 @@ pub const DEFAULT_CREATED_BY: &str = concat!("parquet-rs version ", env!("CARGO_
5353pub const DEFAULT_COLUMN_INDEX_TRUNCATE_LENGTH : Option < usize > = Some ( 64 ) ;
5454/// Default value for [`BloomFilterProperties::fpp`]
5555pub const DEFAULT_BLOOM_FILTER_FPP : f64 = 0.05 ;
56- /// Default value for [`BloomFilterProperties::ndv`]
56+ /// Default value for [`BloomFilterProperties::ndv`].
57+ ///
58+ /// Note: this is only the fallback default used when constructing [`BloomFilterProperties`]
59+ /// directly. When using [`WriterPropertiesBuilder`], columns with bloom filters enabled
60+ /// but without an explicit NDV will have their NDV resolved at build time to
61+ /// [`WriterProperties::max_row_group_row_count`], which may differ from this constant
62+ /// if the user configured a custom row group size.
5763pub const DEFAULT_BLOOM_FILTER_NDV : u64 = DEFAULT_MAX_ROW_GROUP_ROW_COUNT as u64 ;
5864/// Default values for [`WriterProperties::statistics_truncate_length`]
5965pub const DEFAULT_STATISTICS_TRUNCATE_LENGTH : Option < usize > = Some ( 64 ) ;
@@ -587,6 +593,18 @@ impl Default for WriterPropertiesBuilder {
587593impl WriterPropertiesBuilder {
588594 /// Finalizes the configuration and returns immutable writer properties struct.
589595 pub fn build ( self ) -> WriterProperties {
596+ // Resolve bloom filter NDV for columns where it wasn't explicitly set:
597+ // default to max_row_group_row_count so the filter is never undersized.
598+ let default_ndv = self
599+ . max_row_group_row_count
600+ . unwrap_or ( DEFAULT_MAX_ROW_GROUP_ROW_COUNT ) as u64 ;
601+ let mut default_column_properties = self . default_column_properties ;
602+ default_column_properties. resolve_bloom_filter_ndv ( default_ndv) ;
603+ let mut column_properties = self . column_properties ;
604+ for props in column_properties. values_mut ( ) {
605+ props. resolve_bloom_filter_ndv ( default_ndv) ;
606+ }
607+
590608 WriterProperties {
591609 data_page_row_count_limit : self . data_page_row_count_limit ,
592610 write_batch_size : self . write_batch_size ,
@@ -597,8 +615,8 @@ impl WriterPropertiesBuilder {
597615 created_by : self . created_by ,
598616 offset_index_disabled : self . offset_index_disabled ,
599617 key_value_metadata : self . key_value_metadata ,
600- default_column_properties : self . default_column_properties ,
601- column_properties : self . column_properties ,
618+ default_column_properties,
619+ column_properties,
602620 sorting_columns : self . sorting_columns ,
603621 column_index_truncate_length : self . column_index_truncate_length ,
604622 statistics_truncate_length : self . statistics_truncate_length ,
@@ -1216,15 +1234,16 @@ pub struct BloomFilterProperties {
12161234 /// This value also serves as the target FPP for bloom filter folding: after all values
12171235 /// are inserted, the filter is folded down to the smallest size that still meets this FPP.
12181236 pub fpp : f64 ,
1219- /// Maximum expected number of distinct values. When `None` (default), the bloom filter
1220- /// is sized based on the row group's `max_row_group_row_count` at runtime.
1237+ /// Maximum expected number of distinct values. Defaults to [`DEFAULT_BLOOM_FILTER_NDV`].
12211238 ///
12221239 /// You should set this value by calling [`WriterPropertiesBuilder::set_bloom_filter_ndv`].
12231240 ///
1224- /// The bloom filter is initially sized for this many distinct values at the given `fpp`,
1225- /// then folded down after insertion to achieve optimal size. A good heuristic is to set
1226- /// this to the expected number of rows in the row group. If fewer distinct values are
1227- /// actually written, the filter will be automatically compacted via folding.
1241+ /// When not explicitly set via the builder, this defaults to
1242+ /// [`max_row_group_row_count`](WriterProperties::max_row_group_row_count) (resolved at
1243+ /// build time). The bloom filter is initially sized for this many distinct values at the
1244+ /// given `fpp`, then folded down after insertion to achieve optimal size. A good heuristic
1245+ /// is to set this to the expected number of rows in the row group. If fewer distinct values
1246+ /// are actually written, the filter will be automatically compacted via folding.
12281247 ///
12291248 /// Thus the only negative side of overestimating this value is that the bloom filter
12301249 /// will use more memory during writing than necessary, but it will not affect the final
@@ -1236,14 +1255,14 @@ pub struct BloomFilterProperties {
12361255 /// If you do set this value explicitly it is probably best to set it for each column
12371256 /// individually via [`WriterPropertiesBuilder::set_column_bloom_filter_ndv`] rather than globally,
12381257 /// since different columns may have different numbers of distinct values.
1239- pub ndv : Option < u64 > ,
1258+ pub ndv : u64 ,
12401259}
12411260
12421261impl Default for BloomFilterProperties {
12431262 fn default ( ) -> Self {
12441263 BloomFilterProperties {
12451264 fpp : DEFAULT_BLOOM_FILTER_FPP ,
1246- ndv : None ,
1265+ ndv : DEFAULT_BLOOM_FILTER_NDV ,
12471266 }
12481267 }
12491268}
@@ -1263,6 +1282,8 @@ struct ColumnProperties {
12631282 write_page_header_statistics : Option < bool > ,
12641283 /// bloom filter related properties
12651284 bloom_filter_properties : Option < BloomFilterProperties > ,
1285+ /// Whether the bloom filter NDV was explicitly set by the user
1286+ bloom_filter_ndv_is_set : bool ,
12661287}
12671288
12681289impl ColumnProperties {
@@ -1345,7 +1366,8 @@ impl ColumnProperties {
13451366 fn set_bloom_filter_ndv ( & mut self , value : u64 ) {
13461367 self . bloom_filter_properties
13471368 . get_or_insert_with ( Default :: default)
1348- . ndv = Some ( value) ;
1369+ . ndv = value;
1370+ self . bloom_filter_ndv_is_set = true ;
13491371 }
13501372
13511373 /// Returns optional encoding for this column.
@@ -1393,6 +1415,16 @@ impl ColumnProperties {
13931415 fn bloom_filter_properties ( & self ) -> Option < & BloomFilterProperties > {
13941416 self . bloom_filter_properties . as_ref ( )
13951417 }
1418+
1419+ /// If bloom filter is enabled and NDV was not explicitly set, resolve it to the
1420+ /// given `default_ndv` (typically derived from `max_row_group_row_count`).
1421+ fn resolve_bloom_filter_ndv ( & mut self , default_ndv : u64 ) {
1422+ if !self . bloom_filter_ndv_is_set {
1423+ if let Some ( ref mut bf) = self . bloom_filter_properties {
1424+ bf. ndv = default_ndv;
1425+ }
1426+ }
1427+ }
13961428}
13971429
13981430/// Reference counted reader properties.
@@ -1690,7 +1722,7 @@ mod tests {
16901722 props. bloom_filter_properties( & ColumnPath :: from( "col" ) ) ,
16911723 Some ( & BloomFilterProperties {
16921724 fpp: 0.1 ,
1693- ndv: Some ( 100 ) ,
1725+ ndv: 100 ,
16941726 } )
16951727 ) ;
16961728 }
@@ -1728,7 +1760,7 @@ mod tests {
17281760 props. bloom_filter_properties( & ColumnPath :: from( "col" ) ) ,
17291761 Some ( & BloomFilterProperties {
17301762 fpp: DEFAULT_BLOOM_FILTER_FPP ,
1731- ndv: None ,
1763+ ndv: DEFAULT_BLOOM_FILTER_NDV ,
17321764 } )
17331765 ) ;
17341766 }
@@ -1771,7 +1803,7 @@ mod tests {
17711803 . bloom_filter_properties( & ColumnPath :: from( "col" ) ) ,
17721804 Some ( & BloomFilterProperties {
17731805 fpp: DEFAULT_BLOOM_FILTER_FPP ,
1774- ndv: Some ( 100 ) ,
1806+ ndv: 100 ,
17751807 } )
17761808 ) ;
17771809 assert_eq ! (
@@ -1781,7 +1813,7 @@ mod tests {
17811813 . bloom_filter_properties( & ColumnPath :: from( "col" ) ) ,
17821814 Some ( & BloomFilterProperties {
17831815 fpp: 0.1 ,
1784- ndv: None ,
1816+ ndv: DEFAULT_BLOOM_FILTER_NDV ,
17851817 } )
17861818 ) ;
17871819 }
0 commit comments