Skip to content

Commit bb6c3a9

Browse files
adriangbclaude
andcommitted
keep BloomFilterProperties::ndv as u64 to avoid breaking API change
Instead of changing ndv from u64 to Option<u64> (which breaks downstream users), resolve the dynamic default at WriterPropertiesBuilder::build() time. Columns with bloom filters enabled but no explicit NDV get their ndv set to max_row_group_row_count, so the filter is never undersized. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 50eff0d commit bb6c3a9

File tree

2 files changed

+55
-32
lines changed

2 files changed

+55
-32
lines changed

parquet/src/column/writer/encoder.rs

Lines changed: 7 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -27,9 +27,7 @@ use crate::data_type::DataType;
2727
use crate::data_type::private::ParquetValueType;
2828
use crate::encodings::encoding::{DictEncoder, Encoder, get_encoder};
2929
use crate::errors::{ParquetError, Result};
30-
use crate::file::properties::{
31-
DEFAULT_MAX_ROW_GROUP_ROW_COUNT, EnabledStatistics, WriterProperties,
32-
};
30+
use crate::file::properties::{EnabledStatistics, WriterProperties};
3331
use crate::geospatial::accumulator::{GeoStatsAccumulator, try_new_geo_stats_accumulator};
3432
use crate::geospatial::statistics::GeospatialStatistics;
3533
use crate::schema::types::{ColumnDescPtr, ColumnDescriptor};
@@ -387,24 +385,17 @@ fn replace_zero<T: ParquetValueType>(val: &T, descr: &ColumnDescriptor, replace:
387385
}
388386
}
389387

390-
/// Creates a bloom filter sized for the column's configured NDV (or the row group size
391-
/// if NDV is not explicitly set), returning the filter and the target FPP for folding.
388+
/// Creates a bloom filter sized for the column's configured NDV, returning the filter
389+
/// and the target FPP for folding.
392390
pub(crate) fn create_bloom_filter(
393391
props: &WriterProperties,
394392
descr: &ColumnDescPtr,
395393
) -> Result<(Option<Sbbf>, f64)> {
396394
match props.bloom_filter_properties(descr.path()) {
397-
Some(bf_props) => {
398-
let ndv = bf_props.ndv.unwrap_or(
399-
props
400-
.max_row_group_row_count()
401-
.unwrap_or(DEFAULT_MAX_ROW_GROUP_ROW_COUNT) as u64,
402-
);
403-
Ok((
404-
Some(Sbbf::new_with_ndv_fpp(ndv, bf_props.fpp)?),
405-
bf_props.fpp,
406-
))
407-
}
395+
Some(bf_props) => Ok((
396+
Some(Sbbf::new_with_ndv_fpp(bf_props.ndv, bf_props.fpp)?),
397+
bf_props.fpp,
398+
)),
408399
None => Ok((None, 0.0)),
409400
}
410401
}

parquet/src/file/properties.rs

Lines changed: 48 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -53,7 +53,13 @@ pub const DEFAULT_CREATED_BY: &str = concat!("parquet-rs version ", env!("CARGO_
5353
pub const DEFAULT_COLUMN_INDEX_TRUNCATE_LENGTH: Option<usize> = Some(64);
5454
/// Default value for [`BloomFilterProperties::fpp`]
5555
pub const DEFAULT_BLOOM_FILTER_FPP: f64 = 0.05;
56-
/// Default value for [`BloomFilterProperties::ndv`]
56+
/// Default value for [`BloomFilterProperties::ndv`].
57+
///
58+
/// Note: this is only the fallback default used when constructing [`BloomFilterProperties`]
59+
/// directly. When using [`WriterPropertiesBuilder`], columns with bloom filters enabled
60+
/// but without an explicit NDV will have their NDV resolved at build time to
61+
/// [`WriterProperties::max_row_group_row_count`], which may differ from this constant
62+
/// if the user configured a custom row group size.
5763
pub const DEFAULT_BLOOM_FILTER_NDV: u64 = DEFAULT_MAX_ROW_GROUP_ROW_COUNT as u64;
5864
/// Default values for [`WriterProperties::statistics_truncate_length`]
5965
pub const DEFAULT_STATISTICS_TRUNCATE_LENGTH: Option<usize> = Some(64);
@@ -587,6 +593,18 @@ impl Default for WriterPropertiesBuilder {
587593
impl WriterPropertiesBuilder {
588594
/// Finalizes the configuration and returns immutable writer properties struct.
589595
pub fn build(self) -> WriterProperties {
596+
// Resolve bloom filter NDV for columns where it wasn't explicitly set:
597+
// default to max_row_group_row_count so the filter is never undersized.
598+
let default_ndv = self
599+
.max_row_group_row_count
600+
.unwrap_or(DEFAULT_MAX_ROW_GROUP_ROW_COUNT) as u64;
601+
let mut default_column_properties = self.default_column_properties;
602+
default_column_properties.resolve_bloom_filter_ndv(default_ndv);
603+
let mut column_properties = self.column_properties;
604+
for props in column_properties.values_mut() {
605+
props.resolve_bloom_filter_ndv(default_ndv);
606+
}
607+
590608
WriterProperties {
591609
data_page_row_count_limit: self.data_page_row_count_limit,
592610
write_batch_size: self.write_batch_size,
@@ -597,8 +615,8 @@ impl WriterPropertiesBuilder {
597615
created_by: self.created_by,
598616
offset_index_disabled: self.offset_index_disabled,
599617
key_value_metadata: self.key_value_metadata,
600-
default_column_properties: self.default_column_properties,
601-
column_properties: self.column_properties,
618+
default_column_properties,
619+
column_properties,
602620
sorting_columns: self.sorting_columns,
603621
column_index_truncate_length: self.column_index_truncate_length,
604622
statistics_truncate_length: self.statistics_truncate_length,
@@ -1216,15 +1234,16 @@ pub struct BloomFilterProperties {
12161234
/// This value also serves as the target FPP for bloom filter folding: after all values
12171235
/// are inserted, the filter is folded down to the smallest size that still meets this FPP.
12181236
pub fpp: f64,
1219-
/// Maximum expected number of distinct values. When `None` (default), the bloom filter
1220-
/// is sized based on the row group's `max_row_group_row_count` at runtime.
1237+
/// Maximum expected number of distinct values. Defaults to [`DEFAULT_BLOOM_FILTER_NDV`].
12211238
///
12221239
/// You should set this value by calling [`WriterPropertiesBuilder::set_bloom_filter_ndv`].
12231240
///
1224-
/// The bloom filter is initially sized for this many distinct values at the given `fpp`,
1225-
/// then folded down after insertion to achieve optimal size. A good heuristic is to set
1226-
/// this to the expected number of rows in the row group. If fewer distinct values are
1227-
/// actually written, the filter will be automatically compacted via folding.
1241+
/// When not explicitly set via the builder, this defaults to
1242+
/// [`max_row_group_row_count`](WriterProperties::max_row_group_row_count) (resolved at
1243+
/// build time). The bloom filter is initially sized for this many distinct values at the
1244+
/// given `fpp`, then folded down after insertion to achieve optimal size. A good heuristic
1245+
/// is to set this to the expected number of rows in the row group. If fewer distinct values
1246+
/// are actually written, the filter will be automatically compacted via folding.
12281247
///
12291248
/// Thus the only negative side of overestimating this value is that the bloom filter
12301249
/// will use more memory during writing than necessary, but it will not affect the final
@@ -1236,14 +1255,14 @@ pub struct BloomFilterProperties {
12361255
/// If you do set this value explicitly it is probably best to set it for each column
12371256
/// individually via [`WriterPropertiesBuilder::set_column_bloom_filter_ndv`] rather than globally,
12381257
/// since different columns may have different numbers of distinct values.
1239-
pub ndv: Option<u64>,
1258+
pub ndv: u64,
12401259
}
12411260

12421261
impl Default for BloomFilterProperties {
12431262
fn default() -> Self {
12441263
BloomFilterProperties {
12451264
fpp: DEFAULT_BLOOM_FILTER_FPP,
1246-
ndv: None,
1265+
ndv: DEFAULT_BLOOM_FILTER_NDV,
12471266
}
12481267
}
12491268
}
@@ -1263,6 +1282,8 @@ struct ColumnProperties {
12631282
write_page_header_statistics: Option<bool>,
12641283
/// bloom filter related properties
12651284
bloom_filter_properties: Option<BloomFilterProperties>,
1285+
/// Whether the bloom filter NDV was explicitly set by the user
1286+
bloom_filter_ndv_is_set: bool,
12661287
}
12671288

12681289
impl ColumnProperties {
@@ -1345,7 +1366,8 @@ impl ColumnProperties {
13451366
fn set_bloom_filter_ndv(&mut self, value: u64) {
13461367
self.bloom_filter_properties
13471368
.get_or_insert_with(Default::default)
1348-
.ndv = Some(value);
1369+
.ndv = value;
1370+
self.bloom_filter_ndv_is_set = true;
13491371
}
13501372

13511373
/// Returns optional encoding for this column.
@@ -1393,6 +1415,16 @@ impl ColumnProperties {
13931415
fn bloom_filter_properties(&self) -> Option<&BloomFilterProperties> {
13941416
self.bloom_filter_properties.as_ref()
13951417
}
1418+
1419+
/// If bloom filter is enabled and NDV was not explicitly set, resolve it to the
1420+
/// given `default_ndv` (typically derived from `max_row_group_row_count`).
1421+
fn resolve_bloom_filter_ndv(&mut self, default_ndv: u64) {
1422+
if !self.bloom_filter_ndv_is_set {
1423+
if let Some(ref mut bf) = self.bloom_filter_properties {
1424+
bf.ndv = default_ndv;
1425+
}
1426+
}
1427+
}
13961428
}
13971429

13981430
/// Reference counted reader properties.
@@ -1690,7 +1722,7 @@ mod tests {
16901722
props.bloom_filter_properties(&ColumnPath::from("col")),
16911723
Some(&BloomFilterProperties {
16921724
fpp: 0.1,
1693-
ndv: Some(100),
1725+
ndv: 100,
16941726
})
16951727
);
16961728
}
@@ -1728,7 +1760,7 @@ mod tests {
17281760
props.bloom_filter_properties(&ColumnPath::from("col")),
17291761
Some(&BloomFilterProperties {
17301762
fpp: DEFAULT_BLOOM_FILTER_FPP,
1731-
ndv: None,
1763+
ndv: DEFAULT_BLOOM_FILTER_NDV,
17321764
})
17331765
);
17341766
}
@@ -1771,7 +1803,7 @@ mod tests {
17711803
.bloom_filter_properties(&ColumnPath::from("col")),
17721804
Some(&BloomFilterProperties {
17731805
fpp: DEFAULT_BLOOM_FILTER_FPP,
1774-
ndv: Some(100),
1806+
ndv: 100,
17751807
})
17761808
);
17771809
assert_eq!(
@@ -1781,7 +1813,7 @@ mod tests {
17811813
.bloom_filter_properties(&ColumnPath::from("col")),
17821814
Some(&BloomFilterProperties {
17831815
fpp: 0.1,
1784-
ndv: None,
1816+
ndv: DEFAULT_BLOOM_FILTER_NDV,
17851817
})
17861818
);
17871819
}

0 commit comments

Comments
 (0)