From d058cb4e3413351c20d981d8416de4d6d984b392 Mon Sep 17 00:00:00 2001 From: codedump Date: Sun, 14 Dec 2025 11:36:09 +0800 Subject: [PATCH] [Arrow]Configure max deduplication length when deduplicating strings while building the array --- .../src/builder/generic_bytes_view_builder.rs | 125 ++++++++++++++---- 1 file changed, 98 insertions(+), 27 deletions(-) diff --git a/arrow-array/src/builder/generic_bytes_view_builder.rs b/arrow-array/src/builder/generic_bytes_view_builder.rs index 35f5684cc7f4..d7ff7ce051d8 100644 --- a/arrow-array/src/builder/generic_bytes_view_builder.rs +++ b/arrow-array/src/builder/generic_bytes_view_builder.rs @@ -87,6 +87,7 @@ pub struct GenericByteViewBuilder { /// Some if deduplicating strings /// map ` -> ` string_tracker: Option<(HashTable, ahash::RandomState)>, + max_deduplication_len: Option, phantom: PhantomData, } @@ -107,10 +108,28 @@ impl GenericByteViewBuilder { current_size: STARTING_BLOCK_SIZE, }, string_tracker: None, + max_deduplication_len: None, phantom: Default::default(), } } + /// Configure max deduplication length when deduplicating strings while building the array. + /// Default is None. + /// + /// When [`Self::with_deduplicate_strings`] is enabled, the builder attempts to deduplicate + /// any strings longer than 12 bytes. However, since it takes time proportional to the length + /// of the string to deduplicate, setting this option limits the CPU overhead for this option. + pub fn with_max_deduplication_len(self, max_deduplication_len: u32) -> Self { + debug_assert!( + max_deduplication_len > 0, + "max_deduplication_len must be greater than 0" + ); + Self { + max_deduplication_len: Some(max_deduplication_len), + ..self + } + } + /// Set a fixed buffer size for variable length strings /// /// The block size is the size of the buffer used to store values greater @@ -334,35 +353,42 @@ impl GenericByteViewBuilder { // Deduplication if: // (1) deduplication is enabled. - // (2) len > 12 - if let Some((mut ht, hasher)) = self.string_tracker.take() { - let hash_val = hasher.hash_one(v); - let hasher_fn = |v: &_| hasher.hash_one(v); - - let entry = ht.entry( - hash_val, - |idx| { - let stored_value = self.get_value(*idx); - v == stored_value - }, - hasher_fn, - ); - match entry { - Entry::Occupied(occupied) => { - // If the string already exists, we will directly use the view - let idx = occupied.get(); - self.views_buffer.push(self.views_buffer[*idx]); - self.null_buffer_builder.append_non_null(); - self.string_tracker = Some((ht, hasher)); - return Ok(()); - } - Entry::Vacant(vacant) => { - // o.w. we insert the (string hash -> view index) - // the idx is current length of views_builder, as we are inserting a new view - vacant.insert(self.views_buffer.len()); + // (2) len > `MAX_INLINE_VIEW_LEN` and len <= `max_deduplication_len` + let can_deduplicate = self.string_tracker.is_some() + && self + .max_deduplication_len + .map(|max_length| length <= max_length) + .unwrap_or(true); + if can_deduplicate { + if let Some((mut ht, hasher)) = self.string_tracker.take() { + let hash_val = hasher.hash_one(v); + let hasher_fn = |v: &_| hasher.hash_one(v); + + let entry = ht.entry( + hash_val, + |idx| { + let stored_value = self.get_value(*idx); + v == stored_value + }, + hasher_fn, + ); + match entry { + Entry::Occupied(occupied) => { + // If the string already exists, we will directly use the view + let idx = occupied.get(); + self.views_buffer.push(self.views_buffer[*idx]); + self.null_buffer_builder.append_non_null(); + self.string_tracker = Some((ht, hasher)); + return Ok(()); + } + Entry::Vacant(vacant) => { + // o.w. we insert the (string hash -> view index) + // the idx is current length of views_builder, as we are inserting a new view + vacant.insert(self.views_buffer.len()); + } } + self.string_tracker = Some((ht, hasher)); } - self.string_tracker = Some((ht, hasher)); } let required_cap = self.in_progress.len() + v.len(); @@ -636,8 +662,53 @@ pub fn make_view(data: &[u8], block_id: u32, offset: u32) -> u128 { mod tests { use core::str; + use arrow_buffer::ArrowNativeType; + use super::*; + #[test] + fn test_string_max_deduplication_len() { + let value_1 = "short"; + let value_2 = "not so similar string but long"; + let value_3 = "1234567890123"; + + let max_deduplication_len = MAX_INLINE_VIEW_LEN * 2; + + let mut builder = StringViewBuilder::new() + .with_deduplicate_strings() + .with_max_deduplication_len(max_deduplication_len); + + assert!(value_1.len() < MAX_INLINE_VIEW_LEN.as_usize()); + assert!(value_2.len() > max_deduplication_len.as_usize()); + assert!( + value_3.len() > MAX_INLINE_VIEW_LEN.as_usize() + && value_3.len() < max_deduplication_len.as_usize() + ); + + // append value1 (short), expect it is inlined and not deduplicated + builder.append_value(value_1); // view 0 + builder.append_value(value_1); // view 1 + // append value2, expect second copy is not deduplicated as it exceeds max_deduplication_len + builder.append_value(value_2); // view 2 + builder.append_value(value_2); // view 3 + // append value3, expect second copy is deduplicated + builder.append_value(value_3); // view 4 + builder.append_value(value_3); // view 5 + + let array = builder.finish(); + + // verify + let v2 = ByteView::from(array.views()[2]); + let v3 = ByteView::from(array.views()[3]); + assert_eq!(v2.buffer_index, v3.buffer_index); // stored in same buffer + assert_ne!(v2.offset, v3.offset); // different offsets --> not deduplicated + + let v4 = ByteView::from(array.views()[4]); + let v5 = ByteView::from(array.views()[5]); + assert_eq!(v4.buffer_index, v5.buffer_index); // stored in same buffer + assert_eq!(v4.offset, v5.offset); // same offsets --> deduplicated + } + #[test] fn test_string_view_deduplicate() { let value_1 = "long string to test string view";