-
Notifications
You must be signed in to change notification settings - Fork 1.1k
[Arrow]Configure max deduplication length for StringView
#8990
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -87,6 +87,7 @@ pub struct GenericByteViewBuilder<T: ByteViewType + ?Sized> { | |
| /// Some if deduplicating strings | ||
| /// map `<string hash> -> <index to the views>` | ||
| string_tracker: Option<(HashTable<usize>, ahash::RandomState)>, | ||
| max_deduplication_len: Option<u32>, | ||
| phantom: PhantomData<T>, | ||
| } | ||
|
|
||
|
|
@@ -107,10 +108,28 @@ impl<T: ByteViewType + ?Sized> GenericByteViewBuilder<T> { | |
| current_size: STARTING_BLOCK_SIZE, | ||
| }, | ||
| string_tracker: None, | ||
| max_deduplication_len: None, | ||
| phantom: Default::default(), | ||
| } | ||
| } | ||
|
|
||
| /// Configure max deduplication length when deduplicating strings while building the array. | ||
| /// Default is None. | ||
| /// | ||
| /// When [`Self::with_deduplicate_strings`] is enabled, the builder attempts to deduplicate | ||
| /// any strings longer than 12 bytes. However, since it takes time proportional to the length | ||
| /// of the string to deduplicate, setting this option limits the CPU overhead for this option. | ||
| pub fn with_max_deduplication_len(self, max_deduplication_len: u32) -> Self { | ||
| debug_assert!( | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why not leave use We should also document in the comments that the parameter must be greater than zero if we are going to assert
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. i look at the |
||
| max_deduplication_len > 0, | ||
| "max_deduplication_len must be greater than 0" | ||
| ); | ||
| Self { | ||
| max_deduplication_len: Some(max_deduplication_len), | ||
| ..self | ||
| } | ||
| } | ||
|
|
||
| /// Set a fixed buffer size for variable length strings | ||
| /// | ||
| /// The block size is the size of the buffer used to store values greater | ||
|
|
@@ -334,35 +353,42 @@ impl<T: ByteViewType + ?Sized> GenericByteViewBuilder<T> { | |
|
|
||
| // Deduplication if: | ||
| // (1) deduplication is enabled. | ||
| // (2) len > 12 | ||
| if let Some((mut ht, hasher)) = self.string_tracker.take() { | ||
| let hash_val = hasher.hash_one(v); | ||
| let hasher_fn = |v: &_| hasher.hash_one(v); | ||
|
|
||
| let entry = ht.entry( | ||
| hash_val, | ||
| |idx| { | ||
| let stored_value = self.get_value(*idx); | ||
| v == stored_value | ||
| }, | ||
| hasher_fn, | ||
| ); | ||
| match entry { | ||
| Entry::Occupied(occupied) => { | ||
| // If the string already exists, we will directly use the view | ||
| let idx = occupied.get(); | ||
| self.views_buffer.push(self.views_buffer[*idx]); | ||
| self.null_buffer_builder.append_non_null(); | ||
| self.string_tracker = Some((ht, hasher)); | ||
| return Ok(()); | ||
| } | ||
| Entry::Vacant(vacant) => { | ||
| // o.w. we insert the (string hash -> view index) | ||
| // the idx is current length of views_builder, as we are inserting a new view | ||
| vacant.insert(self.views_buffer.len()); | ||
| // (2) len > `MAX_INLINE_VIEW_LEN` and len <= `max_deduplication_len` | ||
| let can_deduplicate = self.string_tracker.is_some() | ||
| && self | ||
| .max_deduplication_len | ||
| .map(|max_length| length <= max_length) | ||
| .unwrap_or(true); | ||
| if can_deduplicate { | ||
| if let Some((mut ht, hasher)) = self.string_tracker.take() { | ||
| let hash_val = hasher.hash_one(v); | ||
| let hasher_fn = |v: &_| hasher.hash_one(v); | ||
|
|
||
| let entry = ht.entry( | ||
| hash_val, | ||
| |idx| { | ||
| let stored_value = self.get_value(*idx); | ||
| v == stored_value | ||
| }, | ||
| hasher_fn, | ||
| ); | ||
| match entry { | ||
| Entry::Occupied(occupied) => { | ||
| // If the string already exists, we will directly use the view | ||
| let idx = occupied.get(); | ||
| self.views_buffer.push(self.views_buffer[*idx]); | ||
| self.null_buffer_builder.append_non_null(); | ||
| self.string_tracker = Some((ht, hasher)); | ||
| return Ok(()); | ||
| } | ||
| Entry::Vacant(vacant) => { | ||
| // o.w. we insert the (string hash -> view index) | ||
| // the idx is current length of views_builder, as we are inserting a new view | ||
| vacant.insert(self.views_buffer.len()); | ||
| } | ||
| } | ||
| self.string_tracker = Some((ht, hasher)); | ||
| } | ||
| self.string_tracker = Some((ht, hasher)); | ||
| } | ||
|
|
||
| let required_cap = self.in_progress.len() + v.len(); | ||
|
|
@@ -636,8 +662,53 @@ pub fn make_view(data: &[u8], block_id: u32, offset: u32) -> u128 { | |
| mod tests { | ||
| use core::str; | ||
|
|
||
| use arrow_buffer::ArrowNativeType; | ||
|
|
||
| use super::*; | ||
|
|
||
| #[test] | ||
| fn test_string_max_deduplication_len() { | ||
| let value_1 = "short"; | ||
| let value_2 = "not so similar string but long"; | ||
| let value_3 = "1234567890123"; | ||
|
|
||
| let max_deduplication_len = MAX_INLINE_VIEW_LEN * 2; | ||
|
|
||
| let mut builder = StringViewBuilder::new() | ||
| .with_deduplicate_strings() | ||
| .with_max_deduplication_len(max_deduplication_len); | ||
|
|
||
| assert!(value_1.len() < MAX_INLINE_VIEW_LEN.as_usize()); | ||
| assert!(value_2.len() > max_deduplication_len.as_usize()); | ||
| assert!( | ||
| value_3.len() > MAX_INLINE_VIEW_LEN.as_usize() | ||
| && value_3.len() < max_deduplication_len.as_usize() | ||
| ); | ||
|
|
||
| // append value1 (short), expect it is inlined and not deduplicated | ||
| builder.append_value(value_1); // view 0 | ||
| builder.append_value(value_1); // view 1 | ||
| // append value2, expect second copy is not deduplicated as it exceeds max_deduplication_len | ||
| builder.append_value(value_2); // view 2 | ||
| builder.append_value(value_2); // view 3 | ||
| // append value3, expect second copy is deduplicated | ||
| builder.append_value(value_3); // view 4 | ||
| builder.append_value(value_3); // view 5 | ||
|
|
||
| let array = builder.finish(); | ||
|
|
||
| // verify | ||
| let v2 = ByteView::from(array.views()[2]); | ||
| let v3 = ByteView::from(array.views()[3]); | ||
| assert_eq!(v2.buffer_index, v3.buffer_index); // stored in same buffer | ||
| assert_ne!(v2.offset, v3.offset); // different offsets --> not deduplicated | ||
|
|
||
| let v4 = ByteView::from(array.views()[4]); | ||
| let v5 = ByteView::from(array.views()[5]); | ||
| assert_eq!(v4.buffer_index, v5.buffer_index); // stored in same buffer | ||
| assert_eq!(v4.offset, v5.offset); // same offsets --> deduplicated | ||
| } | ||
|
|
||
| #[test] | ||
| fn test_string_view_deduplicate() { | ||
| let value_1 = "long string to test string view"; | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Will changing
self.max_deduplication_lentou32and setting the default value toMAX_INLINE_VIEW_LENbe better? After this, we can unify the logic in L354 tolength < self.max_deduplication_len