Skip to content

Commit 46b786a

Browse files
committed
[Arrow]Configure max deduplication length when deduplicating strings while building the array
1 parent f8796fd commit 46b786a

File tree

1 file changed

+101
-27
lines changed

1 file changed

+101
-27
lines changed

arrow-array/src/builder/generic_bytes_view_builder.rs

Lines changed: 101 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -87,6 +87,7 @@ pub struct GenericByteViewBuilder<T: ByteViewType + ?Sized> {
8787
/// Some if deduplicating strings
8888
/// map `<string hash> -> <index to the views>`
8989
string_tracker: Option<(HashTable<usize>, ahash::RandomState)>,
90+
max_deduplication_len: Option<u32>,
9091
phantom: PhantomData<T>,
9192
}
9293

@@ -107,10 +108,28 @@ impl<T: ByteViewType + ?Sized> GenericByteViewBuilder<T> {
107108
current_size: STARTING_BLOCK_SIZE,
108109
},
109110
string_tracker: None,
111+
max_deduplication_len: None,
110112
phantom: Default::default(),
111113
}
112114
}
113115

116+
/// Configure max deduplication length when deduplicating strings while building the array.
117+
/// Default is no limit: every string longer than [`MAX_INLINE_VIEW_LEN`] bytes is eligible for deduplication.
118+
///
119+
/// When [`Self::with_deduplicate_strings`] is enabled, the builder attempts to deduplicate
120+
/// any strings longer than 12 bytes. However, since deduplicating a string takes time proportional
121+
/// to its length, setting this option caps that CPU overhead by never deduplicating longer strings.
122+
pub fn with_max_deduplication_len(self, max_deduplication_len: u32) -> Self {
123+
debug_assert!(
124+
max_deduplication_len > 0,
125+
"max_deduplication_len must be greater than 0"
126+
);
127+
Self {
128+
max_deduplication_len: Some(max_deduplication_len),
129+
..self
130+
}
131+
}
132+
114133
/// Set a fixed buffer size for variable length strings
115134
///
116135
/// The block size is the size of the buffer used to store values greater
@@ -334,35 +353,45 @@ impl<T: ByteViewType + ?Sized> GenericByteViewBuilder<T> {
334353

335354
// Deduplication if:
336355
// (1) deduplication is enabled.
337-
// (2) len > 12
338-
if let Some((mut ht, hasher)) = self.string_tracker.take() {
339-
let hash_val = hasher.hash_one(v);
340-
let hasher_fn = |v: &_| hasher.hash_one(v);
341-
342-
let entry = ht.entry(
343-
hash_val,
344-
|idx| {
345-
let stored_value = self.get_value(*idx);
346-
v == stored_value
347-
},
348-
hasher_fn,
349-
);
350-
match entry {
351-
Entry::Occupied(occupied) => {
352-
// If the string already exists, we will directly use the view
353-
let idx = occupied.get();
354-
self.views_buffer.push(self.views_buffer[*idx]);
355-
self.null_buffer_builder.append_non_null();
356-
self.string_tracker = Some((ht, hasher));
357-
return Ok(());
358-
}
359-
Entry::Vacant(vacant) => {
360-
// o.w. we insert the (string hash -> view index)
361-
// the idx is current length of views_builder, as we are inserting a new view
362-
vacant.insert(self.views_buffer.len());
356+
// (2) len > `MAX_INLINE_VIEW_LEN` and len <= `max_deduplication_len` (when one is configured)
357+
let can_deduplicate = if self.string_tracker.is_some() {
358+
match self.max_deduplication_len {
359+
Some(max_deduplication_len) => length <= max_deduplication_len,
360+
None => true,
361+
}
362+
} else {
363+
false
364+
};
365+
if can_deduplicate {
366+
if let Some((mut ht, hasher)) = self.string_tracker.take() {
367+
let hash_val = hasher.hash_one(v);
368+
let hasher_fn = |v: &_| hasher.hash_one(v);
369+
370+
let entry = ht.entry(
371+
hash_val,
372+
|idx| {
373+
let stored_value = self.get_value(*idx);
374+
v == stored_value
375+
},
376+
hasher_fn,
377+
);
378+
match entry {
379+
Entry::Occupied(occupied) => {
380+
// If the string already exists, we will directly use the view
381+
let idx = occupied.get();
382+
self.views_buffer.push(self.views_buffer[*idx]);
383+
self.null_buffer_builder.append_non_null();
384+
self.string_tracker = Some((ht, hasher));
385+
return Ok(());
386+
}
387+
Entry::Vacant(vacant) => {
388+
// otherwise we insert the (string hash -> view index)
389+
// the idx is the current length of views_buffer, as we are inserting a new view
390+
vacant.insert(self.views_buffer.len());
391+
}
363392
}
393+
self.string_tracker = Some((ht, hasher));
364394
}
365-
self.string_tracker = Some((ht, hasher));
366395
}
367396

368397
let required_cap = self.in_progress.len() + v.len();
@@ -636,8 +665,53 @@ pub fn make_view(data: &[u8], block_id: u32, offset: u32) -> u128 {
636665
mod tests {
637666
use core::str;
638667

668+
use arrow_buffer::ArrowNativeType;
669+
639670
use super::*;
640671

672+
#[test]
673+
fn test_string_max_deduplication_len() {
674+
let value_1 = "short";
675+
let value_2 = "not so similar string but long";
676+
let value_3 = "1234567890123";
677+
678+
let max_deduplication_len = MAX_INLINE_VIEW_LEN * 2;
679+
680+
let mut builder = StringViewBuilder::new()
681+
.with_deduplicate_strings()
682+
.with_max_deduplication_len(max_deduplication_len);
683+
684+
assert!(value_1.len() < MAX_INLINE_VIEW_LEN.as_usize());
685+
assert!(value_2.len() > max_deduplication_len.as_usize());
686+
assert!(
687+
value_3.len() > MAX_INLINE_VIEW_LEN.as_usize()
688+
&& value_3.len() < max_deduplication_len.as_usize()
689+
);
690+
691+
// append value1 (short), expect it is inlined and not deduplicated
692+
builder.append_value(value_1); // view 0
693+
builder.append_value(value_1); // view 1
694+
// append value2, expect second copy is not deduplicated as it exceeds max_deduplication_len
695+
builder.append_value(value_2); // view 2
696+
builder.append_value(value_2); // view 3
697+
// append value3, expect second copy is deduplicated
698+
builder.append_value(value_3); // view 4
699+
builder.append_value(value_3); // view 5
700+
701+
let array = builder.finish();
702+
703+
// verify
704+
let v2 = ByteView::from(array.views()[2]);
705+
let v3 = ByteView::from(array.views()[3]);
706+
assert_eq!(v2.buffer_index, v3.buffer_index); // stored in same buffer
707+
assert_ne!(v2.offset, v3.offset); // different offsets --> not deduplicated
708+
709+
let v4 = ByteView::from(array.views()[4]);
710+
let v5 = ByteView::from(array.views()[5]);
711+
assert_eq!(v4.buffer_index, v5.buffer_index); // stored in same buffer
712+
assert_eq!(v4.offset, v5.offset); // same offsets --> deduplicated
713+
}
714+
641715
#[test]
642716
fn test_string_view_deduplicate() {
643717
let value_1 = "long string to test string view";

0 commit comments

Comments
 (0)