Skip to content

Commit 86006cc

Browse files
committed
[Arrow]Configure max deduplication length when deduplicating strings while building the array
1 parent f8796fd commit 86006cc

File tree

1 file changed

+100
-27
lines changed

1 file changed

+100
-27
lines changed

arrow-array/src/builder/generic_bytes_view_builder.rs

Lines changed: 100 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -87,6 +87,7 @@ pub struct GenericByteViewBuilder<T: ByteViewType + ?Sized> {
8787
/// Some if deduplicating strings
8888
/// map `<string hash> -> <index to the views>`
8989
string_tracker: Option<(HashTable<usize>, ahash::RandomState)>,
90+
max_deduplication_len: Option<u32>,
9091
phantom: PhantomData<T>,
9192
}
9293

@@ -107,10 +108,25 @@ impl<T: ByteViewType + ?Sized> GenericByteViewBuilder<T> {
107108
current_size: STARTING_BLOCK_SIZE,
108109
},
109110
string_tracker: None,
111+
max_deduplication_len: None,
110112
phantom: Default::default(),
111113
}
112114
}
113115

116+
/// Configure max deduplication length when deduplicating strings while building the array.
117+
/// Default is `None`, meaning no upper limit is applied to the length of strings that are deduplicated.
118+
/// See <https://github.com/apache/arrow-rs/issues/7187> for more details on the implications.
119+
pub fn with_max_deduplication_len(self, max_deduplication_len: u32) -> Self {
120+
debug_assert!(
121+
max_deduplication_len > 0,
122+
"max_deduplication_len must be greater than 0"
123+
);
124+
Self {
125+
max_deduplication_len: Some(max_deduplication_len),
126+
..self
127+
}
128+
}
129+
114130
/// Set a fixed buffer size for variable length strings
115131
///
116132
/// The block size is the size of the buffer used to store values greater
@@ -334,35 +350,41 @@ impl<T: ByteViewType + ?Sized> GenericByteViewBuilder<T> {
334350

335351
// Deduplication if:
336352
// (1) deduplication is enabled.
337-
// (2) len > 12
338-
if let Some((mut ht, hasher)) = self.string_tracker.take() {
339-
let hash_val = hasher.hash_one(v);
340-
let hasher_fn = |v: &_| hasher.hash_one(v);
341-
342-
let entry = ht.entry(
343-
hash_val,
344-
|idx| {
345-
let stored_value = self.get_value(*idx);
346-
v == stored_value
347-
},
348-
hasher_fn,
349-
);
350-
match entry {
351-
Entry::Occupied(occupied) => {
352-
// If the string already exists, we will directly use the view
353-
let idx = occupied.get();
354-
self.views_buffer.push(self.views_buffer[*idx]);
355-
self.null_buffer_builder.append_non_null();
356-
self.string_tracker = Some((ht, hasher));
357-
return Ok(());
358-
}
359-
Entry::Vacant(vacant) => {
360-
// o.w. we insert the (string hash -> view index)
361-
// the idx is current length of views_builder, as we are inserting a new view
362-
vacant.insert(self.views_buffer.len());
353+
// (2) len > `MAX_INLINE_VIEW_LEN` and len <= `max_deduplication_len` (when a maximum is configured)
354+
let can_deduplicate = match self.max_deduplication_len {
355+
Some(max_deduplication_len) => length <= max_deduplication_len,
356+
None => true,
357+
};
358+
if can_deduplicate {
359+
if let Some((mut ht, hasher)) = self.string_tracker.take() {
360+
let hash_val = hasher.hash_one(v);
361+
let hasher_fn = |v: &_| hasher.hash_one(v);
362+
363+
let entry = ht.entry(
364+
hash_val,
365+
|idx| {
366+
let stored_value = self.get_value(*idx);
367+
v == stored_value
368+
},
369+
hasher_fn,
370+
);
371+
match entry {
372+
Entry::Occupied(occupied) => {
373+
// If the string already exists, we will directly use the view
374+
let idx = occupied.get();
375+
self.views_buffer.push(self.views_buffer[*idx]);
376+
self.null_buffer_builder.append_non_null();
377+
self.string_tracker = Some((ht, hasher));
378+
return Ok(());
379+
}
380+
Entry::Vacant(vacant) => {
381+
// Otherwise, we insert the mapping (string hash -> view index)
382+
// the idx is the current length of `views_buffer`, as we are inserting a new view
383+
vacant.insert(self.views_buffer.len());
384+
}
363385
}
386+
self.string_tracker = Some((ht, hasher));
364387
}
365-
self.string_tracker = Some((ht, hasher));
366388
}
367389

368390
let required_cap = self.in_progress.len() + v.len();
@@ -636,8 +658,59 @@ pub fn make_view(data: &[u8], block_id: u32, offset: u32) -> u128 {
636658
mod tests {
637659
use core::str;
638660

661+
use arrow_buffer::ArrowNativeType;
662+
639663
use super::*;
640664

665+
#[test]
666+
fn test_string_max_deduplication_len() {
667+
let value_1 = "short";
668+
let value_2 = "not so similar string but long";
669+
let value_3 = "1234567890123";
670+
671+
let mut builder = StringViewBuilder::new()
672+
.with_deduplicate_strings()
673+
.with_max_deduplication_len(MAX_INLINE_VIEW_LEN * 2);
674+
assert!(builder.string_tracker.is_some());
675+
assert!(builder.max_deduplication_len.is_some());
676+
// `max_deduplication_len` was just asserted to be `Some`, so it is safe to unwrap here
677+
let max_deduplication_len = builder.max_deduplication_len.unwrap();
678+
assert!(max_deduplication_len > MAX_INLINE_VIEW_LEN);
679+
assert!(value_1.len() < MAX_INLINE_VIEW_LEN.as_usize());
680+
assert!(value_2.len() > max_deduplication_len.as_usize());
681+
assert!(
682+
value_3.len() > MAX_INLINE_VIEW_LEN.as_usize()
683+
&& value_3.len() < max_deduplication_len.as_usize()
684+
);
685+
686+
let value_checker = |v: &[u8], builder: &StringViewBuilder| {
687+
// safe to unwrap: `string_tracker` was asserted to be `Some` right after the builder was created
688+
let (ht, hasher) = builder.string_tracker.clone().unwrap();
689+
let hash_val = hasher.hash_one(v);
690+
691+
ht.find(hash_val, |idx| {
692+
let stored_value = builder.get_value(*idx);
693+
v == stored_value
694+
})
695+
.cloned()
696+
};
697+
698+
// append value_1: it is shorter than `MAX_INLINE_VIEW_LEN`, so it MUST NOT be in the string_tracker
699+
let v: &[u8] = value_1.as_ref();
700+
builder.append_value(value_1);
701+
assert!(value_checker(v, &builder).is_none());
702+
703+
// append value_2: it is longer than `max_deduplication_len`, so it MUST NOT be in the string_tracker
704+
let v: &[u8] = value_2.as_ref();
705+
builder.append_value(value_2);
706+
assert!(value_checker(v, &builder).is_none());
707+
708+
// append value_3: its length is between `MAX_INLINE_VIEW_LEN` and `max_deduplication_len`, so it MUST be in the string_tracker
709+
let v: &[u8] = value_3.as_ref();
710+
builder.append_value(value_3);
711+
assert!(value_checker(v, &builder).is_some());
712+
}
713+
641714
#[test]
642715
fn test_string_view_deduplicate() {
643716
let value_1 = "long string to test string view";

0 commit comments

Comments
 (0)