Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
125 changes: 98 additions & 27 deletions arrow-array/src/builder/generic_bytes_view_builder.rs
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,7 @@ pub struct GenericByteViewBuilder<T: ByteViewType + ?Sized> {
/// Some if deduplicating strings
/// map `<string hash> -> <index to the views>`
string_tracker: Option<(HashTable<usize>, ahash::RandomState)>,
max_deduplication_len: Option<u32>,
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Will changing self.max_deduplication_len to u32 and setting the default value to MAX_INLINE_VIEW_LEN be better? After this, we can unify the logic in L354 to length < self.max_deduplication_len

phantom: PhantomData<T>,
}

Expand All @@ -107,10 +108,28 @@ impl<T: ByteViewType + ?Sized> GenericByteViewBuilder<T> {
current_size: STARTING_BLOCK_SIZE,
},
string_tracker: None,
max_deduplication_len: None,
phantom: Default::default(),
}
}

/// Configure max deduplication length when deduplicating strings while building the array.
/// Default is None.
///
/// When [`Self::with_deduplicate_strings`] is enabled, the builder attempts to deduplicate
/// any strings longer than 12 bytes. However, since it takes time proportional to the length
/// of the string to deduplicate, setting this option limits the CPU overhead for this option.
pub fn with_max_deduplication_len(self, max_deduplication_len: u32) -> Self {
debug_assert!(
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why not leave use assert rather than debug_assert?

We should also document in the comments that the parameter must be greater than zero if we are going to assert

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

i look at the with_fixed_block_size function, it use debug_assert:

    pub fn with_fixed_block_size(self, block_size: u32) -> Self {
        debug_assert!(block_size > 0, "Block size must be greater than 0");
        Self {
            block_size: BlockSizeGrowthStrategy::Fixed { size: block_size },
            ..self
        }
    }

max_deduplication_len > 0,
"max_deduplication_len must be greater than 0"
);
Self {
max_deduplication_len: Some(max_deduplication_len),
..self
}
}

/// Set a fixed buffer size for variable length strings
///
/// The block size is the size of the buffer used to store values greater
Expand Down Expand Up @@ -334,35 +353,42 @@ impl<T: ByteViewType + ?Sized> GenericByteViewBuilder<T> {

// Deduplication if:
// (1) deduplication is enabled.
// (2) len > 12
if let Some((mut ht, hasher)) = self.string_tracker.take() {
let hash_val = hasher.hash_one(v);
let hasher_fn = |v: &_| hasher.hash_one(v);

let entry = ht.entry(
hash_val,
|idx| {
let stored_value = self.get_value(*idx);
v == stored_value
},
hasher_fn,
);
match entry {
Entry::Occupied(occupied) => {
// If the string already exists, we will directly use the view
let idx = occupied.get();
self.views_buffer.push(self.views_buffer[*idx]);
self.null_buffer_builder.append_non_null();
self.string_tracker = Some((ht, hasher));
return Ok(());
}
Entry::Vacant(vacant) => {
// o.w. we insert the (string hash -> view index)
// the idx is current length of views_builder, as we are inserting a new view
vacant.insert(self.views_buffer.len());
// (2) len > `MAX_INLINE_VIEW_LEN` and len <= `max_deduplication_len`
let can_deduplicate = self.string_tracker.is_some()
&& self
.max_deduplication_len
.map(|max_length| length <= max_length)
.unwrap_or(true);
if can_deduplicate {
if let Some((mut ht, hasher)) = self.string_tracker.take() {
let hash_val = hasher.hash_one(v);
let hasher_fn = |v: &_| hasher.hash_one(v);

let entry = ht.entry(
hash_val,
|idx| {
let stored_value = self.get_value(*idx);
v == stored_value
},
hasher_fn,
);
match entry {
Entry::Occupied(occupied) => {
// If the string already exists, we will directly use the view
let idx = occupied.get();
self.views_buffer.push(self.views_buffer[*idx]);
self.null_buffer_builder.append_non_null();
self.string_tracker = Some((ht, hasher));
return Ok(());
}
Entry::Vacant(vacant) => {
// o.w. we insert the (string hash -> view index)
// the idx is current length of views_builder, as we are inserting a new view
vacant.insert(self.views_buffer.len());
}
}
self.string_tracker = Some((ht, hasher));
}
self.string_tracker = Some((ht, hasher));
}

let required_cap = self.in_progress.len() + v.len();
Expand Down Expand Up @@ -636,8 +662,53 @@ pub fn make_view(data: &[u8], block_id: u32, offset: u32) -> u128 {
mod tests {
use core::str;

use arrow_buffer::ArrowNativeType;

use super::*;

#[test]
fn test_string_max_deduplication_len() {
let value_1 = "short";
let value_2 = "not so similar string but long";
let value_3 = "1234567890123";

let max_deduplication_len = MAX_INLINE_VIEW_LEN * 2;

let mut builder = StringViewBuilder::new()
.with_deduplicate_strings()
.with_max_deduplication_len(max_deduplication_len);

assert!(value_1.len() < MAX_INLINE_VIEW_LEN.as_usize());
assert!(value_2.len() > max_deduplication_len.as_usize());
assert!(
value_3.len() > MAX_INLINE_VIEW_LEN.as_usize()
&& value_3.len() < max_deduplication_len.as_usize()
);

// append value1 (short), expect it is inlined and not deduplicated
builder.append_value(value_1); // view 0
builder.append_value(value_1); // view 1
// append value2, expect second copy is not deduplicated as it exceeds max_deduplication_len
builder.append_value(value_2); // view 2
builder.append_value(value_2); // view 3
// append value3, expect second copy is deduplicated
builder.append_value(value_3); // view 4
builder.append_value(value_3); // view 5

let array = builder.finish();

// verify
let v2 = ByteView::from(array.views()[2]);
let v3 = ByteView::from(array.views()[3]);
assert_eq!(v2.buffer_index, v3.buffer_index); // stored in same buffer
assert_ne!(v2.offset, v3.offset); // different offsets --> not deduplicated

let v4 = ByteView::from(array.views()[4]);
let v5 = ByteView::from(array.views()[5]);
assert_eq!(v4.buffer_index, v5.buffer_index); // stored in same buffer
assert_eq!(v4.offset, v5.offset); // same offsets --> deduplicated
}

#[test]
fn test_string_view_deduplicate() {
let value_1 = "long string to test string view";
Expand Down
Loading