@@ -87,6 +87,7 @@ pub struct GenericByteViewBuilder<T: ByteViewType + ?Sized> {
8787 /// Some if deduplicating strings
8888 /// map `<string hash> -> <index to the views>`
8989 string_tracker : Option < ( HashTable < usize > , ahash:: RandomState ) > ,
90+ max_deduplication_len : Option < u32 > ,
9091 phantom : PhantomData < T > ,
9192}
9293
@@ -107,10 +108,28 @@ impl<T: ByteViewType + ?Sized> GenericByteViewBuilder<T> {
107108 current_size : STARTING_BLOCK_SIZE ,
108109 } ,
109110 string_tracker : None ,
111+ max_deduplication_len : None ,
110112 phantom : Default :: default ( ) ,
111113 }
112114 }
113115
116+ /// Configure the maximum length of strings that are deduplicated while building the array.
117+ /// Default is `None`, which places no upper bound on the length of deduplicated strings.
118+ ///
119+ /// When [`Self::with_deduplicate_strings`] is enabled, the builder attempts to deduplicate
120+ /// any strings longer than [`MAX_INLINE_VIEW_LEN`] bytes. However, since it takes time proportional to the length
121+ /// of the string to deduplicate, strings longer than `max_deduplication_len` are appended without deduplication to limit that CPU overhead.
122+ pub fn with_max_deduplication_len ( self , max_deduplication_len : u32 ) -> Self {
123+ debug_assert ! (
124+ max_deduplication_len > 0 ,
125+ "max_deduplication_len must be greater than 0"
126+ ) ; // `debug_assert!` is only evaluated when debug assertions are enabled
127+ Self {
128+ max_deduplication_len : Some ( max_deduplication_len) ,
129+ ..self
130+ }
131+ }
132+
114133 /// Set a fixed buffer size for variable length strings
115134 ///
116135 /// The block size is the size of the buffer used to store values greater
@@ -334,35 +353,45 @@ impl<T: ByteViewType + ?Sized> GenericByteViewBuilder<T> {
334353
335354 // Deduplication if:
336355 // (1) deduplication is enabled.
337- // (2) len > 12
338- if let Some ( ( mut ht, hasher) ) = self . string_tracker . take ( ) {
339- let hash_val = hasher. hash_one ( v) ;
340- let hasher_fn = |v : & _ | hasher. hash_one ( v) ;
341-
342- let entry = ht. entry (
343- hash_val,
344- |idx| {
345- let stored_value = self . get_value ( * idx) ;
346- v == stored_value
347- } ,
348- hasher_fn,
349- ) ;
350- match entry {
351- Entry :: Occupied ( occupied) => {
352- // If the string already exists, we will directly use the view
353- let idx = occupied. get ( ) ;
354- self . views_buffer . push ( self . views_buffer [ * idx] ) ;
355- self . null_buffer_builder . append_non_null ( ) ;
356- self . string_tracker = Some ( ( ht, hasher) ) ;
357- return Ok ( ( ) ) ;
358- }
359- Entry :: Vacant ( vacant) => {
360- // o.w. we insert the (string hash -> view index)
361- // the idx is current length of views_builder, as we are inserting a new view
362- vacant. insert ( self . views_buffer . len ( ) ) ;
356+ // (2) len > `MAX_INLINE_VIEW_LEN` and, when `max_deduplication_len` is set, len <= `max_deduplication_len`
357+ let can_deduplicate = if self . string_tracker . is_some ( ) {
358+ match self . max_deduplication_len {
359+ Some ( max_deduplication_len) => length <= max_deduplication_len,
360+ None => true ,
361+ }
362+ } else {
363+ false
364+ } ;
365+ if can_deduplicate {
366+ if let Some ( ( mut ht, hasher) ) = self . string_tracker . take ( ) {
367+ let hash_val = hasher. hash_one ( v) ;
368+ let hasher_fn = |v : & _ | hasher. hash_one ( v) ;
369+
370+ let entry = ht. entry (
371+ hash_val,
372+ |idx| {
373+ let stored_value = self . get_value ( * idx) ;
374+ v == stored_value
375+ } ,
376+ hasher_fn,
377+ ) ;
378+ match entry {
379+ Entry :: Occupied ( occupied) => {
380+ // If the string already exists, we will directly use the view
381+ let idx = occupied. get ( ) ;
382+ self . views_buffer . push ( self . views_buffer [ * idx] ) ;
383+ self . null_buffer_builder . append_non_null ( ) ;
384+ self . string_tracker = Some ( ( ht, hasher) ) ;
385+ return Ok ( ( ) ) ;
386+ }
387+ Entry :: Vacant ( vacant) => {
388+ // o.w. we insert the (string hash -> view index)
389+ // the idx is current length of views_builder, as we are inserting a new view
390+ vacant. insert ( self . views_buffer . len ( ) ) ;
391+ }
363392 }
393+ self . string_tracker = Some ( ( ht, hasher) ) ;
364394 }
365- self . string_tracker = Some ( ( ht, hasher) ) ;
366395 }
367396
368397 let required_cap = self . in_progress . len ( ) + v. len ( ) ;
@@ -636,8 +665,53 @@ pub fn make_view(data: &[u8], block_id: u32, offset: u32) -> u128 {
636665mod tests {
637666 use core:: str;
638667
668+ use arrow_buffer:: ArrowNativeType ;
669+
639670 use super :: * ;
640671
672+ #[ test]
673+ fn test_string_max_deduplication_len ( ) {
674+ let value_1 = "short" ; // 5 bytes
675+ let value_2 = "not so similar string but long" ; // 30 bytes
676+ let value_3 = "1234567890123" ; // 13 bytes
677+
678+ let max_deduplication_len = MAX_INLINE_VIEW_LEN * 2 ; // the limit under test
679+
680+ let mut builder = StringViewBuilder :: new ( )
681+ . with_deduplicate_strings ( )
682+ . with_max_deduplication_len ( max_deduplication_len) ;
683+
684+ assert ! ( value_1. len( ) < MAX_INLINE_VIEW_LEN . as_usize( ) ) ;
685+ assert ! ( value_2. len( ) > max_deduplication_len. as_usize( ) ) ;
686+ assert ! (
687+ value_3. len( ) > MAX_INLINE_VIEW_LEN . as_usize( )
688+ && value_3. len( ) < max_deduplication_len. as_usize( )
689+ ) ;
690+
691+ // append value1 (short): expected to be inlined and not deduplicated; views 0 and 1 are not inspected below
692+ builder. append_value ( value_1) ; // view 0
693+ builder. append_value ( value_1) ; // view 1
694+ // append value2, expect second copy is not deduplicated as it exceeds max_deduplication_len
695+ builder. append_value ( value_2) ; // view 2
696+ builder. append_value ( value_2) ; // view 3
697+ // append value3, expect second copy is deduplicated
698+ builder. append_value ( value_3) ; // view 4
699+ builder. append_value ( value_3) ; // view 5
700+
701+ let array = builder. finish ( ) ;
702+
703+ // verify via the decoded views: an equal (buffer_index, offset) pair means both views reference the same stored bytes
704+ let v2 = ByteView :: from ( array. views ( ) [ 2 ] ) ;
705+ let v3 = ByteView :: from ( array. views ( ) [ 3 ] ) ;
706+ assert_eq ! ( v2. buffer_index, v3. buffer_index) ; // stored in same buffer
707+ assert_ne ! ( v2. offset, v3. offset) ; // different offsets --> not deduplicated
708+
709+ let v4 = ByteView :: from ( array. views ( ) [ 4 ] ) ;
710+ let v5 = ByteView :: from ( array. views ( ) [ 5 ] ) ;
711+ assert_eq ! ( v4. buffer_index, v5. buffer_index) ; // stored in same buffer
712+ assert_eq ! ( v4. offset, v5. offset) ; // same offsets --> deduplicated
713+ }
714+
641715 #[ test]
642716 fn test_string_view_deduplicate ( ) {
643717 let value_1 = "long string to test string view" ;
0 commit comments