@@ -87,6 +87,7 @@ pub struct GenericByteViewBuilder<T: ByteViewType + ?Sized> {
8787 /// Some if deduplicating strings
8888 /// map `<string hash> -> <index to the views>`
8989 string_tracker : Option < ( HashTable < usize > , ahash:: RandomState ) > ,
90+ max_deduplication_len : Option < u32 > ,
9091 phantom : PhantomData < T > ,
9192}
9293
@@ -107,10 +108,28 @@ impl<T: ByteViewType + ?Sized> GenericByteViewBuilder<T> {
107108 current_size : STARTING_BLOCK_SIZE ,
108109 } ,
109110 string_tracker : None ,
111+ max_deduplication_len : None ,
110112 phantom : Default :: default ( ) ,
111113 }
112114 }
113115
116+ /// Configure max deduplication length when deduplicating strings while building the array.
117+ /// Default is None.
118+ ///
119+ /// When [`Self::with_deduplicate_strings`] is enabled, the builder attempts to deduplicate
120+ /// any strings longer than 12 bytes. However, since it takes time proportional to the length
121+ /// of the string to deduplicate, setting this option limits the CPU overhead for this option.
122+ pub fn with_max_deduplication_len ( self , max_deduplication_len : u32 ) -> Self {
123+ debug_assert ! (
124+ max_deduplication_len > 0 ,
125+ "max_deduplication_len must be greater than 0"
126+ ) ;
127+ Self {
128+ max_deduplication_len : Some ( max_deduplication_len) ,
129+ ..self
130+ }
131+ }
132+
114133 /// Set a fixed buffer size for variable length strings
115134 ///
116135 /// The block size is the size of the buffer used to store values greater
@@ -334,35 +353,42 @@ impl<T: ByteViewType + ?Sized> GenericByteViewBuilder<T> {
334353
335354 // Deduplication if:
336355 // (1) deduplication is enabled.
337- // (2) len > 12
338- if let Some ( ( mut ht, hasher) ) = self . string_tracker . take ( ) {
339- let hash_val = hasher. hash_one ( v) ;
340- let hasher_fn = |v : & _ | hasher. hash_one ( v) ;
341-
342- let entry = ht. entry (
343- hash_val,
344- |idx| {
345- let stored_value = self . get_value ( * idx) ;
346- v == stored_value
347- } ,
348- hasher_fn,
349- ) ;
350- match entry {
351- Entry :: Occupied ( occupied) => {
352- // If the string already exists, we will directly use the view
353- let idx = occupied. get ( ) ;
354- self . views_buffer . push ( self . views_buffer [ * idx] ) ;
355- self . null_buffer_builder . append_non_null ( ) ;
356- self . string_tracker = Some ( ( ht, hasher) ) ;
357- return Ok ( ( ) ) ;
358- }
359- Entry :: Vacant ( vacant) => {
360- // o.w. we insert the (string hash -> view index)
361- // the idx is current length of views_builder, as we are inserting a new view
362- vacant. insert ( self . views_buffer . len ( ) ) ;
356+ // (2) len > `MAX_INLINE_VIEW_LEN` and len <= `max_deduplication_len`
357+ let can_deduplicate = self . string_tracker . is_some ( )
358+ && self
359+ . max_deduplication_len
360+ . map ( |max_length| length <= max_length)
361+ . unwrap_or ( true ) ;
362+ if can_deduplicate {
363+ if let Some ( ( mut ht, hasher) ) = self . string_tracker . take ( ) {
364+ let hash_val = hasher. hash_one ( v) ;
365+ let hasher_fn = |v : & _ | hasher. hash_one ( v) ;
366+
367+ let entry = ht. entry (
368+ hash_val,
369+ |idx| {
370+ let stored_value = self . get_value ( * idx) ;
371+ v == stored_value
372+ } ,
373+ hasher_fn,
374+ ) ;
375+ match entry {
376+ Entry :: Occupied ( occupied) => {
377+ // If the string already exists, we will directly use the view
378+ let idx = occupied. get ( ) ;
379+ self . views_buffer . push ( self . views_buffer [ * idx] ) ;
380+ self . null_buffer_builder . append_non_null ( ) ;
381+ self . string_tracker = Some ( ( ht, hasher) ) ;
382+ return Ok ( ( ) ) ;
383+ }
384+ Entry :: Vacant ( vacant) => {
385+ // o.w. we insert the (string hash -> view index)
386+ // the idx is current length of views_builder, as we are inserting a new view
387+ vacant. insert ( self . views_buffer . len ( ) ) ;
388+ }
363389 }
390+ self . string_tracker = Some ( ( ht, hasher) ) ;
364391 }
365- self . string_tracker = Some ( ( ht, hasher) ) ;
366392 }
367393
368394 let required_cap = self . in_progress . len ( ) + v. len ( ) ;
@@ -636,8 +662,53 @@ pub fn make_view(data: &[u8], block_id: u32, offset: u32) -> u128 {
636662mod tests {
637663 use core:: str;
638664
665+ use arrow_buffer:: ArrowNativeType ;
666+
639667 use super :: * ;
640668
669+ #[ test]
670+ fn test_string_max_deduplication_len ( ) {
671+ let value_1 = "short" ;
672+ let value_2 = "not so similar string but long" ;
673+ let value_3 = "1234567890123" ;
674+
675+ let max_deduplication_len = MAX_INLINE_VIEW_LEN * 2 ;
676+
677+ let mut builder = StringViewBuilder :: new ( )
678+ . with_deduplicate_strings ( )
679+ . with_max_deduplication_len ( max_deduplication_len) ;
680+
681+ assert ! ( value_1. len( ) < MAX_INLINE_VIEW_LEN . as_usize( ) ) ;
682+ assert ! ( value_2. len( ) > max_deduplication_len. as_usize( ) ) ;
683+ assert ! (
684+ value_3. len( ) > MAX_INLINE_VIEW_LEN . as_usize( )
685+ && value_3. len( ) < max_deduplication_len. as_usize( )
686+ ) ;
687+
688+ // append value1 (short), expect it is inlined and not deduplicated
689+ builder. append_value ( value_1) ; // view 0
690+ builder. append_value ( value_1) ; // view 1
691+ // append value2, expect second copy is not deduplicated as it exceeds max_deduplication_len
692+ builder. append_value ( value_2) ; // view 2
693+ builder. append_value ( value_2) ; // view 3
694+ // append value3, expect second copy is deduplicated
695+ builder. append_value ( value_3) ; // view 4
696+ builder. append_value ( value_3) ; // view 5
697+
698+ let array = builder. finish ( ) ;
699+
700+ // verify
701+ let v2 = ByteView :: from ( array. views ( ) [ 2 ] ) ;
702+ let v3 = ByteView :: from ( array. views ( ) [ 3 ] ) ;
703+ assert_eq ! ( v2. buffer_index, v3. buffer_index) ; // stored in same buffer
704+ assert_ne ! ( v2. offset, v3. offset) ; // different offsets --> not deduplicated
705+
706+ let v4 = ByteView :: from ( array. views ( ) [ 4 ] ) ;
707+ let v5 = ByteView :: from ( array. views ( ) [ 5 ] ) ;
708+ assert_eq ! ( v4. buffer_index, v5. buffer_index) ; // stored in same buffer
709+ assert_eq ! ( v4. offset, v5. offset) ; // same offsets --> deduplicated
710+ }
711+
641712 #[ test]
642713 fn test_string_view_deduplicate ( ) {
643714 let value_1 = "long string to test string view" ;
0 commit comments