@@ -87,6 +87,7 @@ pub struct GenericByteViewBuilder<T: ByteViewType + ?Sized> {
8787 /// Some if deduplicating strings
8888 /// map `<string hash> -> <index to the views>`
8989 string_tracker : Option < ( HashTable < usize > , ahash:: RandomState ) > ,
90+ max_deduplication_len : Option < u32 > ,
9091 phantom : PhantomData < T > ,
9192}
9293
@@ -107,10 +108,28 @@ impl<T: ByteViewType + ?Sized> GenericByteViewBuilder<T> {
107108 current_size : STARTING_BLOCK_SIZE ,
108109 } ,
109110 string_tracker : None ,
111+ max_deduplication_len : None ,
110112 phantom : Default :: default ( ) ,
111113 }
112114 }
113115
116+ /// Configure max deduplication length when deduplicating strings while building the array.
117+ /// Default is None.
118+ ///
119+ /// When [`Self::with_deduplicate_strings`] is enabled, the builder attempts to deduplicate
120+ /// any strings longer than 12 bytes. However, since it takes time proportional to the length
121+ /// of the string to deduplicate, setting this option limits the CPU overhead for this option.
122+ pub fn with_max_deduplication_len ( self , max_deduplication_len : u32 ) -> Self {
123+ debug_assert ! (
124+ max_deduplication_len > 0 ,
125+ "max_deduplication_len must be greater than 0"
126+ ) ;
127+ Self {
128+ max_deduplication_len : Some ( max_deduplication_len) ,
129+ ..self
130+ }
131+ }
132+
114133 /// Set a fixed buffer size for variable length strings
115134 ///
116135 /// The block size is the size of the buffer used to store values greater
@@ -334,8 +353,13 @@ impl<T: ByteViewType + ?Sized> GenericByteViewBuilder<T> {
334353
335354 // Deduplication if:
336355 // (1) deduplication is enabled.
337- // (2) len > 12
338- if let Some ( ( mut ht, hasher) ) = self . string_tracker . take ( ) {
356+ // (2) len > `MAX_INLINE_VIEW_LEN` and len <= `max_deduplication_len`
357+ let can_deduplicate = self . string_tracker . is_some ( )
358+ && self
359+ . max_deduplication_len
360+ . map ( |max_length| length <= max_length)
361+ . unwrap_or ( true ) ;
362+ if can_deduplicate && let Some ( ( mut ht, hasher) ) = self . string_tracker . take ( ) {
339363 let hash_val = hasher. hash_one ( v) ;
340364 let hasher_fn = |v : & _ | hasher. hash_one ( v) ;
341365
@@ -636,8 +660,53 @@ pub fn make_view(data: &[u8], block_id: u32, offset: u32) -> u128 {
636660mod tests {
637661 use core:: str;
638662
663+ use arrow_buffer:: ArrowNativeType ;
664+
639665 use super :: * ;
640666
/// Checks that `with_max_deduplication_len` limits deduplication to strings that are
/// too long to be inlined but shorter than the configured maximum.
#[test]
fn test_string_max_deduplication_len() {
    let max_deduplication_len = MAX_INLINE_VIEW_LEN * 2;
    let inline_limit = MAX_INLINE_VIEW_LEN.as_usize();
    let dedup_limit = max_deduplication_len.as_usize();

    let value_1 = "short";
    let value_2 = "not so similar string but long";
    let value_3 = "1234567890123";

    // Make sure each fixture falls into the length class it is meant to represent.
    assert!(value_1.len() < inline_limit);
    assert!(value_2.len() > dedup_limit);
    assert!(value_3.len() > inline_limit && value_3.len() < dedup_limit);

    let mut builder = StringViewBuilder::new()
        .with_deduplicate_strings()
        .with_max_deduplication_len(max_deduplication_len);

    // Every value is appended twice, back to back:
    //   views 0, 1 -> value_1: short, expected to be inlined (not checked below)
    //   views 2, 3 -> value_2: longer than the maximum, second copy must not be deduplicated
    //   views 4, 5 -> value_3: within the maximum, second copy must be deduplicated
    for value in [value_1, value_2, value_3] {
        builder.append_value(value);
        builder.append_value(value);
    }

    let array = builder.finish();
    let views = array.views();
    let [first_long, second_long, first_mid, second_mid] =
        [2, 3, 4, 5].map(|idx| ByteView::from(views[idx]));

    // Both copies of value_2 share a buffer but sit at different offsets: stored twice.
    assert_eq!(first_long.buffer_index, second_long.buffer_index);
    assert_ne!(first_long.offset, second_long.offset);

    // Both copies of value_3 point at the same buffer and offset: stored once.
    assert_eq!(first_mid.buffer_index, second_mid.buffer_index);
    assert_eq!(first_mid.offset, second_mid.offset);
}
709+
641710 #[ test]
642711 fn test_string_view_deduplicate ( ) {
643712 let value_1 = "long string to test string view" ;
0 commit comments