@@ -87,6 +87,7 @@ pub struct GenericByteViewBuilder<T: ByteViewType + ?Sized> {
8787 /// Some if deduplicating strings
8888 /// map `<string hash> -> <index to the views>`
8989 string_tracker : Option < ( HashTable < usize > , ahash:: RandomState ) > ,
90+ max_deduplication_len : Option < u32 > ,
9091 phantom : PhantomData < T > ,
9192}
9293
@@ -107,10 +108,25 @@ impl<T: ByteViewType + ?Sized> GenericByteViewBuilder<T> {
107108 current_size : STARTING_BLOCK_SIZE ,
108109 } ,
109110 string_tracker : None ,
111+ max_deduplication_len : None ,
110112 phantom : Default :: default ( ) ,
111113 }
112114 }
113115
116+ /// Configure max deduplication length when deduplicating strings while building the array.
117+ /// Default is [`MAX_INLINE_VIEW_LEN`] bytes.
118+ /// See <https://github.com/apache/arrow-rs/issues/7187> for more details on the implications.
119+ pub fn with_max_deduplication_len ( self , max_deduplication_len : u32 ) -> Self {
120+ debug_assert ! (
121+ max_deduplication_len > 0 ,
122+ "max_deduplication_len must be greater than 0"
123+ ) ;
124+ Self {
125+ max_deduplication_len : Some ( max_deduplication_len) ,
126+ ..self
127+ }
128+ }
129+
114130 /// Set a fixed buffer size for variable length strings
115131 ///
116132 /// The block size is the size of the buffer used to store values greater
@@ -334,35 +350,41 @@ impl<T: ByteViewType + ?Sized> GenericByteViewBuilder<T> {
334350
335351 // Deduplication if:
336352 // (1) deduplication is enabled.
337- // (2) len > 12
338- if let Some ( ( mut ht, hasher) ) = self . string_tracker . take ( ) {
339- let hash_val = hasher. hash_one ( v) ;
340- let hasher_fn = |v : & _ | hasher. hash_one ( v) ;
341-
342- let entry = ht. entry (
343- hash_val,
344- |idx| {
345- let stored_value = self . get_value ( * idx) ;
346- v == stored_value
347- } ,
348- hasher_fn,
349- ) ;
350- match entry {
351- Entry :: Occupied ( occupied) => {
352- // If the string already exists, we will directly use the view
353- let idx = occupied. get ( ) ;
354- self . views_buffer . push ( self . views_buffer [ * idx] ) ;
355- self . null_buffer_builder . append_non_null ( ) ;
356- self . string_tracker = Some ( ( ht, hasher) ) ;
357- return Ok ( ( ) ) ;
358- }
359- Entry :: Vacant ( vacant) => {
360- // o.w. we insert the (string hash -> view index)
361- // the idx is current length of views_builder, as we are inserting a new view
362- vacant. insert ( self . views_buffer . len ( ) ) ;
353+ // (2) len > `MAX_INLINE_VIEW_LEN` and len < `max_deduplication_len`
354+ let can_deduplicate = match self . max_deduplication_len {
355+ Some ( max_deduplication_len) => length < max_deduplication_len,
356+ None => true ,
357+ } ;
358+ if can_deduplicate {
359+ if let Some ( ( mut ht, hasher) ) = self . string_tracker . take ( ) {
360+ let hash_val = hasher. hash_one ( v) ;
361+ let hasher_fn = |v : & _ | hasher. hash_one ( v) ;
362+
363+ let entry = ht. entry (
364+ hash_val,
365+ |idx| {
366+ let stored_value = self . get_value ( * idx) ;
367+ v == stored_value
368+ } ,
369+ hasher_fn,
370+ ) ;
371+ match entry {
372+ Entry :: Occupied ( occupied) => {
373+ // If the string already exists, we will directly use the view
374+ let idx = occupied. get ( ) ;
375+ self . views_buffer . push ( self . views_buffer [ * idx] ) ;
376+ self . null_buffer_builder . append_non_null ( ) ;
377+ self . string_tracker = Some ( ( ht, hasher) ) ;
378+ return Ok ( ( ) ) ;
379+ }
380+ Entry :: Vacant ( vacant) => {
381+ // o.w. we insert the (string hash -> view index)
382+ // the idx is current length of views_builder, as we are inserting a new view
383+ vacant. insert ( self . views_buffer . len ( ) ) ;
384+ }
363385 }
386+ self . string_tracker = Some ( ( ht, hasher) ) ;
364387 }
365- self . string_tracker = Some ( ( ht, hasher) ) ;
366388 }
367389
368390 let required_cap = self . in_progress . len ( ) + v. len ( ) ;
@@ -636,8 +658,58 @@ pub fn make_view(data: &[u8], block_id: u32, offset: u32) -> u128 {
636658mod tests {
637659 use core:: str;
638660
661+ use arrow_buffer:: ArrowNativeType ;
662+
639663 use super :: * ;
640664
/// Checks which appended values end up in the deduplication tracker when a
/// maximum deduplication length is configured.
#[test]
fn test_string_max_deduplication_len() {
    // Short enough to be stored inline.
    let value_1 = "short";
    // Longer than the configured deduplication limit.
    let value_2 = "not so similar string but long";
    // Too long to be inlined, but below the deduplication limit.
    let value_3 = "1234567890123";

    let mut builder = StringViewBuilder::new()
        .with_deduplicate_strings()
        .with_max_deduplication_len(MAX_INLINE_VIEW_LEN * 2);
    let max_deduplication_len = builder
        .max_deduplication_len
        .expect("max_deduplication_len was configured above");

    // Sanity-check the fixture so the assertions below test what they claim to.
    assert!(builder.string_tracker.is_some());
    assert!(max_deduplication_len > MAX_INLINE_VIEW_LEN);
    assert!(value_1.len() < MAX_INLINE_VIEW_LEN.as_usize());
    assert!(value_2.len() > max_deduplication_len.as_usize());
    assert!(
        value_3.len() > MAX_INLINE_VIEW_LEN.as_usize()
            && value_3.len() < max_deduplication_len.as_usize()
    );

    // Looks `v` up in the builder's tracker and returns the recorded view index.
    // The tracker is borrowed rather than cloned: a lookup only needs shared access.
    let value_checker = |v: &[u8], builder: &StringViewBuilder| {
        let (ht, hasher) = builder
            .string_tracker
            .as_ref()
            .expect("deduplication was enabled above");
        let hash_val = hasher.hash_one(v);

        ht.find(hash_val, |idx| v == builder.get_value(*idx))
            .copied()
    };

    // value_1 must NOT be recorded in the string tracker.
    let v: &[u8] = value_1.as_ref();
    builder.append_value(value_1);
    assert!(value_checker(v, &builder).is_none());

    // value_2 must NOT be recorded in the string tracker.
    let v: &[u8] = value_2.as_ref();
    builder.append_value(value_2);
    assert!(value_checker(v, &builder).is_none());

    // value_3 MUST be recorded in the string tracker.
    let v: &[u8] = value_3.as_ref();
    builder.append_value(value_3);
    assert!(value_checker(v, &builder).is_some());
}
712+
641713 #[ test]
642714 fn test_string_view_deduplicate ( ) {
643715 let value_1 = "long string to test string view" ;
0 commit comments