@@ -87,6 +87,7 @@ pub struct GenericByteViewBuilder<T: ByteViewType + ?Sized> {
8787 /// Some if deduplicating strings
8888 /// map `<string hash> -> <index to the views>`
8989 string_tracker : Option < ( HashTable < usize > , ahash:: RandomState ) > ,
90+ max_deduplication_len : u32 ,
9091 phantom : PhantomData < T > ,
9192}
9293
@@ -107,10 +108,25 @@ impl<T: ByteViewType + ?Sized> GenericByteViewBuilder<T> {
107108 current_size : STARTING_BLOCK_SIZE ,
108109 } ,
109110 string_tracker : None ,
111+ max_deduplication_len : MAX_INLINE_VIEW_LEN ,
110112 phantom : Default :: default ( ) ,
111113 }
112114 }
113115
116+ /// Configure max deduplication length when deduplicating strings while building the array.
117+ /// Default is [`MAX_INLINE_VIEW_LEN`] bytes.
118+ /// See <https://github.com/apache/arrow-rs/issues/7187> for more details on the implications.
119+ pub fn with_max_deduplication_len ( self , max_deduplication_len : u32 ) -> Self {
120+ debug_assert ! (
121+ max_deduplication_len > 0 ,
122+ "max_deduplication_len must be greater than 0"
123+ ) ;
124+ Self {
125+ max_deduplication_len,
126+ ..self
127+ }
128+ }
129+
114130 /// Set a fixed buffer size for variable length strings
115131 ///
116132 /// The block size is the size of the buffer used to store values greater
@@ -334,35 +350,37 @@ impl<T: ByteViewType + ?Sized> GenericByteViewBuilder<T> {
334350
335351 // Deduplication if:
336352 // (1) deduplication is enabled.
337- // (2) len > 12
338- if let Some ( ( mut ht, hasher) ) = self . string_tracker . take ( ) {
339- let hash_val = hasher. hash_one ( v) ;
340- let hasher_fn = |v : & _ | hasher. hash_one ( v) ;
341-
342- let entry = ht. entry (
343- hash_val,
344- |idx| {
345- let stored_value = self . get_value ( * idx) ;
346- v == stored_value
347- } ,
348- hasher_fn,
349- ) ;
350- match entry {
351- Entry :: Occupied ( occupied) => {
352- // If the string already exists, we will directly use the view
353- let idx = occupied. get ( ) ;
354- self . views_buffer . push ( self . views_buffer [ * idx] ) ;
355- self . null_buffer_builder . append_non_null ( ) ;
356- self . string_tracker = Some ( ( ht, hasher) ) ;
357- return Ok ( ( ) ) ;
358- }
359- Entry :: Vacant ( vacant) => {
360- // o.w. we insert the (string hash -> view index)
361- // the idx is current length of views_builder, as we are inserting a new view
362- vacant. insert ( self . views_buffer . len ( ) ) ;
353+ // (2) len > `max_deduplication_len`
354+ if length > self . max_deduplication_len {
355+ if let Some ( ( mut ht, hasher) ) = self . string_tracker . take ( ) {
356+ let hash_val = hasher. hash_one ( v) ;
357+ let hasher_fn = |v : & _ | hasher. hash_one ( v) ;
358+
359+ let entry = ht. entry (
360+ hash_val,
361+ |idx| {
362+ let stored_value = self . get_value ( * idx) ;
363+ v == stored_value
364+ } ,
365+ hasher_fn,
366+ ) ;
367+ match entry {
368+ Entry :: Occupied ( occupied) => {
369+ // If the string already exists, we will directly use the view
370+ let idx = occupied. get ( ) ;
371+ self . views_buffer . push ( self . views_buffer [ * idx] ) ;
372+ self . null_buffer_builder . append_non_null ( ) ;
373+ self . string_tracker = Some ( ( ht, hasher) ) ;
374+ return Ok ( ( ) ) ;
375+ }
376+ Entry :: Vacant ( vacant) => {
377+ // o.w. we insert the (string hash -> view index)
378+ // the idx is current length of views_builder, as we are inserting a new view
379+ vacant. insert ( self . views_buffer . len ( ) ) ;
380+ }
363381 }
382+ self . string_tracker = Some ( ( ht, hasher) ) ;
364383 }
365- self . string_tracker = Some ( ( ht, hasher) ) ;
366384 }
367385
368386 let required_cap = self . in_progress . len ( ) + v. len ( ) ;
@@ -638,6 +656,39 @@ mod tests {
638656
639657 use super :: * ;
640658
/// Verifies that, with deduplication enabled and the threshold set through
/// `with_max_deduplication_len(6)`, a value shorter than the threshold is not recorded
/// in `string_tracker`, while a value longer than the threshold is.
#[test]
fn test_string_view_max_view_len() {
    let value_1 = "short";
    let value_2 = "not so similar string but long";

    let mut builder = StringViewBuilder::new()
        .with_deduplicate_strings()
        .with_max_deduplication_len(6);
    assert!(builder.string_tracker.is_some());

    // Looks `v` up in the builder's deduplication table and returns the view index stored
    // for it, if any. The tracker is only borrowed: a lookup does not need its own copy of
    // the hash table and hasher state.
    let value_checker = |v: &[u8], builder: &StringViewBuilder| {
        let (ht, hasher) = builder
            .string_tracker
            .as_ref()
            .expect("string_tracker is set by with_deduplicate_strings");
        let hash_val = hasher.hash_one(v);

        ht.find(hash_val, |idx| v == builder.get_value(*idx))
            .copied()
    };

    // Append the short value; it MUST NOT be in the string_tracker.
    let v: &[u8] = value_1.as_ref();
    builder.append_value(value_1);
    assert!(
        value_checker(v, &builder).is_none(),
        "short value must not be tracked for deduplication"
    );

    // Append the long value; it MUST be in the string_tracker.
    let v: &[u8] = value_2.as_ref();
    builder.append_value(value_2);
    assert!(
        value_checker(v, &builder).is_some(),
        "long value must be tracked for deduplication"
    );
}
691+
641692 #[ test]
642693 fn test_string_view_deduplicate ( ) {
643694 let value_1 = "long string to test string view" ;
0 commit comments