@@ -87,6 +87,7 @@ pub struct GenericByteViewBuilder<T: ByteViewType + ?Sized> {
8787 /// Some if deduplicating strings
8888 /// map `<string hash> -> <index to the views>`
8989 string_tracker : Option < ( HashTable < usize > , ahash:: RandomState ) > ,
90+ max_deduplication_len : u32 ,
9091 phantom : PhantomData < T > ,
9192}
9293
@@ -107,10 +108,25 @@ impl<T: ByteViewType + ?Sized> GenericByteViewBuilder<T> {
107108 current_size : STARTING_BLOCK_SIZE ,
108109 } ,
109110 string_tracker : None ,
111+ max_deduplication_len : MAX_INLINE_VIEW_LEN ,
110112 phantom : Default :: default ( ) ,
111113 }
112114 }
113115
116+ /// Configure max deduplication length when deduplicating strings while building the array.
117+ /// Default is [`MAX_INLINE_VIEW_LEN`] bytes.
118+ /// See <https://github.com/apache/arrow-rs/issues/7187> for more details on the implications.
119+ pub fn with_max_deduplication_len ( self , max_deduplication_len : u32 ) -> Self {
120+ debug_assert ! (
121+ max_deduplication_len > 0 ,
122+ "max_deduplication_len must be greater than 0"
123+ ) ;
124+ Self {
125+ max_deduplication_len,
126+ ..self
127+ }
128+ }
129+
114130 /// Set a fixed buffer size for variable length strings
115131 ///
116132 /// The block size is the size of the buffer used to store values greater
@@ -334,35 +350,37 @@ impl<T: ByteViewType + ?Sized> GenericByteViewBuilder<T> {
334350
335351 // Deduplication if:
336352 // (1) deduplication is enabled.
337- // (2) len > 12
338- if let Some ( ( mut ht, hasher) ) = self . string_tracker . take ( ) {
339- let hash_val = hasher. hash_one ( v) ;
340- let hasher_fn = |v : & _ | hasher. hash_one ( v) ;
341-
342- let entry = ht. entry (
343- hash_val,
344- |idx| {
345- let stored_value = self . get_value ( * idx) ;
346- v == stored_value
347- } ,
348- hasher_fn,
349- ) ;
350- match entry {
351- Entry :: Occupied ( occupied) => {
352- // If the string already exists, we will directly use the view
353- let idx = occupied. get ( ) ;
354- self . views_buffer . push ( self . views_buffer [ * idx] ) ;
355- self . null_buffer_builder . append_non_null ( ) ;
356- self . string_tracker = Some ( ( ht, hasher) ) ;
357- return Ok ( ( ) ) ;
358- }
359- Entry :: Vacant ( vacant) => {
360- // o.w. we insert the (string hash -> view index)
361- // the idx is current length of views_builder, as we are inserting a new view
362- vacant. insert ( self . views_buffer . len ( ) ) ;
353+ // (2) len > `max_deduplication_len`
354+ if length > self . max_deduplication_len {
355+ if let Some ( ( mut ht, hasher) ) = self . string_tracker . take ( ) {
356+ let hash_val = hasher. hash_one ( v) ;
357+ let hasher_fn = |v : & _ | hasher. hash_one ( v) ;
358+
359+ let entry = ht. entry (
360+ hash_val,
361+ |idx| {
362+ let stored_value = self . get_value ( * idx) ;
363+ v == stored_value
364+ } ,
365+ hasher_fn,
366+ ) ;
367+ match entry {
368+ Entry :: Occupied ( occupied) => {
369+ // If the string already exists, we will directly use the view
370+ let idx = occupied. get ( ) ;
371+ self . views_buffer . push ( self . views_buffer [ * idx] ) ;
372+ self . null_buffer_builder . append_non_null ( ) ;
373+ self . string_tracker = Some ( ( ht, hasher) ) ;
374+ return Ok ( ( ) ) ;
375+ }
376+ Entry :: Vacant ( vacant) => {
377+ // o.w. we insert the (string hash -> view index)
378+ // the idx is current length of views_builder, as we are inserting a new view
379+ vacant. insert ( self . views_buffer . len ( ) ) ;
380+ }
363381 }
382+ self . string_tracker = Some ( ( ht, hasher) ) ;
364383 }
365- self . string_tracker = Some ( ( ht, hasher) ) ;
366384 }
367385
368386 let required_cap = self . in_progress . len ( ) + v. len ( ) ;
@@ -636,8 +654,56 @@ pub fn make_view(data: &[u8], block_id: u32, offset: u32) -> u128 {
636654mod tests {
637655 use core:: str;
638656
657+ use arrow_buffer:: ArrowNativeType ;
658+
639659 use super :: * ;
640660
/// Checks which appended values end up in the deduplication tracker when a custom
/// `max_deduplication_len` (twice `MAX_INLINE_VIEW_LEN`) is configured.
#[test]
fn test_string_max_deduplication_len() {
    let value_1 = "short";
    let value_2 = "not so similar string but long";
    let value_3 = "1234567890123";

    let mut builder = StringViewBuilder::new()
        .with_deduplicate_strings()
        .with_max_deduplication_len(MAX_INLINE_VIEW_LEN * 2);

    // Sanity-check the fixture: one inline-sized value, one above the configured
    // threshold, and one that is larger than an inline view but below the threshold.
    assert!(builder.string_tracker.is_some());
    assert!(builder.max_deduplication_len > MAX_INLINE_VIEW_LEN);
    assert!(value_1.len() < MAX_INLINE_VIEW_LEN.as_usize());
    assert!(value_2.len() > builder.max_deduplication_len.as_usize());
    assert!(
        value_3.len() > MAX_INLINE_VIEW_LEN.as_usize()
            && value_3.len() < builder.max_deduplication_len.as_usize()
    );

    // Looks `v` up in the builder's deduplication tracker and returns the view index
    // it is registered under, if any. The tracker is only borrowed; there is no need
    // to clone the whole hash table for a read-only lookup.
    let tracked_index = |v: &[u8], builder: &StringViewBuilder| {
        let (ht, hasher) = builder
            .string_tracker
            .as_ref()
            .expect("deduplication was enabled when the builder was created");
        let hash_val = hasher.hash_one(v);

        ht.find(hash_val, |idx| {
            let stored_value = builder.get_value(*idx);
            v == stored_value
        })
        .copied()
    };

    // value_1 does not exceed the threshold, so it must NOT be in the string_tracker.
    let v: &[u8] = value_1.as_ref();
    builder.append_value(value_1);
    assert!(tracked_index(v, &builder).is_none());

    // value_2 exceeds the threshold, so it MUST be in the string_tracker.
    let v: &[u8] = value_2.as_ref();
    builder.append_value(value_2);
    assert!(tracked_index(v, &builder).is_some());

    // value_3 is longer than an inline view but does not exceed the threshold,
    // so it must NOT be in the string_tracker.
    let v: &[u8] = value_3.as_ref();
    builder.append_value(value_3);
    assert!(tracked_index(v, &builder).is_none());
}
706+
641707 #[ test]
642708 fn test_string_view_deduplicate ( ) {
643709 let value_1 = "long string to test string view" ;
0 commit comments