1818use arrow:: array:: { ArrayRef , RecordBatch } ;
1919use arrow:: datatypes:: { DataType , Field , Float32Type , Float64Type , Int32Type , Int64Type , Schema } ;
2020use arrow:: util:: bench_util:: {
21- create_primitive_array_with_seed, create_string_array_with_len_range_and_prefix_and_seed,
21+ create_binary_array_with_len_range_and_prefix_and_seed, create_primitive_array_with_seed,
22+ create_string_array_with_len_range_and_prefix_and_seed,
2223} ;
2324use arrow_array:: FixedSizeBinaryArray ;
2425use bytes:: Bytes ;
@@ -32,6 +33,7 @@ use std::sync::Arc;
3233
3334#[ derive( Copy , Clone ) ]
3435pub enum ColumnType {
36+ String ( usize ) ,
3537 Binary ( usize ) ,
3638 FixedLen ( i32 ) ,
3739 Int32 ,
@@ -71,7 +73,8 @@ fn create_fsb_array_with_seed(
7173
7274fn schema ( column_type : ColumnType , num_columns : usize ) -> Arc < Schema > {
7375 let field_type = match column_type {
74- ColumnType :: Binary ( _) => DataType :: Utf8 ,
76+ ColumnType :: Binary ( _) => DataType :: Binary ,
77+ ColumnType :: String ( _) => DataType :: Utf8 ,
7578 ColumnType :: FixedLen ( size) => DataType :: FixedSizeBinary ( size) ,
7679 ColumnType :: Int32 => DataType :: Int32 ,
7780 ColumnType :: Int64 => DataType :: Int64 ,
@@ -95,7 +98,21 @@ fn create_batch(
9598 let null_density = 0.0001 ;
9699 let mut arrays: Vec < ArrayRef > = vec ! [ ] ;
97100 match column_type {
98- ColumnType :: Binary ( max_str_len) => {
101+ ColumnType :: Binary ( max_len) => {
102+ for i in 0 ..num_columns {
103+ let array_seed = seed * num_columns + i;
104+ let array = create_binary_array_with_len_range_and_prefix_and_seed :: < i32 > (
105+ num_rows,
106+ null_density,
107+ max_len / 2 ,
108+ max_len,
109+ & [ ] ,
110+ array_seed as u64 ,
111+ ) ;
112+ arrays. push ( Arc :: new ( array) ) ;
113+ }
114+ }
115+ ColumnType :: String ( max_str_len) => {
99116 for i in 0 ..num_columns {
100117 let array_seed = seed * num_columns + i;
101118 let array = create_string_array_with_len_range_and_prefix_and_seed :: < i32 > (
@@ -329,20 +346,36 @@ fn float_benches(c: &mut Criterion, column_type: ColumnType) {
329346 read_write ( c, spec, & format ! ( "{ctype} byte_stream_split" ) ) ;
330347}
331348
332- fn binary_benches ( c : & mut Criterion , max_str_len : usize ) {
333- let spec = ParquetFileSpec :: new ( ColumnType :: Binary ( max_str_len) )
349+ fn string_benches ( c : & mut Criterion , max_str_len : usize ) {
350+ let spec = ParquetFileSpec :: new ( ColumnType :: String ( max_str_len) )
351+ . with_num_columns ( 5 )
352+ . with_use_dict ( true ) ;
353+ read_write ( c, spec, & format ! ( "String({max_str_len}) dict" ) ) ;
354+
355+ let spec = spec. with_use_dict ( false ) . with_encoding ( Encoding :: PLAIN ) ;
356+ read_write ( c, spec, & format ! ( "String({max_str_len}) plain" ) ) ;
357+
358+ let spec = spec. with_encoding ( Encoding :: DELTA_LENGTH_BYTE_ARRAY ) ;
359+ read_write ( c, spec, & format ! ( "String({max_str_len}) delta_length" ) ) ;
360+
361+ let spec = spec. with_encoding ( Encoding :: DELTA_BYTE_ARRAY ) ;
362+ read_write ( c, spec, & format ! ( "String({max_str_len}) delta_byte_array" ) ) ;
363+ }
364+
365+ fn binary_benches ( c : & mut Criterion , max_len : usize ) {
366+ let spec = ParquetFileSpec :: new ( ColumnType :: Binary ( max_len) )
334367 . with_num_columns ( 5 )
335368 . with_use_dict ( true ) ;
336- read_write ( c, spec, & format ! ( "Binary({max_str_len }) dict" ) ) ;
369+ read_write ( c, spec, & format ! ( "Binary({max_len }) dict" ) ) ;
337370
338371 let spec = spec. with_use_dict ( false ) . with_encoding ( Encoding :: PLAIN ) ;
339- read_write ( c, spec, & format ! ( "Binary({max_str_len }) plain" ) ) ;
372+ read_write ( c, spec, & format ! ( "Binary({max_len }) plain" ) ) ;
340373
341374 let spec = spec. with_encoding ( Encoding :: DELTA_LENGTH_BYTE_ARRAY ) ;
342- read_write ( c, spec, & format ! ( "Binary({max_str_len }) delta_length" ) ) ;
375+ read_write ( c, spec, & format ! ( "Binary({max_len }) delta_length" ) ) ;
343376
344377 let spec = spec. with_encoding ( Encoding :: DELTA_BYTE_ARRAY ) ;
345- read_write ( c, spec, & format ! ( "Binary({max_str_len }) delta_byte_array" ) ) ;
378+ read_write ( c, spec, & format ! ( "Binary({max_len }) delta_byte_array" ) ) ;
346379}
347380
348381fn flba_benches ( c : & mut Criterion , len : i32 ) {
@@ -366,6 +399,8 @@ fn criterion_benchmark(c: &mut Criterion) {
366399 int_benches ( c, ColumnType :: Int64 ) ;
367400 float_benches ( c, ColumnType :: Float ) ;
368401 float_benches ( c, ColumnType :: Double ) ;
402+ string_benches ( c, 20 ) ;
403+ string_benches ( c, 100 ) ;
369404 binary_benches ( c, 20 ) ;
370405 binary_benches ( c, 100 ) ;
371406 flba_benches ( c, 2 ) ;
0 commit comments