@@ -23,12 +23,15 @@ use parquet_variant::{EMPTY_VARIANT_METADATA_BYTES, Variant, VariantBuilder};
2323use parquet_variant_compute:: {
2424 GetOptions , VariantArray , VariantArrayBuilder , json_to_variant, variant_get,
2525} ;
26+ use parquet_variant_json:: append_json;
2627use rand:: Rng ;
2728use rand:: SeedableRng ;
2829use rand:: distr:: Alphanumeric ;
2930use rand:: rngs:: StdRng ;
31+ use serde_json:: Value ;
3032use std:: fmt:: Write ;
3133use std:: sync:: Arc ;
34+
3235fn benchmark_batch_json_string_to_variant ( c : & mut Criterion ) {
3336 let input_array = StringArray :: from_iter_values ( json_repeated_struct ( 8000 ) ) ;
3437 let array_ref: ArrayRef = Arc :: new ( input_array) ;
@@ -66,6 +69,58 @@ fn benchmark_batch_json_string_to_variant(c: &mut Criterion) {
6669 } ) ;
6770 } ) ;
6871
72+ let input_array = StringArray :: from_iter_values ( random_structure ( 8000 , 200 ) ) ;
73+ let total_input_bytes = input_array
74+ . iter ( )
75+ . flatten ( ) // filter None
76+ . map ( |v| v. len ( ) )
77+ . sum :: < usize > ( ) ;
78+ let id = format ! (
79+ "batch_json_string_to_variant object - 1 depth(200 fields) random_json({} bytes per document)" ,
80+ total_input_bytes / input_array. len( )
81+ ) ;
82+ let array_ref: ArrayRef = Arc :: new ( input_array) ;
83+ let string_array = array_ref. as_any ( ) . downcast_ref :: < StringArray > ( ) . unwrap ( ) ;
84+ let mut json_array: Vec < Value > = Vec :: with_capacity ( string_array. len ( ) ) ;
85+ for i in 0 ..string_array. len ( ) {
86+ json_array. push ( serde_json:: from_str ( string_array. value ( i) ) . unwrap ( ) ) ;
87+ }
88+ c. bench_function ( & id, |b| {
89+ b. iter ( || {
90+ let mut variant_array_builder = VariantArrayBuilder :: new ( string_array. len ( ) ) ;
91+ for json in & json_array {
92+ append_json ( json, & mut variant_array_builder) . unwrap ( ) ;
93+ }
94+ let _ = variant_array_builder. build ( ) ;
95+ } ) ;
96+ } ) ;
97+
98+ let input_array = StringArray :: from_iter_values ( random_structure ( 8000 , 100 ) ) ;
99+ let total_input_bytes = input_array
100+ . iter ( )
101+ . flatten ( ) // filter None
102+ . map ( |v| v. len ( ) )
103+ . sum :: < usize > ( ) ;
104+ let id = format ! (
105+ "batch_json_string_to_variant object - 1 depth(100 fields) random_json({} bytes per document)" ,
106+ total_input_bytes / input_array. len( )
107+ ) ;
108+ let array_ref: ArrayRef = Arc :: new ( input_array) ;
109+ let string_array = array_ref. as_any ( ) . downcast_ref :: < StringArray > ( ) . unwrap ( ) ;
110+ let mut json_array: Vec < Value > = Vec :: with_capacity ( string_array. len ( ) ) ;
111+ for i in 0 ..string_array. len ( ) {
112+ json_array. push ( serde_json:: from_str ( string_array. value ( i) ) . unwrap ( ) ) ;
113+ }
114+ c. bench_function ( & id, |b| {
115+ b. iter ( || {
116+ let mut variant_array_builder = VariantArrayBuilder :: new ( string_array. len ( ) ) ;
117+ for json in & json_array {
118+ append_json ( json, & mut variant_array_builder) . unwrap ( ) ;
119+ }
120+ let _ = variant_array_builder. build ( ) ;
121+ } ) ;
122+ } ) ;
123+
69124 let input_array = StringArray :: from_iter_values ( random_json_structure ( 8000 ) ) ;
70125 let total_input_bytes = input_array
71126 . iter ( )
@@ -240,6 +295,22 @@ fn random_json_structure(count: usize) -> impl Iterator<Item = String> {
240295 ( 0 ..count) . map ( move |_| generator. next ( ) . to_string ( ) )
241296}
242297
298+ fn random_structure ( count : usize , max_fields : usize ) -> impl Iterator < Item = String > {
299+ let mut generator = RandomJsonGenerator {
300+ null_weight : 5 ,
301+ string_weight : 25 ,
302+ number_weight : 25 ,
303+ boolean_weight : 10 ,
304+ object_weight : 25 ,
305+ array_weight : 0 ,
306+ max_fields,
307+ max_array_length : 0 ,
308+ max_depth : 1 ,
309+ ..Default :: default ( )
310+ } ;
311+ ( 0 ..count) . map ( move |_| generator. next_object ( ) . to_string ( ) )
312+ }
313+
243314/// Creates JSON with random structure and fields.
244315///
245316/// Each type is created in proportion controlled by the
@@ -299,6 +370,82 @@ impl RandomJsonGenerator {
299370 & self . output_buffer
300371 }
301372
373+ fn next_object ( & mut self ) -> & str {
374+ self . output_buffer . clear ( ) ;
375+ self . append_random_json_for_object ( ) ;
376+ & self . output_buffer
377+ }
378+
379+ fn append_random_json_for_object ( & mut self ) {
380+ // use destructuring to ensure each field is used
381+ let Self {
382+ rng,
383+ null_weight,
384+ string_weight,
385+ number_weight,
386+ boolean_weight,
387+ max_fields,
388+ output_buffer,
389+ ..
390+ } = self ;
391+
392+ write ! ( output_buffer, "{{" ) . unwrap ( ) ;
393+ for i in 0 ..* max_fields {
394+ let key_length = rng. random_range ( 1 ..=20 ) ;
395+ let key: String = ( 0 ..key_length)
396+ . map ( |_| rng. sample ( Alphanumeric ) as char )
397+ . collect ( ) ;
398+ write ! ( output_buffer, "\" {key}\" :" ) . unwrap ( ) ;
399+
400+ let total_weight = * null_weight + * string_weight + * number_weight + * boolean_weight;
401+
402+ // Generate a random number to determine the type
403+ let mut random_value: usize = rng. random_range ( 0 ..total_weight) ;
404+
405+ if random_value <= * null_weight {
406+ write ! ( output_buffer, "null" ) . unwrap ( ) ;
407+ } else {
408+ random_value -= * null_weight;
409+
410+ if random_value <= * string_weight {
411+ // Generate a random string between 1 and 20 characters
412+ let length = rng. random_range ( 1 ..=20 ) ;
413+ let random_string: String = ( 0 ..length)
414+ . map ( |_| rng. sample ( Alphanumeric ) as char )
415+ . collect ( ) ;
416+ write ! ( output_buffer, "\" {random_string}\" " , ) . unwrap ( ) ;
417+ } else {
418+ random_value -= * string_weight;
419+
420+ if random_value <= * number_weight {
421+ // 50% chance of generating an integer or a float
422+ if rng. random_bool ( 0.5 ) {
423+ // Generate a random integer
424+ let random_integer: i64 = rng. random_range ( -1000 ..1000 ) ;
425+ write ! ( output_buffer, "{random_integer}" , ) . unwrap ( ) ;
426+ } else {
427+ // Generate a random float
428+ let random_float: f64 = rng. random_range ( -1000.0 ..1000.0 ) ;
429+ write ! ( output_buffer, "{random_float}" , ) . unwrap ( ) ;
430+ }
431+ } else {
432+ random_value -= * number_weight;
433+
434+ if random_value <= * boolean_weight {
435+ // Generate a random boolean
436+ let random_boolean: bool = rng. random ( ) ;
437+ write ! ( output_buffer, "{random_boolean}" , ) . unwrap ( ) ;
438+ }
439+ }
440+ }
441+ }
442+ if i < * max_fields - 1 {
443+ write ! ( output_buffer, "," ) . unwrap ( ) ;
444+ }
445+ }
446+ write ! ( & mut self . output_buffer, "}}" ) . unwrap ( ) ;
447+ }
448+
302449 /// Appends a random JSON value to the output buffer.
303450 fn append_random_json ( & mut self , current_depth : usize ) {
304451 // use destructuring to ensure each field is used
0 commit comments