Skip to content

Commit a747f22

Browse files
committed
add actual binary bench, move old binary to string
1 parent 4831595 commit a747f22

File tree

1 file changed

+44
-9
lines changed

1 file changed

+44
-9
lines changed

parquet/benches/parquet_round_trip.rs

Lines changed: 44 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,8 @@
1818
use arrow::array::{ArrayRef, RecordBatch};
1919
use arrow::datatypes::{DataType, Field, Float32Type, Float64Type, Int32Type, Int64Type, Schema};
2020
use arrow::util::bench_util::{
21-
create_primitive_array_with_seed, create_string_array_with_len_range_and_prefix_and_seed,
21+
create_binary_array_with_len_range_and_prefix_and_seed, create_primitive_array_with_seed,
22+
create_string_array_with_len_range_and_prefix_and_seed,
2223
};
2324
use arrow_array::FixedSizeBinaryArray;
2425
use bytes::Bytes;
@@ -32,6 +33,7 @@ use std::sync::Arc;
3233

3334
#[derive(Copy, Clone)]
3435
pub enum ColumnType {
36+
String(usize),
3537
Binary(usize),
3638
FixedLen(i32),
3739
Int32,
@@ -71,7 +73,8 @@ fn create_fsb_array_with_seed(
7173

7274
fn schema(column_type: ColumnType, num_columns: usize) -> Arc<Schema> {
7375
let field_type = match column_type {
74-
ColumnType::Binary(_) => DataType::Utf8,
76+
ColumnType::Binary(_) => DataType::Binary,
77+
ColumnType::String(_) => DataType::Utf8,
7578
ColumnType::FixedLen(size) => DataType::FixedSizeBinary(size),
7679
ColumnType::Int32 => DataType::Int32,
7780
ColumnType::Int64 => DataType::Int64,
@@ -95,7 +98,21 @@ fn create_batch(
9598
let null_density = 0.0001;
9699
let mut arrays: Vec<ArrayRef> = vec![];
97100
match column_type {
98-
ColumnType::Binary(max_str_len) => {
101+
ColumnType::Binary(max_len) => {
102+
for i in 0..num_columns {
103+
let array_seed = seed * num_columns + i;
104+
let array = create_binary_array_with_len_range_and_prefix_and_seed::<i32>(
105+
num_rows,
106+
null_density,
107+
max_len / 2,
108+
max_len,
109+
&[],
110+
array_seed as u64,
111+
);
112+
arrays.push(Arc::new(array));
113+
}
114+
}
115+
ColumnType::String(max_str_len) => {
99116
for i in 0..num_columns {
100117
let array_seed = seed * num_columns + i;
101118
let array = create_string_array_with_len_range_and_prefix_and_seed::<i32>(
@@ -329,20 +346,36 @@ fn float_benches(c: &mut Criterion, column_type: ColumnType) {
329346
read_write(c, spec, &format!("{ctype} byte_stream_split"));
330347
}
331348

332-
fn binary_benches(c: &mut Criterion, max_str_len: usize) {
333-
let spec = ParquetFileSpec::new(ColumnType::Binary(max_str_len))
349+
/// Runs the round-trip benchmark for `ColumnType::String(max_str_len)` columns
/// under four writer configurations, calling `read_write` once per configuration.
///
/// The spec starts with 5 columns and dictionary encoding enabled, then is
/// re-derived step by step:
///
/// 1. dictionary enabled                 -> label `String({max_str_len}) dict`
/// 2. dictionary disabled, `PLAIN`       -> label `String({max_str_len}) plain`
/// 3. `DELTA_LENGTH_BYTE_ARRAY`          -> label `String({max_str_len}) delta_length`
/// 4. `DELTA_BYTE_ARRAY`                 -> label `String({max_str_len}) delta_byte_array`
///
/// Steps 3 and 4 only change the encoding, so the "dictionary disabled" setting
/// from step 2 carries over to them.
///
/// * `c` - the Criterion handle forwarded to every `read_write` call.
/// * `max_str_len` - payload of `ColumnType::String`; also interpolated into each
///   benchmark label. Presumably the maximum generated string length, judging by
///   the name -- confirm against `create_batch`.
///
/// NOTE(review): `spec` is used again after being passed by value to
/// `read_write`, so `ParquetFileSpec` is presumably `Copy` -- its definition is
/// not visible in this view.
fn string_benches(c: &mut Criterion, max_str_len: usize) {
350+
let spec = ParquetFileSpec::new(ColumnType::String(max_str_len))
351+
.with_num_columns(5)
352+
.with_use_dict(true);
353+
read_write(c, spec, &format!("String({max_str_len}) dict"));
354+
355+
let spec = spec.with_use_dict(false).with_encoding(Encoding::PLAIN);
356+
read_write(c, spec, &format!("String({max_str_len}) plain"));
357+
358+
let spec = spec.with_encoding(Encoding::DELTA_LENGTH_BYTE_ARRAY);
359+
read_write(c, spec, &format!("String({max_str_len}) delta_length"));
360+
361+
let spec = spec.with_encoding(Encoding::DELTA_BYTE_ARRAY);
362+
read_write(c, spec, &format!("String({max_str_len}) delta_byte_array"));
363+
}
364+
365+
fn binary_benches(c: &mut Criterion, max_len: usize) {
366+
let spec = ParquetFileSpec::new(ColumnType::Binary(max_len))
334367
.with_num_columns(5)
335368
.with_use_dict(true);
336-
read_write(c, spec, &format!("Binary({max_str_len}) dict"));
369+
read_write(c, spec, &format!("Binary({max_len}) dict"));
337370

338371
let spec = spec.with_use_dict(false).with_encoding(Encoding::PLAIN);
339-
read_write(c, spec, &format!("Binary({max_str_len}) plain"));
372+
read_write(c, spec, &format!("Binary({max_len}) plain"));
340373

341374
let spec = spec.with_encoding(Encoding::DELTA_LENGTH_BYTE_ARRAY);
342-
read_write(c, spec, &format!("Binary({max_str_len}) delta_length"));
375+
read_write(c, spec, &format!("Binary({max_len}) delta_length"));
343376

344377
let spec = spec.with_encoding(Encoding::DELTA_BYTE_ARRAY);
345-
read_write(c, spec, &format!("Binary({max_str_len}) delta_byte_array"));
378+
read_write(c, spec, &format!("Binary({max_len}) delta_byte_array"));
346379
}
347380

348381
fn flba_benches(c: &mut Criterion, len: i32) {
@@ -366,6 +399,8 @@ fn criterion_benchmark(c: &mut Criterion) {
366399
int_benches(c, ColumnType::Int64);
367400
float_benches(c, ColumnType::Float);
368401
float_benches(c, ColumnType::Double);
402+
string_benches(c, 20);
403+
string_benches(c, 100);
369404
binary_benches(c, 20);
370405
binary_benches(c, 100);
371406
flba_benches(c, 2);

0 commit comments

Comments
 (0)