From 8653d3c0894eafe3873ee8271e135d8c89c9efb9 Mon Sep 17 00:00:00 2001 From: Weijun-H Date: Fri, 2 Jan 2026 17:44:31 +0200 Subject: [PATCH 1/7] perf: optimize hex decoding in json --- arrow-json/benches/reader.rs | 138 ++++++++++++++++++++++++++ arrow-json/src/reader/binary_array.rs | 104 ++++++++++++++----- 2 files changed, 218 insertions(+), 24 deletions(-) create mode 100644 arrow-json/benches/reader.rs diff --git a/arrow-json/benches/reader.rs b/arrow-json/benches/reader.rs new file mode 100644 index 000000000000..66c26099fb9e --- /dev/null +++ b/arrow-json/benches/reader.rs @@ -0,0 +1,138 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use arrow_json::ReaderBuilder; +use arrow_schema::{DataType, Field}; +use criterion::{criterion_group, criterion_main, Criterion}; +use std::fmt::Write; +use std::hint::black_box; +use std::sync::Arc; + +const BINARY_ROWS: usize = 1 << 15; +const LIST_ROWS: usize = 1 << 14; +const BINARY_BYTES: usize = 64; +const LIST_LEN: usize = 32; + +fn bench_decode(c: &mut Criterion, name: &str, data: &[u8], field: Arc, rows: usize) { + c.bench_function(name, |b| { + b.iter(|| { + let mut decoder = ReaderBuilder::new_with_field(field.clone()) + .with_batch_size(rows) + .build_decoder() + .unwrap(); + + let mut offset = 0; + while offset < data.len() { + let read = decoder.decode(black_box(&data[offset..])).unwrap(); + if read == 0 { + break; + } + offset += read; + } + + let batch = decoder.flush().unwrap(); + black_box(batch); + }) + }); +} + +#[inline] +fn append_hex_byte(buf: &mut String, byte: u8) { + const HEX: &[u8; 16] = b"0123456789abcdef"; + buf.push(HEX[(byte >> 4) as usize] as char); + buf.push(HEX[(byte & 0x0f) as usize] as char); +} + +fn build_hex_lines(rows: usize, bytes_per_row: usize) -> Vec { + let mut data = String::with_capacity(rows * (bytes_per_row * 2 + 3)); + for row in 0..rows { + data.push('"'); + for i in 0..bytes_per_row { + let byte = ((row + i) & 0xff) as u8; + append_hex_byte(&mut data, byte); + } + data.push('"'); + data.push('\n'); + } + data.into_bytes() +} + +fn build_numeric_list_lines(rows: usize, list_len: usize) -> Vec { + let mut data = String::with_capacity(rows * (list_len * 6 + 3)); + for row in 0..rows { + data.push('['); + for i in 0..list_len { + let value = (row as i64) * (list_len as i64) + i as i64; + let _ = write!(data, "{value}"); + if i + 1 != list_len { + data.push(','); + } + } + data.push(']'); + data.push('\n'); + } + data.into_bytes() +} + +fn criterion_benchmark(c: &mut Criterion) { + let binary_data = build_hex_lines(BINARY_ROWS, BINARY_BYTES); + + let binary_field = Arc::new(Field::new("item", DataType::Binary, false)); + bench_decode( + c, + "decode_binary_hex_json", + &binary_data, + binary_field, + BINARY_ROWS, + ); + + let fixed_field = Arc::new(Field::new( + "item", + DataType::FixedSizeBinary(BINARY_BYTES as i32), + false, + )); + bench_decode( + c, + "decode_fixed_binary_hex_json", + &binary_data, + fixed_field, + BINARY_ROWS, + ); + + let view_field = Arc::new(Field::new("item", DataType::BinaryView, false)); + bench_decode( + c, + "decode_binary_view_hex_json", + &binary_data, + view_field, + BINARY_ROWS, + ); + + let list_data = build_numeric_list_lines(LIST_ROWS, LIST_LEN); + let element = Arc::new(Field::new("element", DataType::Int64, false)); + let list_field = Arc::new(Field::new_list("item", element, false)); + bench_decode( + c, + "decode_numeric_dense_list_i64_json", + &list_data, + list_field, + LIST_ROWS, + ); +} + +criterion_group!(benches, criterion_benchmark); +criterion_main!(benches); diff --git a/arrow-json/src/reader/binary_array.rs b/arrow-json/src/reader/binary_array.rs index a71569d57f00..a223dfd8ba7f 100644 --- a/arrow-json/src/reader/binary_array.rs +++ b/arrow-json/src/reader/binary_array.rs @@ -15,30 +15,84 @@ // specific language governing permissions and limitations // under the License. -use arrow_array::builder::{ - BinaryViewBuilder, FixedSizeBinaryBuilder, GenericBinaryBuilder, GenericStringBuilder, -}; +use arrow_array::builder::{BinaryViewBuilder, FixedSizeBinaryBuilder, GenericBinaryBuilder}; use arrow_array::{Array, GenericStringArray, OffsetSizeTrait}; use arrow_data::ArrayData; use arrow_schema::ArrowError; +use std::io::Write; use std::marker::PhantomData; use crate::reader::ArrayDecoder; use crate::reader::tape::{Tape, TapeElement}; -/// Decode a hex-encoded string into bytes -fn decode_hex_string(hex_string: &str) -> Result, ArrowError> { - let mut decoded = Vec::with_capacity(hex_string.len() / 2); - for substr in hex_string.as_bytes().chunks(2) { - let str = std::str::from_utf8(substr).map_err(|e| { - ArrowError::JsonError(format!("invalid utf8 in hex encoded binary data: {e}")) - })?; - let byte = u8::from_str_radix(str, 16).map_err(|e| { - ArrowError::JsonError(format!("invalid hex encoding in binary data: {e}")) - })?; - decoded.push(byte); +#[inline] +fn decode_hex_digit(byte: u8) -> Option { + match byte { + b'0'..=b'9' => Some(byte - b'0'), + b'a'..=b'f' => Some(byte - b'a' + 10), + b'A'..=b'F' => Some(byte - b'A' + 10), + _ => None, } - Ok(decoded) +} + +fn invalid_hex_error() -> ArrowError { + ArrowError::JsonError("invalid hex encoding in binary data: invalid digit found in string".to_string()) +} + +fn decode_hex_to_vec(hex_string: &str, out: &mut Vec) -> Result<(), ArrowError> { + let bytes = hex_string.as_bytes(); + out.reserve((bytes.len() + 1) / 2); + + let mut iter = bytes.chunks_exact(2); + for pair in &mut iter { + let high = decode_hex_digit(pair[0]).ok_or_else(invalid_hex_error)?; + let low = decode_hex_digit(pair[1]).ok_or_else(invalid_hex_error)?; + out.push((high << 4) | low); + } + + let remainder = iter.remainder(); + if !remainder.is_empty() { + let low = decode_hex_digit(remainder[0]).ok_or_else(invalid_hex_error)?; + out.push(low); + } + + Ok(()) +} + +fn decode_hex_to_writer(hex_string: &str, writer: &mut W) -> Result<(), ArrowError> { + let bytes = hex_string.as_bytes(); + let mut iter = bytes.chunks_exact(2); + let mut buffer = [0u8; 64]; + let mut buffered = 0; + + for pair in &mut iter { + let high = decode_hex_digit(pair[0]).ok_or_else(invalid_hex_error)?; + let low = decode_hex_digit(pair[1]).ok_or_else(invalid_hex_error)?; + buffer[buffered] = (high << 4) | low; + buffered += 1; + + if buffered == buffer.len() { + writer + .write_all(&buffer) + .map_err(|e| ArrowError::JsonError(format!("failed to write binary data: {e}")))?; + buffered = 0; + } + } + + let remainder = iter.remainder(); + if !remainder.is_empty() { + let low = decode_hex_digit(remainder[0]).ok_or_else(invalid_hex_error)?; + buffer[buffered] = low; + buffered += 1; + } + + if buffered > 0 { + writer + .write_all(&buffer[..buffered]) + .map_err(|e| ArrowError::JsonError(format!("failed to write binary data: {e}")))?; + } + + Ok(()) } #[derive(Default)] @@ -59,14 +113,12 @@ impl ArrayDecoder for BinaryArrayDecoder { let mut builder = GenericBinaryBuilder::::with_capacity(pos.len(), data_capacity); - GenericStringBuilder::::with_capacity(pos.len(), data_capacity); - for p in pos { match tape.get(*p) { TapeElement::String(idx) => { let string = tape.get_string(idx); - let decoded = decode_hex_string(string)?; - builder.append_value(&decoded); + decode_hex_to_writer(string, &mut builder)?; + builder.append_value(b""); } TapeElement::Null => builder.append_null(), _ => unreachable!(), @@ -91,13 +143,15 @@ impl FixedSizeBinaryArrayDecoder { impl ArrayDecoder for FixedSizeBinaryArrayDecoder { fn decode(&mut self, tape: &Tape<'_>, pos: &[u32]) -> Result { let mut builder = FixedSizeBinaryBuilder::with_capacity(pos.len(), self.len); + let mut scratch = Vec::with_capacity(self.len as usize); for p in pos { match tape.get(*p) { TapeElement::String(idx) => { let string = tape.get_string(idx); - let decoded = decode_hex_string(string)?; - builder.append_value(&decoded)?; + scratch.clear(); + decode_hex_to_vec(string, &mut scratch)?; + builder.append_value(&scratch)?; } TapeElement::Null => builder.append_null(), _ => unreachable!(), @@ -115,13 +169,15 @@ impl ArrayDecoder for BinaryViewDecoder { fn decode(&mut self, tape: &Tape<'_>, pos: &[u32]) -> Result { let data_capacity = estimate_data_capacity(tape, pos)?; let mut builder = BinaryViewBuilder::with_capacity(data_capacity); + let mut scratch = Vec::new(); for p in pos { match tape.get(*p) { TapeElement::String(idx) => { let string = tape.get_string(idx); - let decoded = decode_hex_string(string)?; - builder.append_value(&decoded); + scratch.clear(); + decode_hex_to_vec(string, &mut scratch)?; + builder.append_value(&scratch); } TapeElement::Null => builder.append_null(), _ => unreachable!(), @@ -139,7 +195,7 @@ fn estimate_data_capacity(tape: &Tape<'_>, pos: &[u32]) -> Result { let string_len = tape.get_string(idx).len(); // two hex characters represent one byte - let decoded_len = string_len / 2; + let decoded_len = (string_len + 1) / 2; data_capacity += decoded_len; } TapeElement::Null => {} From bf3cc14c7aeddd8aad320b90de0d6833c329892a Mon Sep 17 00:00:00 2001 From: Weijun-H Date: Fri, 2 Jan 2026 17:47:33 +0200 Subject: [PATCH 2/7] chore --- arrow-json/benches/reader.rs | 13 +------------ arrow-json/src/reader/binary_array.rs | 4 +++- 2 files changed, 4 insertions(+), 13 deletions(-) diff --git a/arrow-json/benches/reader.rs b/arrow-json/benches/reader.rs index 66c26099fb9e..5410d951e93d 100644 --- a/arrow-json/benches/reader.rs +++ b/arrow-json/benches/reader.rs @@ -17,7 +17,7 @@ use arrow_json::ReaderBuilder; use arrow_schema::{DataType, Field}; -use criterion::{criterion_group, criterion_main, Criterion}; +use criterion::{Criterion, criterion_group, criterion_main}; use std::fmt::Write; use std::hint::black_box; use std::sync::Arc; @@ -121,17 +121,6 @@ fn criterion_benchmark(c: &mut Criterion) { view_field, BINARY_ROWS, ); - - let list_data = build_numeric_list_lines(LIST_ROWS, LIST_LEN); - let element = Arc::new(Field::new("element", DataType::Int64, false)); - let list_field = Arc::new(Field::new_list("item", element, false)); - bench_decode( - c, - "decode_numeric_dense_list_i64_json", - &list_data, - list_field, - LIST_ROWS, - ); } criterion_group!(benches, criterion_benchmark); diff --git a/arrow-json/src/reader/binary_array.rs b/arrow-json/src/reader/binary_array.rs index a223dfd8ba7f..85c6bbc34d08 100644 --- a/arrow-json/src/reader/binary_array.rs +++ b/arrow-json/src/reader/binary_array.rs @@ -36,7 +36,9 @@ fn decode_hex_digit(byte: u8) -> Option { } fn invalid_hex_error() -> ArrowError { - ArrowError::JsonError("invalid hex encoding in binary data: invalid digit found in string".to_string()) + ArrowError::JsonError( + "invalid hex encoding in binary data: invalid digit found in string".to_string(), + ) } fn decode_hex_to_vec(hex_string: &str, out: &mut Vec) -> Result<(), ArrowError> { From 303785d49cbb0e6880039de84686ef1e4a001762 Mon Sep 17 00:00:00 2001 From: Weijun-H Date: Fri, 2 Jan 2026 18:06:33 +0200 Subject: [PATCH 3/7] chore --- arrow-json/src/reader/binary_array.rs | 95 +++++++++++++++++++++++---- 1 file changed, 83 insertions(+), 12 deletions(-) diff --git a/arrow-json/src/reader/binary_array.rs b/arrow-json/src/reader/binary_array.rs index 85c6bbc34d08..db77b8ed9ef9 100644 --- a/arrow-json/src/reader/binary_array.rs +++ b/arrow-json/src/reader/binary_array.rs @@ -35,10 +35,10 @@ fn decode_hex_digit(byte: u8) -> Option { } } -fn invalid_hex_error() -> ArrowError { - ArrowError::JsonError( - "invalid hex encoding in binary data: invalid digit found in string".to_string(), - ) +fn invalid_hex_error_at(index: usize, byte: u8) -> ArrowError { + ArrowError::JsonError(format!( + "invalid hex encoding in binary data: invalid digit 0x{byte:02x} at position {index}" + )) } fn decode_hex_to_vec(hex_string: &str, out: &mut Vec) -> Result<(), ArrowError> { @@ -46,15 +46,19 @@ fn decode_hex_to_vec(hex_string: &str, out: &mut Vec) -> Result<(), ArrowErr out.reserve((bytes.len() + 1) / 2); let mut iter = bytes.chunks_exact(2); - for pair in &mut iter { - let high = decode_hex_digit(pair[0]).ok_or_else(invalid_hex_error)?; - let low = decode_hex_digit(pair[1]).ok_or_else(invalid_hex_error)?; + for (pair_index, pair) in (&mut iter).enumerate() { + let base = pair_index * 2; + let high = decode_hex_digit(pair[0]).ok_or_else(|| invalid_hex_error_at(base, pair[0]))?; + let low = + decode_hex_digit(pair[1]).ok_or_else(|| invalid_hex_error_at(base + 1, pair[1]))?; out.push((high << 4) | low); } let remainder = iter.remainder(); if !remainder.is_empty() { - let low = decode_hex_digit(remainder[0]).ok_or_else(invalid_hex_error)?; + let index = (bytes.len() / 2) * 2; + let low = decode_hex_digit(remainder[0]) + .ok_or_else(|| invalid_hex_error_at(index, remainder[0]))?; out.push(low); } @@ -67,9 +71,11 @@ fn decode_hex_to_writer(hex_string: &str, writer: &mut W) -> Result<() let mut buffer = [0u8; 64]; let mut buffered = 0; - for pair in &mut iter { - let high = decode_hex_digit(pair[0]).ok_or_else(invalid_hex_error)?; - let low = decode_hex_digit(pair[1]).ok_or_else(invalid_hex_error)?; + for (pair_index, pair) in (&mut iter).enumerate() { + let base = pair_index * 2; + let high = decode_hex_digit(pair[0]).ok_or_else(|| invalid_hex_error_at(base, pair[0]))?; + let low = + decode_hex_digit(pair[1]).ok_or_else(|| invalid_hex_error_at(base + 1, pair[1]))?; buffer[buffered] = (high << 4) | low; buffered += 1; @@ -83,7 +89,9 @@ fn decode_hex_to_writer(hex_string: &str, writer: &mut W) -> Result<() let remainder = iter.remainder(); if !remainder.is_empty() { - let low = decode_hex_digit(remainder[0]).ok_or_else(invalid_hex_error)?; + let index = (bytes.len() / 2) * 2; + let low = decode_hex_digit(remainder[0]) + .ok_or_else(|| invalid_hex_error_at(index, remainder[0]))?; buffer[buffered] = low; buffered += 1; } @@ -208,3 +216,66 @@ fn estimate_data_capacity(tape: &Tape<'_>, pos: &[u32]) -> Result { + assert!(msg.contains("invalid hex encoding in binary data")); + assert!(msg.contains("position 1")); + } + _ => panic!("expected JsonError"), + } + } + + #[test] + fn test_decode_hex_to_writer_empty() { + let mut out = Vec::new(); + decode_hex_to_writer("", &mut out).unwrap(); + assert!(out.is_empty()); + } + + #[test] + fn test_decode_hex_to_writer_odd_length() { + let mut out = Vec::new(); + decode_hex_to_writer("0f0", &mut out).unwrap(); + assert_eq!(out, vec![0x0f, 0x00]); + } + + #[test] + fn test_decode_hex_to_writer_invalid() { + let mut out = Vec::new(); + let err = decode_hex_to_writer("g", &mut out).unwrap_err(); + match err { + ArrowError::JsonError(msg) => { + assert!(msg.contains("invalid hex encoding in binary data")); + assert!(msg.contains("position 0")); + } + _ => panic!("expected JsonError"), + } + } +} From fe2bee8f58d74e300aef6968a50f5ec284c175a2 Mon Sep 17 00:00:00 2001 From: Weijun-H Date: Fri, 2 Jan 2026 18:37:17 +0200 Subject: [PATCH 4/7] fix: Avoid misused decode_hex_to_writer --- arrow-json/benches/reader.rs | 19 ---------------- arrow-json/src/reader/binary_array.rs | 31 +++++++++++++++++++++++---- 2 files changed, 27 insertions(+), 23 deletions(-) diff --git a/arrow-json/benches/reader.rs b/arrow-json/benches/reader.rs index 5410d951e93d..abc9bf21f8cc 100644 --- a/arrow-json/benches/reader.rs +++ b/arrow-json/benches/reader.rs @@ -23,9 +23,7 @@ use std::hint::black_box; use std::sync::Arc; const BINARY_ROWS: usize = 1 << 15; -const LIST_ROWS: usize = 1 << 14; const BINARY_BYTES: usize = 64; -const LIST_LEN: usize = 32; fn bench_decode(c: &mut Criterion, name: &str, data: &[u8], field: Arc, rows: usize) { c.bench_function(name, |b| { @@ -71,23 +69,6 @@ fn build_hex_lines(rows: usize, bytes_per_row: usize) -> Vec { data.into_bytes() } -fn build_numeric_list_lines(rows: usize, list_len: usize) -> Vec { - let mut data = String::with_capacity(rows * (list_len * 6 + 3)); - for row in 0..rows { - data.push('['); - for i in 0..list_len { - let value = (row as i64) * (list_len as i64) + i as i64; - let _ = write!(data, "{value}"); - if i + 1 != list_len { - data.push(','); - } - } - data.push(']'); - data.push('\n'); - } - data.into_bytes() -} - fn criterion_benchmark(c: &mut Criterion) { let binary_data = build_hex_lines(BINARY_ROWS, BINARY_BYTES); diff --git a/arrow-json/src/reader/binary_array.rs b/arrow-json/src/reader/binary_array.rs index db77b8ed9ef9..6f5580845af8 100644 --- a/arrow-json/src/reader/binary_array.rs +++ b/arrow-json/src/reader/binary_array.rs @@ -43,7 +43,7 @@ fn invalid_hex_error_at(index: usize, byte: u8) -> ArrowError { fn decode_hex_to_vec(hex_string: &str, out: &mut Vec) -> Result<(), ArrowError> { let bytes = hex_string.as_bytes(); - out.reserve((bytes.len() + 1) / 2); + out.reserve(bytes.len().div_ceil(2)); let mut iter = bytes.chunks_exact(2); for (pair_index, pair) in (&mut iter).enumerate() { @@ -127,6 +127,8 @@ impl ArrayDecoder for BinaryArrayDecoder { match tape.get(*p) { TapeElement::String(idx) => { let string = tape.get_string(idx); + // Decode directly into the builder for performance. If decoding fails, + // the error is terminal and the builder is discarded by the caller. decode_hex_to_writer(string, &mut builder)?; builder.append_value(b""); } @@ -205,7 +207,7 @@ fn estimate_data_capacity(tape: &Tape<'_>, pos: &[u32]) -> Result { let string_len = tape.get_string(idx).len(); // two hex characters represent one byte - let decoded_len = (string_len + 1) / 2; + let decoded_len = string_len.div_ceil(2); data_capacity += decoded_len; } TapeElement::Null => {} @@ -220,6 +222,9 @@ fn estimate_data_capacity(tape: &Tape<'_>, pos: &[u32]) -> Result { assert!(msg.contains("invalid hex encoding in binary data")); - assert!(msg.contains("position 0")); + assert!(msg.contains("position 3")); } _ => panic!("expected JsonError"), } } + + #[test] + fn test_binary_reader_invalid_hex_is_terminal() { + let field = Field::new("item", DataType::Binary, false); + let data = b"\"0f0g\"\n\"0f00\"\n"; + let mut reader = ReaderBuilder::new_with_field(field) + .build(Cursor::new(data)) + .unwrap(); + + let err = reader.next().unwrap().unwrap_err().to_string(); + assert!(err.contains("invalid hex encoding in binary data")); + + match reader.next() { + None => {} + Some(Err(_)) => {} + Some(Ok(_)) => panic!("expected terminal error after invalid hex"), + } + } } From 09e70ac7fe9bab4f4e6877af946b7ffefefbb236 Mon Sep 17 00:00:00 2001 From: Weijun-H Date: Mon, 5 Jan 2026 19:39:33 +0200 Subject: [PATCH 5/7] chore --- arrow-json/benches/reader.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/arrow-json/benches/reader.rs b/arrow-json/benches/reader.rs index abc9bf21f8cc..993382d29462 100644 --- a/arrow-json/benches/reader.rs +++ b/arrow-json/benches/reader.rs @@ -18,7 +18,6 @@ use arrow_json::ReaderBuilder; use arrow_schema::{DataType, Field}; use criterion::{Criterion, criterion_group, criterion_main}; -use std::fmt::Write; use std::hint::black_box; use std::sync::Arc; From dabfb9a353e5ef83e6a8872f108cfabe7499c25b Mon Sep 17 00:00:00 2001 From: Weijun-H Date: Thu, 8 Jan 2026 10:42:23 +0200 Subject: [PATCH 6/7] chore: remove bench --- arrow-json/benches/reader.rs | 107 ----------------------------------- 1 file changed, 107 deletions(-) delete mode 100644 arrow-json/benches/reader.rs diff --git a/arrow-json/benches/reader.rs b/arrow-json/benches/reader.rs deleted file mode 100644 index 993382d29462..000000000000 --- a/arrow-json/benches/reader.rs +++ /dev/null @@ -1,107 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -use arrow_json::ReaderBuilder; -use arrow_schema::{DataType, Field}; -use criterion::{Criterion, criterion_group, criterion_main}; -use std::hint::black_box; -use std::sync::Arc; - -const BINARY_ROWS: usize = 1 << 15; -const BINARY_BYTES: usize = 64; - -fn bench_decode(c: &mut Criterion, name: &str, data: &[u8], field: Arc, rows: usize) { - c.bench_function(name, |b| { - b.iter(|| { - let mut decoder = ReaderBuilder::new_with_field(field.clone()) - .with_batch_size(rows) - .build_decoder() - .unwrap(); - - let mut offset = 0; - while offset < data.len() { - let read = decoder.decode(black_box(&data[offset..])).unwrap(); - if read == 0 { - break; - } - offset += read; - } - - let batch = decoder.flush().unwrap(); - black_box(batch); - }) - }); -} - -#[inline] -fn append_hex_byte(buf: &mut String, byte: u8) { - const HEX: &[u8; 16] = b"0123456789abcdef"; - buf.push(HEX[(byte >> 4) as usize] as char); - buf.push(HEX[(byte & 0x0f) as usize] as char); -} - -fn build_hex_lines(rows: usize, bytes_per_row: usize) -> Vec { - let mut data = String::with_capacity(rows * (bytes_per_row * 2 + 3)); - for row in 0..rows { - data.push('"'); - for i in 0..bytes_per_row { - let byte = ((row + i) & 0xff) as u8; - append_hex_byte(&mut data, byte); - } - data.push('"'); - data.push('\n'); - } - data.into_bytes() -} - -fn criterion_benchmark(c: &mut Criterion) { - let binary_data = build_hex_lines(BINARY_ROWS, BINARY_BYTES); - - let binary_field = Arc::new(Field::new("item", DataType::Binary, false)); - bench_decode( - c, - "decode_binary_hex_json", - &binary_data, - binary_field, - BINARY_ROWS, - ); - - let fixed_field = Arc::new(Field::new( - "item", - DataType::FixedSizeBinary(BINARY_BYTES as i32), - false, - )); - bench_decode( - c, - "decode_fixed_binary_hex_json", - &binary_data, - fixed_field, - BINARY_ROWS, - ); - - let view_field = Arc::new(Field::new("item", DataType::BinaryView, false)); - bench_decode( - c, - "decode_binary_view_hex_json", - &binary_data, - view_field, - BINARY_ROWS, - ); -} - -criterion_group!(benches, criterion_benchmark); -criterion_main!(benches); From 2571386d825b1ed73bef62a4ad6d35fa0f6240ba Mon Sep 17 00:00:00 2001 From: Weijun-H Date: Thu, 8 Jan 2026 10:42:37 +0200 Subject: [PATCH 7/7] refactor: replace decode_hex_to_vec with decode_hex_to_writer for improved efficiency --- arrow-json/src/reader/binary_array.rs | 66 ++++----------------------- 1 file changed, 9 insertions(+), 57 deletions(-) diff --git a/arrow-json/src/reader/binary_array.rs b/arrow-json/src/reader/binary_array.rs index 6f5580845af8..712eb6bb4db9 100644 --- a/arrow-json/src/reader/binary_array.rs +++ b/arrow-json/src/reader/binary_array.rs @@ -41,30 +41,6 @@ fn invalid_hex_error_at(index: usize, byte: u8) -> ArrowError { )) } -fn decode_hex_to_vec(hex_string: &str, out: &mut Vec) -> Result<(), ArrowError> { - let bytes = hex_string.as_bytes(); - out.reserve(bytes.len().div_ceil(2)); - - let mut iter = bytes.chunks_exact(2); - for (pair_index, pair) in (&mut iter).enumerate() { - let base = pair_index * 2; - let high = decode_hex_digit(pair[0]).ok_or_else(|| invalid_hex_error_at(base, pair[0]))?; - let low = - decode_hex_digit(pair[1]).ok_or_else(|| invalid_hex_error_at(base + 1, pair[1]))?; - out.push((high << 4) | low); - } - - let remainder = iter.remainder(); - if !remainder.is_empty() { - let index = (bytes.len() / 2) * 2; - let low = decode_hex_digit(remainder[0]) - .ok_or_else(|| invalid_hex_error_at(index, remainder[0]))?; - out.push(low); - } - - Ok(()) -} - fn decode_hex_to_writer(hex_string: &str, writer: &mut W) -> Result<(), ArrowError> { let bytes = hex_string.as_bytes(); let mut iter = bytes.chunks_exact(2); @@ -155,6 +131,7 @@ impl FixedSizeBinaryArrayDecoder { impl ArrayDecoder for FixedSizeBinaryArrayDecoder { fn decode(&mut self, tape: &Tape<'_>, pos: &[u32]) -> Result { let mut builder = FixedSizeBinaryBuilder::with_capacity(pos.len(), self.len); + // Preallocate for the decoded byte width (FixedSizeBinary len), not the hex string length. let mut scratch = Vec::with_capacity(self.len as usize); for p in pos { @@ -162,7 +139,8 @@ impl ArrayDecoder for FixedSizeBinaryArrayDecoder { TapeElement::String(idx) => { let string = tape.get_string(idx); scratch.clear(); - decode_hex_to_vec(string, &mut scratch)?; + scratch.reserve(string.len().div_ceil(2)); + decode_hex_to_writer(string, &mut scratch)?; builder.append_value(&scratch)?; } TapeElement::Null => builder.append_null(), @@ -188,7 +166,8 @@ impl ArrayDecoder for BinaryViewDecoder { TapeElement::String(idx) => { let string = tape.get_string(idx); scratch.clear(); - decode_hex_to_vec(string, &mut scratch)?; + scratch.reserve(string.len().div_ceil(2)); + decode_hex_to_writer(string, &mut scratch)?; builder.append_value(&scratch); } TapeElement::Null => builder.append_null(), @@ -226,37 +205,6 @@ mod tests { use arrow_schema::{DataType, Field}; use std::io::Cursor; - #[test] - fn test_decode_hex_to_vec_empty() { - let mut out = Vec::new(); - decode_hex_to_vec("", &mut out).unwrap(); - assert!(out.is_empty()); - } - - #[test] - fn test_decode_hex_to_vec_odd_length() { - let mut out = Vec::new(); - decode_hex_to_vec("0f0", &mut out).unwrap(); - assert_eq!(out, vec![0x0f, 0x00]); - - out.clear(); - decode_hex_to_vec("a", &mut out).unwrap(); - assert_eq!(out, vec![0x0a]); - } - - #[test] - fn test_decode_hex_to_vec_invalid() { - let mut out = Vec::new(); - let err = decode_hex_to_vec("0g", &mut out).unwrap_err(); - match err { - ArrowError::JsonError(msg) => { - assert!(msg.contains("invalid hex encoding in binary data")); - assert!(msg.contains("position 1")); - } - _ => panic!("expected JsonError"), - } - } - #[test] fn test_decode_hex_to_writer_empty() { let mut out = Vec::new(); @@ -269,6 +217,10 @@ mod tests { let mut out = Vec::new(); decode_hex_to_writer("0f0", &mut out).unwrap(); assert_eq!(out, vec![0x0f, 0x00]); + + out.clear(); + decode_hex_to_writer("a", &mut out).unwrap(); + assert_eq!(out, vec![0x0a]); } #[test]