Skip to content
Open
11 changes: 10 additions & 1 deletion arrow-json/benches/json-reader.rs
Original file line number Diff line number Diff line change
Expand Up @@ -179,7 +179,13 @@ fn bench_binary_hex(c: &mut Criterion) {
bench_decode_binary(c, "decode_binary_view_hex_json", &binary_data, view_field);
}

fn bench_decode_schema(c: &mut Criterion, name: &str, data: &[u8], schema: Arc<Schema>) {
fn bench_decode_schema(
c: &mut Criterion,
name: &str,
data: &[u8],
schema: Arc<Schema>,
projection: bool,
) {
let mut group = c.benchmark_group(name);
group.throughput(Throughput::Bytes(data.len() as u64));
group.sample_size(50);
Expand All @@ -190,6 +196,7 @@ fn bench_decode_schema(c: &mut Criterion, name: &str, data: &[u8], schema: Arc<S
b.iter(|| {
let mut decoder = ReaderBuilder::new(schema.clone())
.with_batch_size(BATCH_SIZE)
.with_projection(projection)
.build_decoder()
.unwrap();
decode_and_flush(&mut decoder, data);
Expand Down Expand Up @@ -228,6 +235,7 @@ fn bench_wide_projection(c: &mut Criterion) {
"decode_wide_projection_full_json",
&wide_projection_data,
full_schema,
false,
);

// Projected schema: only 3 fields (f0, f10, f50) out of 100
Expand All @@ -237,6 +245,7 @@ fn bench_wide_projection(c: &mut Criterion) {
"decode_wide_projection_narrow_json",
&wide_projection_data,
projected_schema,
true,
);
}

Expand Down
65 changes: 62 additions & 3 deletions arrow-json/src/reader/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -137,6 +137,7 @@ use crate::StructMode;
use crate::reader::binary_array::{
BinaryArrayDecoder, BinaryViewDecoder, FixedSizeBinaryArrayDecoder,
};
use std::collections::HashSet;
use std::io::BufRead;
use std::sync::Arc;

Expand Down Expand Up @@ -182,6 +183,7 @@ pub struct ReaderBuilder {
batch_size: usize,
coerce_primitive: bool,
strict_mode: bool,
projection: bool,
is_field: bool,
struct_mode: StructMode,

Expand All @@ -202,6 +204,7 @@ impl ReaderBuilder {
batch_size: 1024,
coerce_primitive: false,
strict_mode: false,
projection: false,
is_field: false,
struct_mode: Default::default(),
schema,
Expand Down Expand Up @@ -243,6 +246,7 @@ impl ReaderBuilder {
batch_size: 1024,
coerce_primitive: false,
strict_mode: false,
projection: false,
is_field: true,
struct_mode: Default::default(),
schema: Arc::new(Schema::new([field.into()])),
Expand Down Expand Up @@ -275,6 +279,12 @@ impl ReaderBuilder {
}
}

/// Enables projection-aware parsing to skip fields not present in the schema.
/// This is ignored when `strict_mode` is true, which always checks projection.
pub fn with_projection(self, projection: bool) -> Self {
Self { projection, ..self }
}

/// Set the [`StructMode`] for the reader, which determines whether structs
/// can be decoded from JSON as objects or lists. For more details refer to
/// the enum documentation. Default is to use `ObjectOnly`.
Expand Down Expand Up @@ -303,6 +313,19 @@ impl ReaderBuilder {
}
};

let num_fields = self.schema.flattened_fields().len();

// Extract projection field set from schema for projection-aware parsing
// - strict_mode: fail-fast on unknown fields during tape parsing
// - projection: skip JSON fields not present in the schema
let enable_projection = self.strict_mode || self.projection;
let projection: Option<HashSet<String>> = match &data_type {
DataType::Struct(fields) if enable_projection && !fields.is_empty() => {
Some(fields.iter().map(|f| f.name().clone()).collect())
}
_ => None,
};

let decoder = make_decoder(
data_type,
self.coerce_primitive,
Expand All @@ -311,12 +334,15 @@ impl ReaderBuilder {
self.struct_mode,
)?;

let num_fields = self.schema.flattened_fields().len();

Ok(Decoder {
decoder,
is_field: self.is_field,
tape_decoder: TapeDecoder::new(self.batch_size, num_fields),
tape_decoder: TapeDecoder::new(
self.batch_size,
num_fields,
projection,
self.strict_mode,
),
batch_size: self.batch_size,
schema: self.schema,
})
Expand Down Expand Up @@ -1783,6 +1809,39 @@ mod tests {
);
}

#[test]
fn test_projection_skip_unknown_fields() {
// JSON has fields a, b, c but schema only has a, c
let buf = r#"
{"a": 1, "b": "ignored", "c": true}
{"a": 2, "b": "also ignored", "c": false}
"#;

let schema = Arc::new(Schema::new(vec![
Field::new("a", DataType::Int32, true),
Field::new("c", DataType::Boolean, true),
]));

// with_projection(true): skip unknown field "b" and succeed
let batch = ReaderBuilder::new(schema)
.with_projection(true)
.build(Cursor::new(buf.as_bytes()))
.unwrap()
.read()
.unwrap()
.unwrap();

assert_eq!(batch.num_rows(), 2);
assert_eq!(batch.num_columns(), 2);

let a = batch.column(0).as_primitive::<Int32Type>();
assert_eq!(a.values(), &[1, 2]);

let c = batch.column(1).as_boolean();
assert!(c.value(0));
assert!(!c.value(1));
}

fn read_file(path: &str, schema: Option<Schema>) -> Reader<BufReader<File>> {
let file = File::open(path).unwrap();
let mut reader = BufReader::new(file);
Expand Down
Loading
Loading