Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
69 changes: 65 additions & 4 deletions avro/src/serde/de.rs
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ use std::{

/// Serde deserializer that reads from a borrowed `Value`.
pub struct Deserializer<'de> {
/// The value this deserializer reads from.
input: &'de Value,
/// `false` when built with `new`, `true` when built with `new_deserializing_some`.
/// Enum deserialization consults this flag when the input is a `Value::Union`:
/// it first tries to match a unit enum symbol, and otherwise shifts the union
/// index down by one on the assumption that `null` is the first union branch.
deserializing_some: bool,
}

struct SeqDeserializer<'de> {
Expand Down Expand Up @@ -75,7 +76,17 @@ struct UnionDeserializer<'de> {

impl<'de> Deserializer<'de> {
pub fn new(input: &'de Value) -> Self {
Deserializer { input }
Deserializer {
input,
deserializing_some: false,
}
}

/// Creates a deserializer over `input` with the `deserializing_some` flag set.
///
/// Apart from the flag this is identical to `new`.
pub fn new_deserializing_some(input: &'de Value) -> Self {
    // Bind the flag to a local so field-init shorthand can be used below.
    let deserializing_some = true;
    Deserializer {
        input,
        deserializing_some,
    }
}
}

Expand Down Expand Up @@ -616,6 +627,21 @@ impl<'de> de::Deserializer<'de> for Deserializer<'de> {
let d_bytes: [u8; 12] = d.into();
visitor.visit_bytes(&d_bytes[..])
}
Value::Union(i, x) => {
if matches!(x.deref(), Value::Union(_, _)) {
Err(de::Error::custom(format!(
"Directly nested union types are not supported. Got Value::Union({i}, {x:?})"
)))
} else {
Self::new(x.deref())
.deserialize_bytes(visitor)
.map_err(|e| {
de::Error::custom(format!(
"Attempted to deserialize Value::Union({i}, {x:?}) as bytes: {e:?}"
))
})
}
}
_ => Err(de::Error::custom(format!(
"Expected a String|Bytes|Fixed|Uuid|Decimal|Duration, but got {:?}",
self.input
Expand All @@ -638,6 +664,21 @@ impl<'de> de::Deserializer<'de> for Deserializer<'de> {
let d_bytes: [u8; 12] = d.into();
visitor.visit_byte_buf(Vec::from(d_bytes))
}
Value::Union(i, x) => {
if matches!(x.deref(), Value::Union(_, _)) {
Err(de::Error::custom(format!(
"Directly nested union types are not supported. Got Value::Union({i}, {x:?})"
)))
} else {
Self::new(x.deref())
.deserialize_byte_buf(visitor)
.map_err(|e| {
de::Error::custom(format!(
"Attempted to deserialize Value::Union({i}, {x:?}) as bytes: {e:?}"
))
})
Comment on lines +675 to +679
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

The error message here seems to be a copy-paste from deserialize_bytes. It says "as bytes", but this is inside deserialize_byte_buf. This could be confusing when debugging. It should probably be updated to "as byte_buf".

Suggested change
.map_err(|e| {
de::Error::custom(format!(
"Attempted to deserialize Value::Union({i}, {x:?}) as bytes: {e:?}"
))
})
.map_err(|e| {
de::Error::custom(format!(
"Attempted to deserialize Value::Union({i}, {x:?}) as byte_buf: {e:?}"
))
})

}
}
_ => Err(de::Error::custom(format!(
"Expected a String|Bytes|Fixed|Uuid|Decimal|Duration, but got {:?}",
self.input
Expand All @@ -651,7 +692,9 @@ impl<'de> de::Deserializer<'de> for Deserializer<'de> {
{
match self.input {
Value::Union(_i, inner) if inner.as_ref() == &Value::Null => visitor.visit_none(),
Value::Union(_i, inner) => visitor.visit_some(Deserializer::new(inner)),
Value::Union(_i, _inner) => {
visitor.visit_some(Deserializer::new_deserializing_some(self.input))
}
_ => Err(de::Error::custom(format!(
"Expected a Union, but got {:?}",
self.input
Expand Down Expand Up @@ -753,6 +796,14 @@ impl<'de> de::Deserializer<'de> for Deserializer<'de> {
match self.input {
Value::Map(items) => visitor.visit_map(MapDeserializer::new(items)),
Value::Record(fields) => visitor.visit_map(RecordDeserializer::new(fields)),
Value::Union(_i, inner) => match inner.deref() {
Value::Map(items) => visitor.visit_map(MapDeserializer::new(items)),
Value::Record(fields) => visitor.visit_map(RecordDeserializer::new(fields)),
Value::Null => visitor.visit_map(RecordDeserializer::new(&[])),
_ => Err(de::Error::custom(format!(
"Expected a Map, Record or Null, but got: {inner:?}"
))),
},
_ => Err(de::Error::custom(format_args!(
"Expected a record or a map. Got: {:?}",
&self.input
Expand Down Expand Up @@ -799,9 +850,19 @@ impl<'de> de::Deserializer<'de> for Deserializer<'de> {
Value::Record(fields) => visitor.visit_enum(EnumDeserializer::new(fields)),
Value::String(field) => visitor.visit_enum(EnumUnitDeserializer::new(field)),
Value::Union(idx, inner) => {
if (*idx as usize) < variants.len() {
// if we came here from a some, we need to check if we are deserializing a
// non-newtype enum
if self.deserializing_some
&& let Value::Enum(_index, field) = inner.deref()
&& variants.contains(&&**field)
{
return visitor.visit_enum(EnumUnitDeserializer::new(field));
}
// Assume `null` is the first branch if deserializing some so decrement the variant index
let variant_idx = *idx as usize - usize::from(self.deserializing_some);
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

security-medium medium

The code performs an unchecked subtraction *idx as usize - usize::from(self.deserializing_some) on line 862. When self.deserializing_some is true (which happens when deserializing an Option<T> where the value is not null) and the Avro union index *idx is 0, this will cause a panic due to integer underflow in debug builds. In release builds, it will wrap to usize::MAX, which will then fail the bounds check on line 863 and return an error.

This occurs when the Avro schema for a nullable union has the non-null variant at index 0 (e.g., ["int", "null"]), which is perfectly valid according to the Avro specification. An attacker providing a crafted Avro file with such a schema can cause a Denial of Service (DoS) by crashing the application (in debug mode) or causing unexpected deserialization failures.

Furthermore, the logic hardcodes the assumption that null is always the first variant in a nullable union. If null is at a different position (e.g., index 1 in ["int", "null", "string"]), the subtraction will result in the wrong variant being selected, leading to data corruption.

Copy link
Copy Markdown
Owner Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

value:useful; category:bug; feedback: The Gemini AI reviewer is correct! The calculation of the branch_index may lead to usize underflow when the null variant is not the first in the union. The Avro specification recommends listing null first, but it also allows null in a non-first position. This prevents an application crash in production due to an invalid variant index. The calculation should be done with checked operations, and an Err should be returned instead, with a recommendation to put null in the first position.

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

let variant_idx = *idx as usize - usize::from(self.deserializing_some); can underflow when deserializing_some is true and idx == 0 (e.g., nullable unions where null isn’t the first branch), which can panic in debug or produce a wrapped index in release. Consider guarding this case and returning a clear error instead of subtracting unconditionally.

Severity: high

Fix This in Augment

🤖 Was this useful? React with 👍 or 👎, or 🚀 if it prevented an incident/outage.

Copy link
Copy Markdown
Owner Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

value:useful; category:bug; feedback: The Augment AI reviewer is correct! The calculation of the branch_index may lead to usize underflow when the null variant is not the first in the union. The Avro specification recommends listing null first, but it also allows null in a non-first position. This prevents an application crash in production due to an invalid variant index. The calculation should be done with checked operations, and an Err should be returned instead, with a recommendation to put null in the first position.

if (variant_idx) < variants.len() {
visitor.visit_enum(UnionDeserializer::new(
variants[*idx as usize],
variants[variant_idx],
inner.as_ref(),
))
} else {
Expand Down
117 changes: 80 additions & 37 deletions avro/src/serde/ser_schema.rs
Original file line number Diff line number Diff line change
Expand Up @@ -606,6 +606,7 @@ pub struct SchemaAwareWriteSerializer<'s, W: Write> {
root_schema: &'s Schema,
names: &'s NamesRef<'s>,
enclosing_namespace: Namespace,
serializing_some: bool,
}

impl<'s, W: Write> SchemaAwareWriteSerializer<'s, W> {
Expand All @@ -629,6 +630,22 @@ impl<'s, W: Write> SchemaAwareWriteSerializer<'s, W> {
root_schema: schema,
names,
enclosing_namespace,
serializing_some: false,
}
}

/// Creates a serializer that writes to `writer` against `schema`, with the
/// `serializing_some` flag set.
///
/// `schema` is stored as the serializer's `root_schema`; `names` and
/// `enclosing_namespace` are stored unchanged.
pub fn new_serializing_some(
    writer: &'s mut W,
    schema: &'s Schema,
    names: &'s NamesRef<'s>,
    enclosing_namespace: Namespace,
) -> SchemaAwareWriteSerializer<'s, W> {
    // Bind locals under the field names so every field can use init shorthand.
    let root_schema = schema;
    let serializing_some = true;
    SchemaAwareWriteSerializer {
        writer,
        root_schema,
        names,
        enclosing_namespace,
        serializing_some,
    }
}

Expand Down Expand Up @@ -1374,7 +1391,7 @@ impl<'s, W: Write> SchemaAwareWriteSerializer<'s, W> {
where
T: ?Sized + ser::Serialize,
{
let mut inner_ser = SchemaAwareWriteSerializer::new(
let mut inner_ser = SchemaAwareWriteSerializer::new_serializing_some(
&mut *self.writer,
schema,
self.names,
Expand Down Expand Up @@ -1460,19 +1477,37 @@ impl<'s, W: Write> SchemaAwareWriteSerializer<'s, W> {
encode_int(variant_index as i32, &mut self.writer)
}
Schema::Union(union_schema) => {
if variant_index as usize >= union_schema.schemas.len() {
// If we came here from a some, we need to check if we are serializing a
// non-newtype enum
if self.serializing_some {
for (i, variant_schema) in union_schema.schemas.iter().enumerate() {
match variant_schema {
Schema::Enum(enum_schema) if enum_schema.name.name == name => {
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In the serializing_some path, the union scan only matches Schema::Enum, so unions that reference the enum via Schema::Ref (e.g. ["null", "MyEnum"]) will fall through and then error via serialize_unit_struct_with_schema (since it doesn’t support Enum). Consider resolving Schema::Ref variants while scanning so Option<MyEnum> can serialize against referenced-enum union branches.

Severity: high

Fix This in Augment

🤖 Was this useful? React with 👍 or 👎, or 🚀 if it prevented an incident/outage.

Copy link
Copy Markdown
Owner Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

value:useful; category:bug; feedback: The Augment AI reviewer is correct! The new logic should also add support for Schema::Ref because the reference may point to a Schema::Enum. This prevents an error while resolving the variant if it is a reference to an enum schema.

if variant_index as usize >= enum_schema.symbols.len() {
return Err(create_error(format!(
"Variant index out of bounds: {}. The Enum schema has '{}' symbols",
variant_index,
enum_schema.symbols.len()
)));
}
encode_int(i as i32, &mut self.writer)?;
return encode_int(variant_index as i32, &mut self.writer);
}
_ => { /* skip */ }
}
}
}
let branch_index = variant_index as usize + usize::from(self.serializing_some);
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

security-medium medium

The serialization logic hardcodes the assumption that the null variant is always the first branch in a nullable union by adding usize::from(self.serializing_some) to the variant index (e.g., on line 1500). If the Avro schema has null at a different position or if the union does not contain null at all, this will result in serializing the wrong variant or an out-of-bounds error. This can lead to data corruption when serializing enums wrapped in Option if the schema does not follow the expected ["null", ...] pattern.

Copy link
Copy Markdown
Owner Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

value:useful; category:bug; feedback: The Gemini AI reviewer is correct! The calculation of the branch_index may lead to usize underflow when the null variant is not the first in the union. The Avro specification recommends listing null first, but it also allows null in a non-first position. This prevents an application crash in production due to an invalid variant index. The calculation should be done with checked operations, and an Err should be returned instead, with a recommendation to put null in the first position.

if branch_index >= union_schema.schemas.len() {
return Err(create_error(format!(
"Variant index out of bounds: {}. The union schema has '{}' schemas",
variant_index,
union_schema.schemas.len()
)));
}

encode_int(variant_index as i32, &mut self.writer)?;
self.serialize_unit_struct_with_schema(
name,
&union_schema.schemas[variant_index as usize],
)
encode_int(branch_index as i32, &mut self.writer)?;
self.serialize_unit_struct_with_schema(name, &union_schema.schemas[branch_index])
}
Schema::Ref { name: ref_name } => {
let ref_schema = self.get_ref_schema(ref_name)?;
Expand Down Expand Up @@ -1524,17 +1559,21 @@ impl<'s, W: Write> SchemaAwareWriteSerializer<'s, W> {

match schema {
Schema::Union(union_schema) => {
let variant_schema = union_schema
.schemas
.get(variant_index as usize)
.ok_or_else(|| {
create_error(format!(
"No variant schema at position {variant_index} for {union_schema:?}"
))
})?;
let branch_index = variant_index as usize + usize::from(self.serializing_some);
if branch_index >= union_schema.schemas.len() {
return Err(create_error(format!(
"Variant index out of bounds: {}. The union schema has '{}' schemas",
variant_index,
union_schema.schemas.len()
)));
}
Comment on lines +1563 to +1569
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

In serialize_newtype_variant_with_schema, the error message for an out-of-bounds index uses variant_index, but the check is performed on branch_index. This can be misleading when serializing_some is true, as branch_index would be variant_index + 1. The error message should use branch_index to be accurate. Using "Branch index" instead of "Variant index" in the message would also improve clarity.

Suggested change
if branch_index >= union_schema.schemas.len() {
return Err(create_error(format!(
"Variant index out of bounds: {}. The union schema has '{}' schemas",
variant_index,
union_schema.schemas.len()
)));
}
if branch_index >= union_schema.schemas.len() {
return Err(create_error(format!(
"Branch index out of bounds: {}. The union schema has '{}' schemas",
branch_index,
union_schema.schemas.len()
)));
}


encode_int(variant_index as i32, &mut self.writer)?;
self.serialize_newtype_struct_with_schema(variant, value, variant_schema)
encode_int(branch_index as i32, &mut self.writer)?;
self.serialize_newtype_struct_with_schema(
name,
value,
&union_schema.schemas[branch_index],
)
}
_ => Err(create_error(format!(
"Expected Union schema. Got: {schema}"
Expand Down Expand Up @@ -1703,17 +1742,21 @@ impl<'s, W: Write> SchemaAwareWriteSerializer<'s, W> {

match schema {
Schema::Union(union_schema) => {
let variant_schema = union_schema
.schemas
.get(variant_index as usize)
.ok_or_else(|| {
create_error(format!(
"Cannot find a variant at position {variant_index} in {union_schema:?}"
))
})?;
let branch_index = variant_index as usize + usize::from(self.serializing_some);
if branch_index >= union_schema.schemas.len() {
return Err(create_error(format!(
"Variant index out of bounds: {}. The union schema has '{}' schemas",
variant_index,
union_schema.schemas.len()
)));
}
Comment on lines +1746 to +1752
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

In serialize_tuple_variant_with_schema, the error message for an out-of-bounds index uses variant_index, but the check is performed on branch_index. This can be misleading when serializing_some is true, as branch_index would be variant_index + 1. The error message should use branch_index to be accurate. Using "Branch index" instead of "Variant index" in the message would also improve clarity.

Suggested change
if branch_index >= union_schema.schemas.len() {
return Err(create_error(format!(
"Variant index out of bounds: {}. The union schema has '{}' schemas",
variant_index,
union_schema.schemas.len()
)));
}
if branch_index >= union_schema.schemas.len() {
return Err(create_error(format!(
"Branch index out of bounds: {}. The union schema has '{}' schemas",
branch_index,
union_schema.schemas.len()
)));
}


encode_int(variant_index as i32, &mut self.writer)?;
self.serialize_tuple_struct_with_schema(variant, len, variant_schema)
encode_int(branch_index as i32, &mut self.writer)?;
self.serialize_tuple_struct_with_schema(
name,
len,
&union_schema.schemas[branch_index],
)
}
_ => Err(create_error(format!(
"Expected Union schema. Got: {schema}"
Expand Down Expand Up @@ -1835,17 +1878,17 @@ impl<'s, W: Write> SchemaAwareWriteSerializer<'s, W> {

match schema {
Schema::Union(union_schema) => {
let variant_schema = union_schema
.schemas
.get(variant_index as usize)
.ok_or_else(|| {
create_error(format!(
"Cannot find variant at position {variant_index} in {union_schema:?}"
))
})?;
let branch_index = variant_index as usize + usize::from(self.serializing_some);
if branch_index >= union_schema.schemas.len() {
return Err(create_error(format!(
"Variant index out of bounds: {}. The union schema has '{}' schemas",
variant_index,
union_schema.schemas.len()
)));
}
Comment on lines +1882 to +1888
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

In serialize_struct_variant_with_schema, the error message for an out-of-bounds index uses variant_index, but the check is performed on branch_index. This can be misleading when serializing_some is true, as branch_index would be variant_index + 1. The error message should use branch_index to be accurate. Using "Branch index" instead of "Variant index" in the message would also improve clarity.

Suggested change
if branch_index >= union_schema.schemas.len() {
return Err(create_error(format!(
"Variant index out of bounds: {}. The union schema has '{}' schemas",
variant_index,
union_schema.schemas.len()
)));
}
if branch_index >= union_schema.schemas.len() {
return Err(create_error(format!(
"Branch index out of bounds: {}. The union schema has '{}' schemas",
branch_index,
union_schema.schemas.len()
)));
}


encode_int(variant_index as i32, &mut self.writer)?;
self.serialize_struct_with_schema(variant, len, variant_schema)
encode_int(branch_index as i32, &mut self.writer)?;
self.serialize_struct_with_schema(name, len, &union_schema.schemas[branch_index])
}
_ => Err(create_error(format!(
"Expected Union schema. Got: {schema}"
Expand Down
Loading