Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions arrow-schema/src/extension/canonical/bool8.rs
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,10 @@ impl ExtensionType for Bool8 {
/// Builds the extension type once `data_type` passes
/// `supports_data_type`; the metadata argument is not inspected.
fn try_new(data_type: &DataType, _metadata: Self::Metadata) -> Result<Self, ArrowError> {
    // Propagate the unsupported-data-type error unchanged.
    Self.supports_data_type(data_type)?;
    Ok(Self)
}

/// Checks `data_type` against `supports_data_type` and returns its result
/// directly; the metadata argument is not inspected.
fn validate(data_type: &DataType, _metadata: Self::Metadata) -> Result<(), ArrowError> {
Self.supports_data_type(data_type)
Comment thread
sdf-jkl marked this conversation as resolved.
}
}

#[cfg(test)]
Expand Down
4 changes: 4 additions & 0 deletions arrow-schema/src/extension/canonical/json.rs
Original file line number Diff line number Diff line change
Expand Up @@ -173,6 +173,10 @@ impl ExtensionType for Json {
json.supports_data_type(data_type)?;
Ok(json)
}

/// Checks `data_type` against a default-constructed instance's
/// `supports_data_type`; the metadata argument is not inspected.
fn validate(data_type: &DataType, _metadata: Self::Metadata) -> Result<(), ArrowError> {
    let json = Self::default();
    json.supports_data_type(data_type)
}
}

#[cfg(test)]
Expand Down
4 changes: 4 additions & 0 deletions arrow-schema/src/extension/canonical/opaque.rs
Original file line number Diff line number Diff line change
Expand Up @@ -257,6 +257,10 @@ impl ExtensionType for Opaque {
/// Builds the extension type directly from `metadata`; the data type is
/// not inspected and this never returns an error.
fn try_new(_data_type: &DataType, metadata: Self::Metadata) -> Result<Self, ArrowError> {
    let opaque = Self::from(metadata);
    Ok(opaque)
}

/// Always succeeds: neither the data type nor the metadata is inspected.
fn validate(_data_type: &DataType, _metadata: Self::Metadata) -> Result<(), ArrowError> {
Ok(())
}
}

#[cfg(test)]
Expand Down
4 changes: 4 additions & 0 deletions arrow-schema/src/extension/canonical/timestamp_with_offset.rs
Original file line number Diff line number Diff line change
Expand Up @@ -139,6 +139,10 @@ impl ExtensionType for TimestampWithOffset {
/// Builds the extension type once `data_type` passes
/// `supports_data_type`; the metadata argument is not inspected.
fn try_new(data_type: &DataType, _metadata: Self::Metadata) -> Result<Self, ArrowError> {
    match Self.supports_data_type(data_type) {
        Ok(()) => Ok(Self),
        Err(unsupported) => Err(unsupported),
    }
}

/// Checks `data_type` against `supports_data_type`; the metadata argument
/// is not inspected.
fn validate(data_type: &DataType, _metadata: Self::Metadata) -> Result<(), ArrowError> {
    let extension = Self;
    extension.supports_data_type(data_type)
}
}

#[cfg(test)]
Expand Down
4 changes: 4 additions & 0 deletions arrow-schema/src/extension/canonical/uuid.rs
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,10 @@ impl ExtensionType for Uuid {
/// Builds the extension type once `data_type` passes
/// `supports_data_type`; the metadata argument is not inspected.
fn try_new(data_type: &DataType, _metadata: Self::Metadata) -> Result<Self, ArrowError> {
    // Propagate the unsupported-data-type error unchanged.
    Self.supports_data_type(data_type)?;
    Ok(Self)
}

/// Checks `data_type` against `supports_data_type`; the metadata argument
/// is not inspected.
fn validate(data_type: &DataType, _metadata: Self::Metadata) -> Result<(), ArrowError> {
    let uuid = Self;
    uuid.supports_data_type(data_type)
}
}

#[cfg(test)]
Expand Down
9 changes: 9 additions & 0 deletions arrow-schema/src/extension/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -257,6 +257,15 @@ pub trait ExtensionType: Sized {
/// this extension type.
fn try_new(data_type: &DataType, metadata: Self::Metadata) -> Result<Self, ArrowError>;

/// Validate this extension type for a field with the given data type and
/// metadata.
///
/// The default implementation delegates to [`Self::try_new`]. Extension
/// types may override this to validate without constructing `Self`.
fn validate(data_type: &DataType, metadata: Self::Metadata) -> Result<(), ArrowError> {
Self::try_new(data_type, metadata).map(|_| ())
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I fear we have an API clash here:

  • supports_data_type receives &self, in order to allow configurable extension types based on their metadata
  • validate receives metadata but can only use it by instantiating Self (which requires the very allocation we wanted to avoid).

Ideally, supports_data_type should be implemented in terms of validate instead... but I guess that would be a breaking change?

The next best would be to chase down every extension type that actually has metadata, and implement the two methods in the correct direction.

Does any impl in the arrow crate actually use this provided method? Or is it just a safety net for third-party impls, to avoid a breaking change?

Copy link
Copy Markdown
Contributor Author

@sdf-jkl sdf-jkl Apr 9, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's just a safety net now. No extension type is using the default `validate` impl.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ack, thanks!

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

How about, as a follow-on, we remove the default impl (and merge that change for the next major breaking API release)? That will force every implementation to provide its own `validate` and avoid the potential slowdown.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sounds good!

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

}

/// Construct this extension type from field metadata and data type.
///
/// This is a provided method that extracts extension type information from
Expand Down
132 changes: 102 additions & 30 deletions arrow-schema/src/field.rs
Original file line number Diff line number Diff line change
Expand Up @@ -504,13 +504,39 @@ impl Field {
.map(String::as_ref)
}

/// Returns `true` if this [`Field`] has the given [`ExtensionType`] name
/// and can be successfully validated as that extension type.
///
/// This first checks the extension type name and only calls
/// [`ExtensionType::validate`] when the name matches.
///
/// This is useful when you only need a boolean validity check and do not
/// need to retrieve the extension type instance.
///
/// Any error from [`ExtensionType::deserialize_metadata`] or
/// [`ExtensionType::validate`] is discarded and reported as `false`.
#[inline]
pub fn has_valid_extension_type<E: ExtensionType>(&self) -> bool {
// Bail out on a missing or different extension name before looking at
// the extension metadata at all.
if self.extension_type_name() != Some(E::NAME) {
return false;
}

// Raw serialized extension metadata, if the key is present on the field.
let ext_metadata = self
.metadata()
.get(EXTENSION_TYPE_METADATA_KEY)
.map(|s| s.as_str());
Comment thread
scovich marked this conversation as resolved.

// Deserialize first, then validate against this field's data type; only
// success of both steps counts as valid.
E::deserialize_metadata(ext_metadata)
.and_then(|metadata| E::validate(self.data_type(), metadata))
.is_ok()
}

/// Returns an instance of the given [`ExtensionType`] of this [`Field`],
/// if set in the [`Field::metadata`].
///
/// Note that using `try_extension_type` with an extension type that does
/// not match the name in the metadata will return an `ArrowError` which can
/// be slow due to string allocations. If you only want to check if a
/// [`Field`] has a specific [`ExtensionType`], see the example below.
/// [`Field`] has a specific [`ExtensionType`], first check
/// [`Field::extension_type_name`], or use [`Field::has_valid_extension_type`]
/// to also validate metadata and data type.
///
/// # Errors
///
Expand All @@ -524,7 +550,7 @@ impl Field {
/// fail (for example when the [`Field::data_type`] is not supported by
/// the extension type ([`ExtensionType::supports_data_type`]))
///
/// # Examples: Check and retrieve an extension type
/// # Example: Check and retrieve an extension type
/// You can use this to check if a [`Field`] has a specific
/// [`ExtensionType`] and retrieve it:
/// ```
Expand All @@ -546,34 +572,6 @@ impl Field {
/// // do something with extension_type
/// }
/// ```
///
/// # Example: Checking if a field has a specific extension type first
///
/// Since `try_extension_type` returns an error, it is more
/// efficient to first check if the name matches before calling
/// `try_extension_type`:
/// ```
/// # use arrow_schema::{DataType, Field, ArrowError};
/// # use arrow_schema::extension::ExtensionType;
/// # struct MyExtensionType;
/// # impl ExtensionType for MyExtensionType {
/// # const NAME: &'static str = "my_extension";
/// # type Metadata = String;
/// # fn supports_data_type(&self, data_type: &DataType) -> Result<(), ArrowError> { Ok(()) }
/// # fn try_new(data_type: &DataType, metadata: Self::Metadata) -> Result<Self, ArrowError> { Ok(Self) }
/// # fn serialize_metadata(&self) -> Option<String> { unimplemented!() }
/// # fn deserialize_metadata(s: Option<&str>) -> Result<Self::Metadata, ArrowError> { unimplemented!() }
/// # fn metadata(&self) -> &<Self as ExtensionType>::Metadata { todo!() }
/// # }
/// # fn get_field() -> Field { Field::new("field", DataType::Null, false) }
/// let field = get_field();
/// // First check if the name matches before calling the potentially expensive `try_extension_type`
/// if field.extension_type_name() == Some(MyExtensionType::NAME) {
/// if let Ok(extension_type) = field.try_extension_type::<MyExtensionType>() {
/// // do something with extension_type
/// }
/// }
/// ```
pub fn try_extension_type<E: ExtensionType>(&self) -> Result<E, ArrowError> {
    // Hand this field's data type and metadata map to the extension type's
    // own constructor and return its result untouched.
    let (data_type, metadata) = (self.data_type(), self.metadata());
    E::try_new_from_field_metadata(data_type, metadata)
}
Expand Down Expand Up @@ -1013,6 +1011,80 @@ mod test {
use super::*;
use std::collections::hash_map::DefaultHasher;

/// Minimal metadata-free extension type used by the tests in this module.
#[derive(Debug, Clone, Copy)]
struct TestExtensionType;

impl ExtensionType for TestExtensionType {
    const NAME: &'static str = "test.extension";
    type Metadata = ();

    /// The metadata is the unit type, so a reference to `()` is returned.
    fn metadata(&self) -> &Self::Metadata {
        &()
    }

    /// Nothing is ever serialized for this type.
    fn serialize_metadata(&self) -> Option<String> {
        None
    }

    /// Accepts only absent metadata; any present string is rejected.
    fn deserialize_metadata(metadata: Option<&str>) -> Result<Self::Metadata, ArrowError> {
        match metadata {
            None => Ok(()),
            Some(_) => Err(ArrowError::InvalidArgumentError(
                "TestExtensionType expects no metadata".to_owned(),
            )),
        }
    }

    /// Every data type is accepted.
    fn supports_data_type(&self, _data_type: &DataType) -> Result<(), ArrowError> {
        Ok(())
    }

    /// Construction cannot fail; both arguments are ignored.
    fn try_new(_data_type: &DataType, _metadata: Self::Metadata) -> Result<Self, ArrowError> {
        Ok(Self)
    }
}

#[test]
fn test_has_valid_extension_type() {
    // Builds a Null-typed field named "f" carrying exactly the given
    // metadata entries.
    let field_with = |entries: &[(&str, &str)]| {
        Field::new("f", DataType::Null, false).with_metadata(
            entries
                .iter()
                .map(|&(key, value)| (key.to_owned(), value.to_owned()))
                .collect(),
        )
    };

    // No extension name at all: not a match.
    let no_extension = Field::new("f", DataType::Null, false);
    assert!(!no_extension.has_valid_extension_type::<TestExtensionType>());

    // Matching name and no extension metadata: valid.
    let matching_name = field_with(&[(EXTENSION_TYPE_NAME_KEY, TestExtensionType::NAME)]);
    assert!(matching_name.has_valid_extension_type::<TestExtensionType>());

    // Matching name but metadata the type refuses to deserialize: invalid.
    let matching_name_with_invalid_metadata = field_with(&[
        (EXTENSION_TYPE_NAME_KEY, TestExtensionType::NAME),
        (EXTENSION_TYPE_METADATA_KEY, "invalid"),
    ]);
    assert!(
        !matching_name_with_invalid_metadata.has_valid_extension_type::<TestExtensionType>()
    );

    // A different extension name: not a match.
    let different_name = field_with(&[(EXTENSION_TYPE_NAME_KEY, "some.other_extension")]);
    assert!(!different_name.has_valid_extension_type::<TestExtensionType>());
}

#[test]
fn test_new_with_string() {
// Fields should allow owned Strings to support reuse
Expand Down
8 changes: 6 additions & 2 deletions parquet-variant-compute/src/variant_array.rs
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,10 @@ impl ExtensionType for VariantType {
Self.supports_data_type(data_type)?;
Ok(Self)
}

/// Checks `data_type` against `supports_data_type`; the metadata argument
/// is not inspected.
fn validate(data_type: &DataType, _metadata: Self::Metadata) -> Result<()> {
    let variant = Self;
    variant.supports_data_type(data_type)
}
}

/// An array of Parquet [`Variant`] values
Expand Down Expand Up @@ -131,9 +135,9 @@ impl ExtensionType for VariantType {
/// let schema = get_schema();
/// assert_eq!(schema.fields().len(), 2);
/// // first field is not a Variant
/// assert!(schema.field(0).try_extension_type::<VariantType>().is_err());
/// assert!(!schema.field(0).has_valid_extension_type::<VariantType>());
/// // second field is a Variant
/// assert!(schema.field(1).try_extension_type::<VariantType>().is_ok());
/// assert!(schema.field(1).has_valid_extension_type::<VariantType>());
/// ```
///
/// # Example: Constructing the correct [`Field`] for a [`VariantArray`]
Expand Down
29 changes: 12 additions & 17 deletions parquet/src/arrow/schema/extension.rs
Original file line number Diff line number Diff line change
Expand Up @@ -110,19 +110,13 @@ pub(crate) fn has_extension_type(parquet_type: &Type) -> bool {
/// Return the Parquet logical type to use for the specified Arrow Struct field, if any.
#[cfg(feature = "variant_experimental")]
pub(crate) fn logical_type_for_struct(field: &Field) -> Option<LogicalType> {
use arrow_schema::extension::ExtensionType;
use parquet_variant_compute::VariantType;
// Check the name (= quick and cheap) and only try_extension_type if the name matches
// to avoid unnecessary String allocations in ArrowError
if field.extension_type_name()? != VariantType::NAME {
return None;
}
match field.try_extension_type::<VariantType>() {
Ok(VariantType) => Some(LogicalType::Variant {
if field.has_valid_extension_type::<VariantType>() {
Some(LogicalType::Variant {
specification_version: None,
}),
// Given check above, this should not error, but if it does ignore
Err(_e) => None,
})
} else {
None
}
}

Expand All @@ -137,9 +131,8 @@ pub(crate) fn logical_type_for_fixed_size_binary(field: &Field) -> Option<Logica
use arrow_schema::extension::Uuid;
// If set, map arrow uuid extension type to parquet uuid logical type.
field
.try_extension_type::<Uuid>()
.ok()
.map(|_| LogicalType::Uuid)
.has_valid_extension_type::<Uuid>()
.then_some(LogicalType::Uuid)
}

#[cfg(not(feature = "arrow_canonical_extension_types"))]
Expand All @@ -153,9 +146,11 @@ pub(crate) fn logical_type_for_string(field: &Field) -> Option<LogicalType> {
use arrow_schema::extension::Json;
// Use the Json logical type if the canonical Json
// extension type is set on this field.
field
.try_extension_type::<Json>()
.map_or(Some(LogicalType::String), |_| Some(LogicalType::Json))
Some(if field.has_valid_extension_type::<Json>() {
LogicalType::Json
} else {
LogicalType::String
})
}

#[cfg(not(feature = "arrow_canonical_extension_types"))]
Expand Down
8 changes: 8 additions & 0 deletions parquet/src/arrow/schema/virtual_type.rs
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,10 @@ impl ExtensionType for RowGroupIndex {
/// Builds the extension type once `data_type` passes
/// `supports_data_type`; the metadata argument is not inspected.
fn try_new(data_type: &DataType, _metadata: Self::Metadata) -> Result<Self, ArrowError> {
    // Propagate the unsupported-data-type error unchanged.
    Self.supports_data_type(data_type)?;
    Ok(Self)
}

/// Checks `data_type` against `supports_data_type`; the metadata argument
/// is not inspected.
fn validate(data_type: &DataType, _metadata: Self::Metadata) -> Result<(), ArrowError> {
    let row_group_index = Self;
    row_group_index.supports_data_type(data_type)
}
}

/// The extension type for row numbers.
Expand Down Expand Up @@ -113,6 +117,10 @@ impl ExtensionType for RowNumber {
/// Builds the extension type once `data_type` passes
/// `supports_data_type`; the metadata argument is not inspected.
fn try_new(data_type: &DataType, _metadata: Self::Metadata) -> Result<Self, ArrowError> {
    match Self.supports_data_type(data_type) {
        Ok(()) => Ok(Self),
        Err(unsupported) => Err(unsupported),
    }
}

/// Checks `data_type` against `supports_data_type`; the metadata argument
/// is not inspected.
fn validate(data_type: &DataType, _metadata: Self::Metadata) -> Result<(), ArrowError> {
    let row_number = Self;
    row_number.supports_data_type(data_type)
}
}

/// Returns `true` if the field is a virtual column.
Expand Down
6 changes: 2 additions & 4 deletions parquet/src/variant.rs
Original file line number Diff line number Diff line change
Expand Up @@ -123,7 +123,7 @@
//! // the VariantType extension type
//! let schema = reader.schema();
//! let field = schema.field_with_name("var")?;
//! assert!(field.try_extension_type::<VariantType>().is_ok());
//! assert!(field.has_valid_extension_type::<VariantType>());
//!
//! // The reader will yield RecordBatches with a StructArray
//! // to convert them to VariantArray, use VariantArray::try_new
Expand Down Expand Up @@ -285,9 +285,7 @@ mod tests {
assert_eq!(metadata_value, "arrow.parquet.variant");

// verify that `VariantType` also correctly finds the metadata
field
.try_extension_type::<VariantType>()
.expect("VariantExtensionType should be readable");
assert!(field.has_valid_extension_type::<VariantType>());
}

/// Read the specified test case filename from parquet-testing
Expand Down
Loading