-
Notifications
You must be signed in to change notification settings - Fork 0
20222: Provide session to the udtf call #296
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,128 @@ | ||
| // Licensed to the Apache Software Foundation (ASF) under one | ||
| // or more contributor license agreements. See the NOTICE file | ||
| // distributed with this work for additional information | ||
| // regarding copyright ownership. The ASF licenses this file | ||
| // to you under the Apache License, Version 2.0 (the | ||
| // "License"); you may not use this file except in compliance | ||
| // with the License. You may obtain a copy of the License at | ||
| // | ||
| // http://www.apache.org/licenses/LICENSE-2.0 | ||
| // | ||
| // Unless required by applicable law or agreed to in writing, | ||
| // software distributed under the License is distributed on an | ||
| // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | ||
| // KIND, either express or implied. See the License for the | ||
| // specific language governing permissions and limitations | ||
| // under the License. | ||
|
|
||
| //! See `main.rs` for how to run it. | ||
|
|
||
| use std::sync::{Arc, LazyLock}; | ||
|
|
||
| use arrow::array::{RecordBatch, StringBuilder}; | ||
| use arrow_schema::{DataType, Field, Schema, SchemaRef}; | ||
| use datafusion::{ | ||
| catalog::{MemTable, TableFunctionArgs, TableFunctionImpl, TableProvider}, | ||
| common::Result, | ||
| execution::SessionState, | ||
| prelude::SessionContext, | ||
| }; | ||
| use datafusion_common::{DataFusionError, plan_err}; | ||
| use tokio::{runtime::Handle, task::block_in_place}; | ||
|
|
||
| const FUNCTION_NAME: &str = "table_list"; | ||
|
|
||
| // This example shows how to create a UDTF that depends on the session state. | ||
| // It defines a `table_list` UDTF that lists the tables within the provided session; the UDTF blocks on async catalog lookups via `block_in_place`, so run it on a multi-threaded Tokio runtime. | ||
|
|
||
| pub async fn table_list_udtf() -> Result<()> { | ||
| let ctx = SessionContext::new(); | ||
| ctx.register_udtf(FUNCTION_NAME, Arc::new(TableListUdtf)); | ||
|
|
||
| // Register a view and a table so that the listing contains different kinds of tables. | ||
| ctx.sql("create view v as select 1") | ||
| .await? | ||
| .collect() | ||
| .await?; | ||
| ctx.sql("create table t(a int)").await?.collect().await?; | ||
|
|
||
| // Query the UDTF and print the resulting list of tables. | ||
| ctx.sql("select * from table_list()").await?.show().await?; | ||
|
|
||
| Ok(()) | ||
| } | ||
|
|
||
| #[derive(Debug, Default)] | ||
| struct TableListUdtf; | ||
|
|
||
| static SCHEMA: LazyLock<SchemaRef> = LazyLock::new(|| { | ||
| SchemaRef::new(Schema::new(vec![ | ||
| Field::new("catalog", DataType::Utf8, false), | ||
| Field::new("schema", DataType::Utf8, false), | ||
| Field::new("table", DataType::Utf8, false), | ||
| Field::new("type", DataType::Utf8, false), | ||
| ])) | ||
| }); | ||
|
|
||
| impl TableFunctionImpl for TableListUdtf { | ||
| fn call_with_args(&self, args: TableFunctionArgs) -> Result<Arc<dyn TableProvider>> { | ||
| if !args.args.is_empty() { | ||
| return plan_err!( | ||
| "{}: unexpected number of arguments: {}, expected: 0", | ||
| FUNCTION_NAME, | ||
| args.args.len() | ||
| ); | ||
| } | ||
| let state = args | ||
| .session | ||
| .as_any() | ||
| .downcast_ref::<SessionState>() | ||
| .ok_or_else(|| { | ||
| DataFusionError::Internal("failed to downcast state".into()) | ||
| })?; | ||
|
|
||
| let mut catalogs = StringBuilder::new(); | ||
| let mut schemas = StringBuilder::new(); | ||
| let mut tables = StringBuilder::new(); | ||
| let mut types = StringBuilder::new(); | ||
|
|
||
| let catalog_list = state.catalog_list(); | ||
| for catalog_name in catalog_list.catalog_names() { | ||
| let Some(catalog) = catalog_list.catalog(&catalog_name) else { | ||
| continue; | ||
| }; | ||
| for schema_name in catalog.schema_names() { | ||
| let Some(schema) = catalog.schema(&schema_name) else { | ||
| continue; | ||
| }; | ||
| for table_name in schema.table_names() { | ||
| let Some(provider) = block_in_place(|| { | ||
| Handle::current().block_on(schema.table(&table_name)) | ||
| })? | ||
| else { | ||
| continue; | ||
| }; | ||
| catalogs.append_value(catalog_name.clone()); | ||
| schemas.append_value(schema_name.clone()); | ||
| tables.append_value(table_name.clone()); | ||
| types.append_value(provider.table_type().to_string()) | ||
| } | ||
| } | ||
| } | ||
|
Comment on lines
+90
to
+111
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The current implementation iterates through catalogs, schemas, and tables, and for each table, it blocks to get the table provider. Using To improve performance, you could gather all the asynchronous let mut table_futures = vec![];
for catalog_name in catalog_list.catalog_names() {
if let Some(catalog) = catalog_list.catalog(&catalog_name) {
for schema_name in catalog.schema_names() {
if let Some(schema) = catalog.schema(&schema_name) {
for table_name in schema.table_names() {
let schema_clone = Arc::clone(&schema);
let catalog_name_clone = catalog_name.clone();
let schema_name_clone = schema_name.clone();
let table_name_clone = table_name.clone();
table_futures.push(async move {
schema_clone.table(&table_name_clone).await.map(|provider| {
(catalog_name_clone, schema_name_clone, table_name_clone, provider)
})
});
}
}
}
}
}
let results = block_in_place(|| {
Handle::current().block_on(futures::future::try_join_all(table_futures))
})?;
for (catalog_name, schema_name, table_name, provider) in results {
if let Some(provider) = provider {
catalogs.append_value(catalog_name);
schemas.append_value(schema_name);
tables.append_value(table_name);
types.append_value(provider.table_type().to_string())
}
}
Owner
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. value:good-to-have; category:bug; feedback: The Gemini AI reviewer is correct! The current solution uses Tokio's |
||
|
|
||
| let batch = RecordBatch::try_new( | ||
| Arc::clone(&SCHEMA), | ||
| vec![ | ||
| Arc::new(catalogs.finish()), | ||
| Arc::new(schemas.finish()), | ||
| Arc::new(tables.finish()), | ||
| Arc::new(types.finish()), | ||
| ], | ||
| )?; | ||
|
|
||
| Ok(Arc::new(MemTable::try_new( | ||
| batch.schema(), | ||
| vec![vec![batch]], | ||
| )?)) | ||
| } | ||
| } | ||
| Original file line number | Diff line number | Diff line change | ||||||||||||||||||||||||||||||||||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
|
|
@@ -23,8 +23,8 @@ use std::sync::Arc; | |||||||||||||||||||||||||||||||||||||||||||
| use crate::session::Session; | ||||||||||||||||||||||||||||||||||||||||||||
| use arrow::datatypes::SchemaRef; | ||||||||||||||||||||||||||||||||||||||||||||
| use async_trait::async_trait; | ||||||||||||||||||||||||||||||||||||||||||||
| use datafusion_common::Result; | ||||||||||||||||||||||||||||||||||||||||||||
| use datafusion_common::{Constraints, Statistics, not_impl_err}; | ||||||||||||||||||||||||||||||||||||||||||||
| use datafusion_common::{Result, internal_err}; | ||||||||||||||||||||||||||||||||||||||||||||
| use datafusion_expr::Expr; | ||||||||||||||||||||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||||||||||||||||||||
| use datafusion_expr::dml::InsertOp; | ||||||||||||||||||||||||||||||||||||||||||||
|
|
@@ -507,10 +507,30 @@ pub trait TableProviderFactory: Debug + Sync + Send { | |||||||||||||||||||||||||||||||||||||||||||
| ) -> Result<Arc<dyn TableProvider>>; | ||||||||||||||||||||||||||||||||||||||||||||
| } | ||||||||||||||||||||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||||||||||||||||||||
| /// Describes arguments provided to the table function call. | ||||||||||||||||||||||||||||||||||||||||||||
| pub struct TableFunctionArgs<'a> { | ||||||||||||||||||||||||||||||||||||||||||||
| /// Call arguments. | ||||||||||||||||||||||||||||||||||||||||||||
| pub args: &'a [Expr], | ||||||||||||||||||||||||||||||||||||||||||||
| /// Session within which the function is called. | ||||||||||||||||||||||||||||||||||||||||||||
| pub session: &'a dyn Session, | ||||||||||||||||||||||||||||||||||||||||||||
| } | ||||||||||||||||||||||||||||||||||||||||||||
|
Comment on lines
+510
to
+516
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 🛠️ Refactor suggestion | 🟠 Major Future-proof TableFunctionArgs before this API ships. This type is already constructed cross-crate from Suggested API hardening+#[non_exhaustive]
/// Describes arguments provided to the table function call.
pub struct TableFunctionArgs<'a> {
/// Call arguments.
pub args: &'a [Expr],
/// Session within which the function is called.
pub session: &'a dyn Session,
}
+
+impl<'a> TableFunctionArgs<'a> {
+ pub fn new(args: &'a [Expr], session: &'a dyn Session) -> Self {
+ Self { args, session }
+ }
+}Then switch external construction sites to 📝 Committable suggestion
Suggested change
🤖 Prompt for AI Agents
Owner
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. value:good-to-have; category:bug; feedback: The CodeRabbit AI reviewer is correct! The new struct is fully public (it is public itself and all its fields are public too). This makes it too open for the external users and thus less extendable in the future. It would be better to hide the fields, add a constructor method and getters. This way more fields could be added later without breaking the existing users. |
||||||||||||||||||||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||||||||||||||||||||
| /// A trait for table function implementations | ||||||||||||||||||||||||||||||||||||||||||||
| pub trait TableFunctionImpl: Debug + Sync + Send + Any { | ||||||||||||||||||||||||||||||||||||||||||||
| /// Create a table provider | ||||||||||||||||||||||||||||||||||||||||||||
| fn call(&self, args: &[Expr]) -> Result<Arc<dyn TableProvider>>; | ||||||||||||||||||||||||||||||||||||||||||||
| #[deprecated( | ||||||||||||||||||||||||||||||||||||||||||||
| since = "53.0.0", | ||||||||||||||||||||||||||||||||||||||||||||
| note = "Implement `TableFunctionImpl::call_with_args` instead" | ||||||||||||||||||||||||||||||||||||||||||||
| )] | ||||||||||||||||||||||||||||||||||||||||||||
| fn call(&self, _args: &[Expr]) -> Result<Arc<dyn TableProvider>> { | ||||||||||||||||||||||||||||||||||||||||||||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Severity: medium 🤖 Was this useful? React with 👍 or 👎, or 🚀 if it prevented an incident/outage.
Owner
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. value:valid-but-wont-fix; category:bug; feedback: The Augment AI reviewer is not correct! The default implementation for |
||||||||||||||||||||||||||||||||||||||||||||
| internal_err!("unimplemented") | ||||||||||||||||||||||||||||||||||||||||||||
| } | ||||||||||||||||||||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||||||||||||||||||||
| /// Create a table provider | ||||||||||||||||||||||||||||||||||||||||||||
| fn call_with_args(&self, args: TableFunctionArgs) -> Result<Arc<dyn TableProvider>> { | ||||||||||||||||||||||||||||||||||||||||||||
| #[expect(deprecated)] | ||||||||||||||||||||||||||||||||||||||||||||
| self.call(args.args) | ||||||||||||||||||||||||||||||||||||||||||||
| } | ||||||||||||||||||||||||||||||||||||||||||||
| } | ||||||||||||||||||||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||||||||||||||||||||
| /// A table that uses a function to generate data | ||||||||||||||||||||||||||||||||||||||||||||
|
|
@@ -539,7 +559,20 @@ impl TableFunction { | |||||||||||||||||||||||||||||||||||||||||||
| } | ||||||||||||||||||||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||||||||||||||||||||
| /// Generates a table by passing `args` to the function implementation's deprecated `call` method | ||||||||||||||||||||||||||||||||||||||||||||
| #[deprecated( | ||||||||||||||||||||||||||||||||||||||||||||
| since = "53.0.0", | ||||||||||||||||||||||||||||||||||||||||||||
| note = "Use `TableFunction::create_table_provider_with_args` instead" | ||||||||||||||||||||||||||||||||||||||||||||
| )] | ||||||||||||||||||||||||||||||||||||||||||||
| pub fn create_table_provider(&self, args: &[Expr]) -> Result<Arc<dyn TableProvider>> { | ||||||||||||||||||||||||||||||||||||||||||||
| #[expect(deprecated)] | ||||||||||||||||||||||||||||||||||||||||||||
| self.fun.call(args) | ||||||||||||||||||||||||||||||||||||||||||||
| } | ||||||||||||||||||||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||||||||||||||||||||
| /// Generates a table by passing `args` (the call arguments plus the session) to the function implementation's `call_with_args` method | ||||||||||||||||||||||||||||||||||||||||||||
| pub fn create_table_provider_with_args( | ||||||||||||||||||||||||||||||||||||||||||||
| &self, | ||||||||||||||||||||||||||||||||||||||||||||
| args: TableFunctionArgs, | ||||||||||||||||||||||||||||||||||||||||||||
| ) -> Result<Arc<dyn TableProvider>> { | ||||||||||||||||||||||||||||||||||||||||||||
| self.fun.call_with_args(args) | ||||||||||||||||||||||||||||||||||||||||||||
| } | ||||||||||||||||||||||||||||||||||||||||||||
| } | ||||||||||||||||||||||||||||||||||||||||||||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
datafusion-examples/examples/udf/table_list_udtf.rs:100: Using `block_in_place` + `Handle::current().block_on(...)` will panic if `table_list()` is invoked without a Tokio runtime, and `block_in_place` also requires the multi-thread runtime. Since UDTFs can be called from various execution contexts, it may be worth documenting/guarding these runtime assumptions for the example. Severity: medium
🤖 Was this useful? React with 👍 or 👎, or 🚀 if it prevented an incident/outage.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
value:good-to-have; category:documentation; feedback: The Augment AI reviewer is correct! The example would work only in a multi-threaded Tokio runtime. It would be good to document this (e.g. with a comment), so the users copying this example into their application are aware of this limitation.