From 0567c02af4ffedffa6b561a74cd5a3d98ae5f1ab Mon Sep 17 00:00:00 2001 From: Michiel De Backker Date: Wed, 22 May 2024 08:54:11 +0200 Subject: [PATCH 01/31] feat: initial subquery support (#37) Co-authored-by: Suriya Kandaswamy Co-authored-by: Suriya Kandaswamy --- datafusion-federation/src/analyzer.rs | 170 ---------- datafusion-federation/src/lib.rs | 44 ++- datafusion-federation/src/optimizer.rs | 429 +++++++++++++++++++++++++ examples/examples/flight-sql.rs | 9 +- examples/examples/postgres-partial.rs | 9 +- examples/examples/sqlite-partial.rs | 11 +- examples/examples/sqlite-subquery.rs | 90 ++++++ examples/examples/sqlite.rs | 10 +- sources/sql/src/lib.rs | 34 +- 9 files changed, 584 insertions(+), 222 deletions(-) delete mode 100644 datafusion-federation/src/analyzer.rs create mode 100644 datafusion-federation/src/optimizer.rs create mode 100644 examples/examples/sqlite-subquery.rs diff --git a/datafusion-federation/src/analyzer.rs b/datafusion-federation/src/analyzer.rs deleted file mode 100644 index f5ba3a1..0000000 --- a/datafusion-federation/src/analyzer.rs +++ /dev/null @@ -1,170 +0,0 @@ -use std::sync::Arc; - -use datafusion::{ - config::ConfigOptions, - datasource::source_as_provider, - error::Result, - logical_expr::{Expr, LogicalPlan, Projection, TableScan, TableSource}, - optimizer::analyzer::AnalyzerRule, -}; - -use crate::{FederatedTableProviderAdaptor, FederatedTableSource, FederationProviderRef}; - -#[derive(Default)] -pub struct FederationAnalyzerRule {} - -impl AnalyzerRule for FederationAnalyzerRule { - // Walk over the plan, look for the largest subtrees that only have - // TableScans from the same FederationProvider. - // There 'largest sub-trees' are passed to their respective FederationProvider.optimizer. 
- fn analyze(&self, plan: LogicalPlan, config: &ConfigOptions) -> Result { - let (optimized, _) = self.optimize_recursively(&plan, None, config)?; - if let Some(result) = optimized { - return Ok(result); - } - Ok(plan.clone()) - } - - /// A human readable name for this optimizer rule - fn name(&self) -> &str { - "federation_optimizer_rule" - } -} - -impl FederationAnalyzerRule { - pub fn new() -> Self { - Self::default() - } - - // optimize_recursively recursively finds the largest sub-plans that can be federated - // to a single FederationProvider. - // Returns a plan if a sub-tree was federated, otherwise None. - // Returns a FederationProvider if it covers the entire sub-tree, otherwise None. - fn optimize_recursively( - &self, - plan: &LogicalPlan, - parent: Option<&LogicalPlan>, - _config: &ConfigOptions, - ) -> Result<(Option, Option)> { - // Check if this node determines the FederationProvider - let sole_provider = self.get_federation_provider(plan)?; - if sole_provider.is_some() { - return Ok((None, sole_provider)); - } - - // optimize_inputs - let inputs = plan.inputs(); - if inputs.is_empty() { - return Ok((None, None)); - } - - let (new_inputs, providers): (Vec<_>, Vec<_>) = inputs - .iter() - .map(|i| self.optimize_recursively(i, Some(plan), _config)) - .collect::>>()? - .into_iter() - .unzip(); - - // Note: assumes provider is None if ambiguous - let first_provider = providers.first().unwrap(); - let is_singular = providers.iter().all(|p| p.is_some() && p == first_provider); - - if is_singular { - if parent.is_none() { - // federate the entire plan - if let Some(provider) = first_provider { - if let Some(optimizer) = provider.analyzer() { - let optimized = optimizer.execute_and_check(plan, _config, |_, _| {})?; - return Ok((Some(optimized), None)); - } - return Ok((None, None)); - } - return Ok((None, None)); - } - // The largest sub-plan is higher up. 
- return Ok((None, first_provider.clone())); - } - - // The plan is ambiguous, any inputs that are not federated and - // have a sole provider, should be federated. - let new_inputs = new_inputs - .into_iter() - .enumerate() - .map(|(i, new_sub_plan)| { - if let Some(sub_plan) = new_sub_plan { - // Already federated - return Ok(sub_plan); - } - let sub_plan = inputs.get(i).unwrap(); - // Check if the input has a sole provider and can be federated. - if let Some(provider) = providers.get(i).unwrap() { - if let Some(optimizer) = provider.analyzer() { - let wrapped = wrap_projection((*sub_plan).clone())?; - - let optimized = - optimizer.execute_and_check(&wrapped, _config, |_, _| {})?; - return Ok(optimized); - } - // No federation for this sub-plan (no analyzer) - return Ok((*sub_plan).clone()); - } - // No federation for this sub-plan (no provider) - Ok((*sub_plan).clone()) - }) - .collect::>>()?; - - let new_plan = plan.with_new_exprs(plan.expressions(), new_inputs)?; - - Ok((Some(new_plan), None)) - } - - fn get_federation_provider(&self, plan: &LogicalPlan) -> Result> { - match plan { - LogicalPlan::TableScan(TableScan { ref source, .. }) => { - let Some(federated_source) = get_table_source(source)? 
else { - return Ok(None); - }; - let provider = federated_source.federation_provider(); - Ok(Some(provider)) - } - _ => Ok(None), - } - } -} - -fn wrap_projection(plan: LogicalPlan) -> Result { - // TODO: minimize requested columns - match plan { - LogicalPlan::Projection(_) => Ok(plan), - _ => { - let expr = plan - .schema() - .fields() - .iter() - .map(|f| Expr::Column(f.qualified_column())) - .collect::>(); - Ok(LogicalPlan::Projection(Projection::try_new( - expr, - Arc::new(plan), - )?)) - } - } -} - -pub fn get_table_source( - source: &Arc, -) -> Result>> { - // Unwrap TableSource - let source = source_as_provider(source)?; - - // Get FederatedTableProviderAdaptor - let Some(wrapper) = source - .as_any() - .downcast_ref::() - else { - return Ok(None); - }; - - // Return original FederatedTableSource - Ok(Some(Arc::clone(&wrapper.source))) -} diff --git a/datafusion-federation/src/lib.rs b/datafusion-federation/src/lib.rs index 999d296..b6bd949 100644 --- a/datafusion-federation/src/lib.rs +++ b/datafusion-federation/src/lib.rs @@ -4,16 +4,50 @@ use std::{ sync::Arc, }; -use datafusion::optimizer::analyzer::Analyzer; +use datafusion::{ + execution::context::{SessionContext, SessionState}, + optimizer::{optimizer::Optimizer, OptimizerRule}, +}; -mod analyzer; -pub use analyzer::*; +mod optimizer; +pub use optimizer::*; mod table_provider; pub use table_provider::*; mod plan_node; pub use plan_node::*; +pub fn default_session_state() -> SessionState { + let df_state = SessionContext::new().state(); + + let rules = default_optimizer_rules(); + df_state + .with_optimizer_rules(rules) + .with_query_planner(Arc::new(FederatedQueryPlanner::new())) +} + +pub fn default_optimizer_rules() -> Vec> { + // Get the default optimizer + let df_default = Optimizer::new(); + let mut default_rules = df_default.rules; + + // Insert the FederationOptimizerRule after the ScalarSubqueryToJoin. + // This ensures ScalarSubquery are replaced before we try to federate. 
+ let Some(pos) = default_rules + .iter() + .position(|x| x.name() == "scalar_subquery_to_join") + else { + panic!("Could not locate ScalarSubqueryToJoin"); + }; + + // TODO: check if we should allow other optimizers to run before the federation rule. + + let federation_rule = Arc::new(FederationOptimizerRule::new()); + default_rules.insert(pos + 1, federation_rule); + + default_rules +} + pub type FederationProviderRef = Arc; pub trait FederationProvider: Send + Sync { // Returns the name of the provider, used for comparison. @@ -23,9 +57,9 @@ pub trait FederationProvider: Send + Sync { // will execute a query. For example: database instance & catalog. fn compute_context(&self) -> Option; - // Returns an analyzer that can cut out part of the plan + // Returns an optimizer that can cut out part of the plan // to federate it. - fn analyzer(&self) -> Option>; + fn optimizer(&self) -> Option>; } impl fmt::Display for dyn FederationProvider { diff --git a/datafusion-federation/src/optimizer.rs b/datafusion-federation/src/optimizer.rs new file mode 100644 index 0000000..4afac32 --- /dev/null +++ b/datafusion-federation/src/optimizer.rs @@ -0,0 +1,429 @@ +use std::sync::Arc; + +use datafusion::common::not_impl_err; +use datafusion::common::tree_node::{Transformed, TreeNode, TreeNodeRecursion}; +use datafusion::logical_expr::Extension; +use datafusion::optimizer::optimizer::Optimizer; +use datafusion::optimizer::{OptimizerConfig, OptimizerRule}; +use datafusion::{ + datasource::source_as_provider, + error::Result, + logical_expr::{Expr, LogicalPlan, Projection, TableScan, TableSource}, +}; + +use crate::{ + FederatedTableProviderAdaptor, FederatedTableSource, FederationProvider, FederationProviderRef, +}; + +#[derive(Default)] +pub struct FederationOptimizerRule {} + +impl OptimizerRule for FederationOptimizerRule { + // Walk over the plan, look for the largest subtrees that only have + // TableScans from the same FederationProvider. 
+ // There 'largest sub-trees' are passed to their respective FederationProvider.optimizer. + fn try_optimize( + &self, + plan: &LogicalPlan, + config: &dyn OptimizerConfig, + ) -> Result> { + let (optimized, _) = self.optimize_plan_recursively(plan, true, config)?; + Ok(optimized) + } + + /// A human readable name for this optimizer rule + fn name(&self) -> &str { + "federation_optimizer_rule" + } +} + +enum ScanResult { + None, + Distinct(FederationProviderRef), + Ambiguous, +} + +impl ScanResult { + fn merge(&mut self, other: Self) { + match (&self, &other) { + (_, ScanResult::None) => {} + (ScanResult::None, _) => *self = other, + (ScanResult::Ambiguous, _) | (_, ScanResult::Ambiguous) => { + *self = ScanResult::Ambiguous + } + (ScanResult::Distinct(provider), ScanResult::Distinct(other_provider)) => { + if provider != other_provider { + *self = ScanResult::Ambiguous + } + } + } + } + fn add(&mut self, provider: Option) { + self.merge(ScanResult::from(provider)) + } + fn is_ambiguous(&self) -> bool { + matches!(self, ScanResult::Ambiguous) + } + fn is_none(&self) -> bool { + matches!(self, ScanResult::None) + } + fn is_some(&self) -> bool { + !self.is_none() + } + fn unwrap(self) -> Option { + match self { + ScanResult::None => None, + ScanResult::Distinct(provider) => Some(provider), + ScanResult::Ambiguous => panic!("called `ScanResult::unwrap()` on a `Ambiguous` value"), + } + } + fn check_recursion(&self) -> TreeNodeRecursion { + if self.is_ambiguous() { + TreeNodeRecursion::Stop + } else { + TreeNodeRecursion::Continue + } + } +} + +impl From> for ScanResult { + fn from(provider: Option) -> Self { + match provider { + Some(provider) => ScanResult::Distinct(provider), + None => ScanResult::None, + } + } +} + +impl PartialEq> for ScanResult { + fn eq(&self, other: &Option) -> bool { + match (self, other) { + (ScanResult::None, None) => true, + (ScanResult::Distinct(provider), Some(other_provider)) => provider == other_provider, + _ => false, + } + } +} + 
+impl Clone for ScanResult { + fn clone(&self) -> Self { + match self { + ScanResult::None => ScanResult::None, + ScanResult::Distinct(provider) => ScanResult::Distinct(provider.clone()), + ScanResult::Ambiguous => ScanResult::Ambiguous, + } + } +} + +impl FederationOptimizerRule { + pub fn new() -> Self { + Self::default() + } + + // scans a plan to see if it belongs to a single FederationProvider + fn scan_plan_recursively(&self, plan: &LogicalPlan) -> Result { + let mut sole_provider: ScanResult = ScanResult::None; + + plan.apply(&mut |p: &LogicalPlan| -> Result { + let exprs_provider = self.scan_plan_exprs(p)?; + sole_provider.merge(exprs_provider); + + if sole_provider.is_ambiguous() { + return Ok(TreeNodeRecursion::Stop); + } + + let sub_provider = get_leaf_provider(p)?; + sole_provider.add(sub_provider); + + Ok(sole_provider.check_recursion()) + })?; + + Ok(sole_provider) + } + + // scans a plan's expressions to see if it belongs to a single FederationProvider + fn scan_plan_exprs(&self, plan: &LogicalPlan) -> Result { + let mut sole_provider: ScanResult = ScanResult::None; + + let exprs = plan.expressions(); + for expr in &exprs { + let expr_result = self.scan_expr_recursively(expr)?; + sole_provider.merge(expr_result); + + if sole_provider.is_ambiguous() { + return Ok(sole_provider); + } + } + + Ok(sole_provider) + } + + // scans an expression to see if it belongs to a single FederationProvider + fn scan_expr_recursively(&self, expr: &Expr) -> Result { + let mut sole_provider: ScanResult = ScanResult::None; + + expr.apply(&mut |e: &Expr| -> Result { + // TODO: Support other types of sub-queries + match e { + Expr::ScalarSubquery(ref subquery) => { + let plan_result = self.scan_plan_recursively(&subquery.subquery)?; + + sole_provider.merge(plan_result); + Ok(sole_provider.check_recursion()) + } + Expr::InSubquery(_) => not_impl_err!("InSubquery"), + Expr::OuterReferenceColumn(..) 
=> { + // Subqueries that reference outer columns are not supported + // for now. We handle this here as ambiguity to force + // federation lower in the plan tree. + sole_provider = ScanResult::Ambiguous; + Ok(TreeNodeRecursion::Stop) + } + _ => Ok(TreeNodeRecursion::Continue), + } + })?; + + Ok(sole_provider) + } + + // optimize_recursively recursively finds the largest sub-plans that can be federated + // to a single FederationProvider. + // Returns a plan if a sub-tree was federated, otherwise None. + // Returns a ScanResult of all FederationProviders in the subtree. + fn optimize_plan_recursively( + &self, + plan: &LogicalPlan, + is_root: bool, + _config: &dyn OptimizerConfig, + ) -> Result<(Option, ScanResult)> { + // Used to track if all sources, including tableScan, plan inputs and + // expressions, represents an un-ambiguous or 'sole' FederationProvider + let mut sole_provider: ScanResult = ScanResult::None; + + if let LogicalPlan::Extension(Extension { ref node }) = plan { + if node.name() == "Federated" { + // Avoid attempting double federation + return Ok((None, ScanResult::Ambiguous)); + } + } + + // Check if this plan node is a leaf that determines the FederationProvider + let leaf_provider = get_leaf_provider(plan)?; + + // Check if the expressions contain, a potentially different, FederationProvider + let exprs_result = self.scan_plan_exprs(plan)?; + let optimize_expressions = exprs_result.is_some(); + + // Return early if this is a leaf and there is no ambiguity with the expressions. + if leaf_provider.is_some() && (exprs_result.is_none() || exprs_result == leaf_provider) { + return Ok((None, leaf_provider.into())); + } + // Aggregate leaf & expression providers + sole_provider.add(leaf_provider); + sole_provider.merge(exprs_result); + + let inputs = plan.inputs(); + // Return early if there are no sources. 
+ if inputs.is_empty() && sole_provider.is_none() { + return Ok((None, ScanResult::None)); + } + + // Recursively optimize inputs + let input_results = inputs + .iter() + .map(|i| self.optimize_plan_recursively(i, false, _config)) + .collect::>>()?; + + // Aggregate the input providers + input_results.iter().for_each(|(_, scan_result)| { + sole_provider.merge(scan_result.clone()); + }); + + if sole_provider.is_none() { + // No providers found + // TODO: Is/should this be reachable? + return Ok((None, ScanResult::None)); + } + + // If all sources are federated to the same provider + if let ScanResult::Distinct(provider) = sole_provider { + if !is_root { + // The largest sub-plan is higher up. + return Ok((None, ScanResult::Distinct(provider))); + } + + let Some(optimizer) = provider.optimizer() else { + // No optimizer provided + return Ok((None, ScanResult::None)); + }; + + // If this is the root plan node; federate the entire plan + let optimized = optimizer.optimize(plan, _config, |_, _| {})?; + return Ok((Some(optimized), ScanResult::None)); + } + + // The plan is ambiguous; any input that is not yet optimized and has a + // sole provider represents a largest sub-plan and should be federated. + // + // We loop over the input optimization results, federate where needed and + // return a complete list of new inputs for the optimized plan. + let new_inputs = input_results + .into_iter() + .enumerate() + .map(|(i, (input_plan, input_result))| { + if let Some(federated_plan) = input_plan { + // Already federated deeper in the plan tree + return Ok(federated_plan); + } + + let original_input = (*inputs.get(i).unwrap()).clone(); + if input_result.is_ambiguous() { + // Can happen if the input is already federated, so use + // the original input. + return Ok(original_input); + } + + let provider = input_result.unwrap(); + let Some(provider) = provider else { + // No provider for this input; use the original input. 
+ return Ok(original_input); + }; + + let Some(optimizer) = provider.optimizer() else { + // No optimizer for this input; use the original input. + return Ok(original_input); + }; + + // Replace the input with the federated counterpart + let wrapped = wrap_projection(original_input)?; + let optimized = optimizer.optimize(&wrapped, _config, |_, _| {})?; + + Ok(optimized) + }) + .collect::>>()?; + + // Optimize expressions if needed + let new_expressions = if optimize_expressions { + self.optimize_plan_exprs(plan, _config)? + } else { + plan.expressions() + }; + + // Construct the optimized plan + let new_plan = plan.with_new_exprs(new_expressions, new_inputs)?; + + // Return the federated plan + Ok((Some(new_plan), ScanResult::Ambiguous)) + } + + // Optimize all exprs of a plan + fn optimize_plan_exprs( + &self, + plan: &LogicalPlan, + _config: &dyn OptimizerConfig, + ) -> Result> { + plan.expressions() + .iter() + .map(|expr| { + let transformed = expr + .clone() + .transform(&|e| self.optimize_expr_recursively(e, _config))?; + Ok(transformed.data) + }) + .collect::>>() + } + + // recursively optimize expressions + // Current logic: individually federate every sub-query. + fn optimize_expr_recursively( + &self, + expr: Expr, + _config: &dyn OptimizerConfig, + ) -> Result> { + match expr { + Expr::ScalarSubquery(ref subquery) => { + // Optimize as root to force federating the sub-query + let (new_subquery, _) = + self.optimize_plan_recursively(&subquery.subquery, true, _config)?; + let Some(new_subquery) = new_subquery else { + return Ok(Transformed::no(expr)); + }; + Ok(Transformed::yes(Expr::ScalarSubquery( + subquery.with_plan(new_subquery.into()), + ))) + } + Expr::InSubquery(_) => not_impl_err!("InSubquery"), + _ => Ok(Transformed::no(expr)), + } + } +} + +// NopFederationProvider is used to represent tables that are not federated, but +// are resolved by DataFusion. This simplifies the logic of the optimizer rule. 
+struct NopFederationProvider {} + +impl FederationProvider for NopFederationProvider { + fn name(&self) -> &str { + "nop" + } + + fn compute_context(&self) -> Option { + None + } + + fn optimizer(&self) -> Option> { + None + } +} + +fn get_leaf_provider(plan: &LogicalPlan) -> Result> { + match plan { + LogicalPlan::TableScan(TableScan { ref source, .. }) => { + let Some(federated_source) = get_table_source(source)? else { + // Table is not federated but provided by a standard table provider. + // We use a placeholder federation provider to simplify the logic. + return Ok(Some(Arc::new(NopFederationProvider {}))); + }; + let provider = federated_source.federation_provider(); + Ok(Some(provider)) + } + _ => Ok(None), + } +} + +fn wrap_projection(plan: LogicalPlan) -> Result { + // TODO: minimize requested columns + match plan { + LogicalPlan::Projection(_) => Ok(plan), + _ => { + let expr = plan + .schema() + .fields() + .iter() + .map(|f| Expr::Column(f.qualified_column())) + .collect::>(); + Ok(LogicalPlan::Projection(Projection::try_new( + expr, + Arc::new(plan), + )?)) + } + } +} + +pub fn get_table_source( + source: &Arc, +) -> Result>> { + // Unwrap TableSource + let source = source_as_provider(source)?; + + // Get FederatedTableProviderAdaptor + let Some(wrapper) = source + .as_any() + .downcast_ref::() + else { + return Ok(None); + }; + + // Return original FederatedTableSource + Ok(Some(Arc::clone(&wrapper.source))) +} diff --git a/examples/examples/flight-sql.rs b/examples/examples/flight-sql.rs index 7a32e29..e0899cb 100644 --- a/examples/examples/flight-sql.rs +++ b/examples/examples/flight-sql.rs @@ -9,7 +9,6 @@ use datafusion::{ options::CsvReadOptions, }, }; -use datafusion_federation::{FederatedQueryPlanner, FederationAnalyzerRule}; use datafusion_federation_flight_sql::{executor::FlightSQLExecutor, server::FlightSqlService}; use datafusion_federation_sql::{SQLFederationProvider, SQLSchemaProvider}; use tokio::time::sleep; @@ -39,15 +38,9 @@ async fn 
main() -> Result<()> { sleep(Duration::from_secs(3)).await; // Local context - let state = SessionContext::new().state(); + let state = datafusion_federation::default_session_state(); let known_tables: Vec = ["test"].iter().map(|&x| x.into()).collect(); - // Register FederationAnalyzer - // TODO: Interaction with other analyzers & optimizers. - let state = state - .add_analyzer_rule(Arc::new(FederationAnalyzerRule::new())) - .with_query_planner(Arc::new(FederatedQueryPlanner::new())); - // Register schema // TODO: table inference let dsn: String = "http://localhost:50051".to_string(); diff --git a/examples/examples/postgres-partial.rs b/examples/examples/postgres-partial.rs index 873dd40..fab31a4 100644 --- a/examples/examples/postgres-partial.rs +++ b/examples/examples/postgres-partial.rs @@ -6,19 +6,12 @@ use datafusion::{ error::Result, execution::context::{SessionContext, SessionState}, }; -use datafusion_federation::{FederatedQueryPlanner, FederationAnalyzerRule}; use datafusion_federation_sql::connectorx::CXExecutor; use datafusion_federation_sql::{MultiSchemaProvider, SQLFederationProvider, SQLSchemaProvider}; #[tokio::main] async fn main() -> Result<()> { - let state = SessionContext::new().state(); - // Register FederationAnalyzer - // TODO: Interaction with other analyzers & optimizers. 
- let state = state - .add_analyzer_rule(Arc::new(FederationAnalyzerRule::new())) - .with_query_planner(Arc::new(FederatedQueryPlanner::new())); - + let state = datafusion_federation::default_session_state(); let df = task::spawn_blocking(move || { // Register schema let pg_provider_1 = async_std::task::block_on(create_postgres_provider(vec!["class"], "conn1")).unwrap(); diff --git a/examples/examples/sqlite-partial.rs b/examples/examples/sqlite-partial.rs index 780462b..2b6f5b9 100644 --- a/examples/examples/sqlite-partial.rs +++ b/examples/examples/sqlite-partial.rs @@ -7,24 +7,15 @@ use datafusion::{ error::Result, execution::context::{SessionContext, SessionState}, }; -use datafusion_federation::{FederatedQueryPlanner, FederationAnalyzerRule}; use datafusion_federation_sql::{connectorx::CXExecutor, SQLFederationProvider, SQLSchemaProvider}; #[tokio::main] async fn main() -> Result<()> { - let state = SessionContext::new().state(); - // Register FederationAnalyzer - // TODO: Interaction with other analyzers & optimizers. 
- let state = state - .add_analyzer_rule(Arc::new(FederationAnalyzerRule::new())) - .with_query_planner(Arc::new(FederatedQueryPlanner::new())); - - // Register schema + let state = datafusion_federation::default_session_state(); let provider = MultiSchemaProvider::new(vec![ create_sqlite_provider(vec!["Artist"], "conn1").await?, create_sqlite_provider(vec!["Track", "Album"], "conn2").await?, ]); - overwrite_default_schema(&state, Arc::new(provider))?; // Run query diff --git a/examples/examples/sqlite-subquery.rs b/examples/examples/sqlite-subquery.rs new file mode 100644 index 0000000..8f505bc --- /dev/null +++ b/examples/examples/sqlite-subquery.rs @@ -0,0 +1,90 @@ +use std::{any::Any, sync::Arc}; + +use async_trait::async_trait; +use datafusion::{ + catalog::schema::SchemaProvider, + datasource::TableProvider, + error::Result, + execution::context::{SessionContext, SessionState}, +}; +use datafusion_federation_sql::{connectorx::CXExecutor, SQLFederationProvider, SQLSchemaProvider}; + +#[tokio::main] +async fn main() -> Result<()> { + let state = datafusion_federation::default_session_state(); + let provider = MultiSchemaProvider::new(vec![ + create_sqlite_provider(vec!["Artist"], "conn1").await?, + create_sqlite_provider(vec!["Track", "Album"], "conn2").await?, + ]); + overwrite_default_schema(&state, Arc::new(provider))?; + + // Run query + let ctx = SessionContext::new_with_state(state); + let query = r#"SELECT Name, (SELECT Title FROM Album limit 1) FROM Artist limit 1"#; + let df = ctx.sql(query).await?; + + // let explain = df.clone().explain(true, false)?; + // explain.show().await?; + + df.show().await +} + +async fn create_sqlite_provider( + known_tables: Vec<&str>, + context: &str, +) -> Result> { + let dsn = "sqlite://./examples/examples/chinook.sqlite".to_string(); + let known_tables: Vec = known_tables.iter().map(|&x| x.into()).collect(); + let mut executor = CXExecutor::new(dsn)?; + executor.context(context.to_string()); + let provider = 
Arc::new(SQLFederationProvider::new(Arc::new(executor))); + Ok(Arc::new( + SQLSchemaProvider::new_with_tables(provider, known_tables).await?, + )) +} + +struct MultiSchemaProvider { + children: Vec>, +} + +impl MultiSchemaProvider { + pub fn new(children: Vec>) -> Self { + Self { children } + } +} + +fn overwrite_default_schema(state: &SessionState, schema: Arc) -> Result<()> { + let options = &state.config().options().catalog; + let catalog = state + .catalog_list() + .catalog(options.default_catalog.as_str()) + .unwrap(); + + catalog.register_schema(options.default_schema.as_str(), schema)?; + + Ok(()) +} + +#[async_trait] +impl SchemaProvider for MultiSchemaProvider { + fn as_any(&self) -> &dyn Any { + self + } + + fn table_names(&self) -> Vec { + self.children.iter().flat_map(|p| p.table_names()).collect() + } + + async fn table(&self, name: &str) -> Result>> { + for child in &self.children { + if let Ok(Some(table)) = child.table(name).await { + return Ok(Some(table)); + } + } + Ok(None) + } + + fn table_exist(&self, name: &str) -> bool { + self.children.iter().any(|p| p.table_exist(name)) + } +} diff --git a/examples/examples/sqlite.rs b/examples/examples/sqlite.rs index 74e5371..a43c7da 100644 --- a/examples/examples/sqlite.rs +++ b/examples/examples/sqlite.rs @@ -5,7 +5,6 @@ use datafusion::{ error::Result, execution::context::{SessionContext, SessionState}, }; -use datafusion_federation::{FederatedQueryPlanner, FederationAnalyzerRule}; use datafusion_federation_sql::{connectorx::CXExecutor, SQLFederationProvider, SQLSchemaProvider}; #[tokio::main] @@ -16,13 +15,7 @@ async fn main() -> datafusion::error::Result<()> { .map(|&x| x.into()) .collect(); - let state = SessionContext::new().state(); - - // Register FederationAnalyzer - // TODO: Interaction with other analyzers & optimizers. 
- let state = state - .add_analyzer_rule(Arc::new(FederationAnalyzerRule::new())) - .with_query_planner(Arc::new(FederatedQueryPlanner::new())); + let state = datafusion_federation::default_session_state(); // Register schema // TODO: table inference @@ -43,6 +36,7 @@ async fn main() -> datafusion::error::Result<()> { JOIN Album a ON t.AlbumId = a.AlbumId JOIN Artist ar ON a.ArtistId = ar.ArtistId limit 10"#; + let df = ctx.sql(query).await?; df.show().await?; diff --git a/sources/sql/src/lib.rs b/sources/sql/src/lib.rs index b814904..7619186 100644 --- a/sources/sql/src/lib.rs +++ b/sources/sql/src/lib.rs @@ -4,11 +4,10 @@ use std::{any::Any, sync::Arc, vec}; use async_trait::async_trait; use datafusion::{ arrow::datatypes::{Schema, SchemaRef}, - config::ConfigOptions, error::Result, execution::{context::SessionState, TaskContext}, logical_expr::{Extension, LogicalPlan}, - optimizer::analyzer::{Analyzer, AnalyzerRule}, + optimizer::{optimizer::Optimizer, OptimizerConfig, OptimizerRule}, physical_expr::EquivalenceProperties, physical_plan::{ DisplayAs, DisplayFormatType, ExecutionMode, ExecutionPlan, Partitioning, PlanProperties, @@ -30,15 +29,15 @@ pub use executor::*; // SQLFederationProvider provides federation to SQL DMBSs. 
pub struct SQLFederationProvider { - analyzer: Arc, + optimizer: Arc, executor: Arc, } impl SQLFederationProvider { pub fn new(executor: Arc) -> Self { Self { - analyzer: Arc::new(Analyzer::with_rules(vec![Arc::new( - SQLFederationAnalyzerRule::new(executor.clone()), + optimizer: Arc::new(Optimizer::with_rules(vec![Arc::new( + SQLFederationOptimizerRule::new(executor.clone()), )])), executor, } @@ -54,16 +53,16 @@ impl FederationProvider for SQLFederationProvider { self.executor.compute_context() } - fn analyzer(&self) -> Option> { - Some(self.analyzer.clone()) + fn optimizer(&self) -> Option> { + Some(self.optimizer.clone()) } } -struct SQLFederationAnalyzerRule { +struct SQLFederationOptimizerRule { planner: Arc, } -impl SQLFederationAnalyzerRule { +impl SQLFederationOptimizerRule { pub fn new(executor: Arc) -> Self { Self { planner: Arc::new(SQLFederationPlanner::new(executor.clone())), @@ -71,15 +70,24 @@ impl SQLFederationAnalyzerRule { } } -impl AnalyzerRule for SQLFederationAnalyzerRule { - fn analyze(&self, plan: LogicalPlan, _config: &ConfigOptions) -> Result { +impl OptimizerRule for SQLFederationOptimizerRule { + fn try_optimize( + &self, + plan: &LogicalPlan, + _config: &dyn OptimizerConfig, + ) -> Result> { + if let LogicalPlan::Extension(Extension { ref node }) = plan { + if node.name() == "Federated" { + // Avoid attempting double federation + return Ok(None); + } + } // Simply accept the entire plan for now - let fed_plan = FederatedPlanNode::new(plan.clone(), self.planner.clone()); let ext_node = Extension { node: Arc::new(fed_plan), }; - Ok(LogicalPlan::Extension(ext_node)) + Ok(Some(LogicalPlan::Extension(ext_node))) } /// A human readable name for this analyzer rule From ce87bf45681d230e816f27e545e720498c2de955 Mon Sep 17 00:00:00 2001 From: Phillip LeBlanc Date: Wed, 19 Jun 2024 01:35:51 +0900 Subject: [PATCH 02/31] feat: add fallback TableProvider to FederatedTableProviderAdaptor (#39) --- datafusion-federation/src/table_provider.rs | 87 
+++++++++++++++++++-- 1 file changed, 81 insertions(+), 6 deletions(-) diff --git a/datafusion-federation/src/table_provider.rs b/datafusion-federation/src/table_provider.rs index 6a9afaa..a1acc30 100644 --- a/datafusion-federation/src/table_provider.rs +++ b/datafusion-federation/src/table_provider.rs @@ -7,7 +7,7 @@ use datafusion::{ datasource::TableProvider, error::{DataFusionError, Result}, execution::context::SessionState, - logical_expr::{Expr, LogicalPlan, TableSource, TableType}, + logical_expr::{Expr, LogicalPlan, TableProviderFilterPushDown, TableSource, TableType}, physical_plan::ExecutionPlan, }; @@ -17,11 +17,28 @@ use crate::FederationProvider; // from a TableScan. This wrapper may be avoidable. pub struct FederatedTableProviderAdaptor { pub source: Arc, + pub table_provider: Option>, } impl FederatedTableProviderAdaptor { pub fn new(source: Arc) -> Self { - Self { source } + Self { + source, + table_provider: None, + } + } + + /// Creates a new FederatedTableProviderAdaptor that falls back to the + /// provided TableProvider. This is useful if used within a DataFusion + /// context without the federation optimizer. 
+ pub fn new_with_provider( + source: Arc, + table_provider: Arc, + ) -> Self { + Self { + source, + table_provider: Some(table_provider), + } } } @@ -31,34 +48,92 @@ impl TableProvider for FederatedTableProviderAdaptor { self } fn schema(&self) -> SchemaRef { + if let Some(table_provider) = &self.table_provider { + return table_provider.schema(); + } + self.source.schema() } fn constraints(&self) -> Option<&Constraints> { + if let Some(table_provider) = &self.table_provider { + return table_provider + .constraints() + .or_else(|| self.source.constraints()); + } + self.source.constraints() } fn table_type(&self) -> TableType { + if let Some(table_provider) = &self.table_provider { + return table_provider.table_type(); + } + self.source.table_type() } fn get_logical_plan(&self) -> Option<&LogicalPlan> { + if let Some(table_provider) = &self.table_provider { + return table_provider + .get_logical_plan() + .or_else(|| self.source.get_logical_plan()); + } + self.source.get_logical_plan() } fn get_column_default(&self, column: &str) -> Option<&Expr> { + if let Some(table_provider) = &self.table_provider { + return table_provider + .get_column_default(column) + .or_else(|| self.source.get_column_default(column)); + } + self.source.get_column_default(column) } + fn supports_filters_pushdown( + &self, + filters: &[&Expr], + ) -> Result> { + if let Some(table_provider) = &self.table_provider { + return table_provider.supports_filters_pushdown(filters); + } + + Ok(vec![ + TableProviderFilterPushDown::Unsupported; + filters.len() + ]) + } // Scan is not supported; the adaptor should be replaced // with a virtual TableProvider that provides federation for a sub-plan. 
async fn scan( &self, - _state: &SessionState, - _projection: Option<&Vec>, - _filters: &[Expr], - _limit: Option, + state: &SessionState, + projection: Option<&Vec>, + filters: &[Expr], + limit: Option, ) -> Result> { + if let Some(table_provider) = &self.table_provider { + return table_provider.scan(state, projection, filters, limit).await; + } + Err(DataFusionError::NotImplemented( "FederatedTableProviderAdaptor cannot scan".to_string(), )) } + + async fn insert_into( + &self, + _state: &SessionState, + input: Arc, + overwrite: bool, + ) -> Result> { + if let Some(table_provider) = &self.table_provider { + return table_provider.insert_into(_state, input, overwrite).await; + } + + Err(DataFusionError::NotImplemented( + "FederatedTableProviderAdaptor cannot insert_into".to_string(), + )) + } } // FederatedTableProvider extends DataFusion's TableProvider trait From 330e4e8e8c4e3d31b1a2cd8fed5df398485045c5 Mon Sep 17 00:00:00 2001 From: hozan23 Date: Fri, 23 Aug 2024 10:03:27 +0200 Subject: [PATCH 03/31] delete redundant github workflows & commitlint --- .github/workflows/check.yml | 13 ------------- .github/workflows/pull-request.yml | 30 ------------------------------ .github/workflows/test.yml | 4 ++-- commitlint.config.js | 8 -------- 4 files changed, 2 insertions(+), 53 deletions(-) delete mode 100644 .github/workflows/check.yml delete mode 100644 .github/workflows/pull-request.yml delete mode 100644 commitlint.config.js diff --git a/.github/workflows/check.yml b/.github/workflows/check.yml deleted file mode 100644 index fe66d27..0000000 --- a/.github/workflows/check.yml +++ /dev/null @@ -1,13 +0,0 @@ -name: Check - -on: [push, pull_request] - -jobs: - formatting: - name: Formatting - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v4 - - uses: actions/setup-node@v4 - - run: npm install prettier prettier-plugin-toml - - run: npx prettier --check --no-config . 
diff --git a/.github/workflows/pull-request.yml b/.github/workflows/pull-request.yml deleted file mode 100644 index 62da82f..0000000 --- a/.github/workflows/pull-request.yml +++ /dev/null @@ -1,30 +0,0 @@ -name: Pull Request - -on: - pull_request_target: - types: - - opened - - reopened - - edited - - synchronize - -jobs: - conventional-commits: - name: Conventional Commits - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v4 - - uses: actions/setup-node@v4 - - run: npm install @commitlint/config-conventional - - run: npx commitlint <<< $CONVENTIONAL_COMMIT - env: - CONVENTIONAL_COMMIT: | - ${{ github.event.pull_request.title }} - - ${{ github.event.pull_request.body }} - - if: failure() - run: - echo "Datafusion-federation follows the [Conventional Commits specification](https://www.conventionalcommits.org/en/v1.0.0/) for release automation. - The PR title and body are used as the merge commit message. - - Please update your PR title to match the specification." >> $GITHUB_STEP_SUMMARY diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 8e548ee..0792434 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -55,7 +55,7 @@ jobs: - uses: arduino/setup-protoc@v3 with: repo-token: ${{ secrets.GITHUB_TOKEN }} - - run: cargo clippy -- -Dwarnings + - run: cargo clippy -- -D warnings package: name: Package @@ -69,5 +69,5 @@ jobs: - uses: arduino/setup-protoc@v3 with: repo-token: ${{ secrets.GITHUB_TOKEN }} - - run: cargo build + - run: cargo build --all - run: cargo package -p datafusion-federation --allow-dirty diff --git a/commitlint.config.js b/commitlint.config.js deleted file mode 100644 index 0a2216d..0000000 --- a/commitlint.config.js +++ /dev/null @@ -1,8 +0,0 @@ -module.exports = { - extends: ["@commitlint/config-conventional"], - rules: { - "body-max-line-length": [0, "always", Infinity], - "footer-max-line-length": [0, "always", Infinity], - "header-max-length": [0, "always", Infinity], - }, -}; From 
f7182a3d45a5401e8b1976c46727593c5e6c3784 Mon Sep 17 00:00:00 2001 From: hozan23 Date: Thu, 22 Aug 2024 16:13:52 +0200 Subject: [PATCH 04/31] update deps && use datafusion 41 --- Cargo.toml | 12 +++--------- datafusion-federation/src/lib.rs | 7 +++---- datafusion-federation/src/optimizer.rs | 8 ++++---- datafusion-federation/src/plan_node.rs | 8 ++++++++ datafusion-federation/src/table_provider.rs | 6 +++--- sources/flight-sql/Cargo.toml | 14 ++++++++------ sources/sql/Cargo.toml | 14 +++++--------- sources/sql/src/connectorx/executor.rs | 1 + sources/sql/src/lib.rs | 6 +++++- sources/sql/src/schema.rs | 3 +-- 10 files changed, 41 insertions(+), 38 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 744de0b..09d77a8 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -3,23 +3,17 @@ resolver = "2" members = [ "datafusion-federation", - "examples", "sources/sql", "sources/flight-sql", ] -[patch.crates-io] -# connectorx = { path = "../connector-x/connectorx" } -# datafusion = { path = "../arrow-datafusion/datafusion/core" } - [workspace.package] version = "0.1.3" edition = "2021" license = "MIT" readme = "README.md" - [workspace.dependencies] -async-trait = "0.1.77" -datafusion = "37.0.0" -datafusion-substrait = "37.0.0" +async-trait = "0.1.81" +datafusion = "41.0.0" +datafusion-substrait = "41.0.0" diff --git a/datafusion-federation/src/lib.rs b/datafusion-federation/src/lib.rs index b6bd949..81de605 100644 --- a/datafusion-federation/src/lib.rs +++ b/datafusion-federation/src/lib.rs @@ -5,7 +5,7 @@ use std::{ }; use datafusion::{ - execution::context::{SessionContext, SessionState}, + execution::session_state::{SessionState, SessionStateBuilder}, optimizer::{optimizer::Optimizer, OptimizerRule}, }; @@ -18,12 +18,11 @@ mod plan_node; pub use plan_node::*; pub fn default_session_state() -> SessionState { - let df_state = SessionContext::new().state(); - let rules = default_optimizer_rules(); - df_state + SessionStateBuilder::new() .with_optimizer_rules(rules) 
.with_query_planner(Arc::new(FederatedQueryPlanner::new())) + .build() } pub fn default_optimizer_rules() -> Vec> { diff --git a/datafusion-federation/src/optimizer.rs b/datafusion-federation/src/optimizer.rs index 4afac32..d0fc24b 100644 --- a/datafusion-federation/src/optimizer.rs +++ b/datafusion-federation/src/optimizer.rs @@ -258,7 +258,7 @@ impl FederationOptimizerRule { }; // If this is the root plan node; federate the entire plan - let optimized = optimizer.optimize(plan, _config, |_, _| {})?; + let optimized = optimizer.optimize(plan.clone(), _config, |_, _| {})?; return Ok((Some(optimized), ScanResult::None)); } @@ -296,7 +296,7 @@ impl FederationOptimizerRule { // Replace the input with the federated counterpart let wrapped = wrap_projection(original_input)?; - let optimized = optimizer.optimize(&wrapped, _config, |_, _| {})?; + let optimized = optimizer.optimize(wrapped, _config, |_, _| {})?; Ok(optimized) }) @@ -398,9 +398,9 @@ fn wrap_projection(plan: LogicalPlan) -> Result { _ => { let expr = plan .schema() - .fields() + .columns() .iter() - .map(|f| Expr::Column(f.qualified_column())) + .map(|c| Expr::Column(c.clone())) .collect::>(); Ok(LogicalPlan::Projection(Projection::try_new( expr, diff --git a/datafusion-federation/src/plan_node.rs b/datafusion-federation/src/plan_node.rs index 35a9306..c81b152 100644 --- a/datafusion-federation/src/plan_node.rs +++ b/datafusion-federation/src/plan_node.rs @@ -65,6 +65,14 @@ impl UserDefinedLogicalNodeCore for FederatedPlanNode { planner: self.planner.clone(), } } + + /// XXX should consider something else here ? 
+ fn with_exprs_and_inputs(&self, _exprs: Vec, _inputs: Vec) -> Result { + Ok(Self { + plan: self.plan.clone(), + planner: self.planner.clone(), + }) + } } #[derive(Default)] diff --git a/datafusion-federation/src/table_provider.rs b/datafusion-federation/src/table_provider.rs index a1acc30..b820b6e 100644 --- a/datafusion-federation/src/table_provider.rs +++ b/datafusion-federation/src/table_provider.rs @@ -3,10 +3,10 @@ use std::{any::Any, sync::Arc}; use async_trait::async_trait; use datafusion::{ arrow::datatypes::SchemaRef, + catalog::Session, common::Constraints, datasource::TableProvider, error::{DataFusionError, Result}, - execution::context::SessionState, logical_expr::{Expr, LogicalPlan, TableProviderFilterPushDown, TableSource, TableType}, physical_plan::ExecutionPlan, }; @@ -106,7 +106,7 @@ impl TableProvider for FederatedTableProviderAdaptor { // with a virtual TableProvider that provides federation for a sub-plan. async fn scan( &self, - state: &SessionState, + state: &dyn Session, projection: Option<&Vec>, filters: &[Expr], limit: Option, @@ -122,7 +122,7 @@ impl TableProvider for FederatedTableProviderAdaptor { async fn insert_into( &self, - _state: &SessionState, + _state: &dyn Session, input: Arc, overwrite: bool, ) -> Result> { diff --git a/sources/flight-sql/Cargo.toml b/sources/flight-sql/Cargo.toml index f778dfc..a7ed673 100644 --- a/sources/flight-sql/Cargo.toml +++ b/sources/flight-sql/Cargo.toml @@ -3,7 +3,6 @@ name = "datafusion-federation-flight-sql" version.workspace = true edition.workspace = true license.workspace = true -readme.workspace = true [lib] name = "datafusion_federation_flight_sql" @@ -13,11 +12,14 @@ path = "src/lib.rs" async-trait.workspace = true datafusion.workspace = true datafusion-substrait.workspace = true + +# XXX use the release verion on crates.io datafusion-federation.path = "../../datafusion-federation" datafusion-federation-sql.path = "../sql" + futures = "0.3.30" -tonic = {version="0.11.0", features=["tls"] } 
-prost = "0.12.3" -arrow = "51.0.0" -arrow-flight = { version = "51.0.0", features = ["flight-sql-experimental"] } -log = "0.4.20" +tonic = {version="0.12.0", features=["tls"] } +prost = "0.13" +arrow = "52.0.0" +arrow-flight = { version = "52.0.0", features = ["flight-sql-experimental"] } +log = "0.4.22" diff --git a/sources/sql/Cargo.toml b/sources/sql/Cargo.toml index bc89d95..31d8218 100644 --- a/sources/sql/Cargo.toml +++ b/sources/sql/Cargo.toml @@ -3,7 +3,6 @@ name = "datafusion-federation-sql" version.workspace = true edition.workspace = true license.workspace = true -readme.workspace = true [lib] name = "datafusion_federation_sql" @@ -11,14 +10,11 @@ path = "src/lib.rs" [dependencies] async-trait.workspace = true -# connectorx = { version = "0.3.2", features = ["src_sqlite"] } -# https://github.com/sfu-db/connector-x/pull/555 -connectorx = { git = "https://github.com/devinjdangelo/connector-x.git", features = [ - "dst_arrow", - "src_sqlite" -] } datafusion.workspace = true + +# XXX use the release verion on crates.io datafusion-federation.path = "../../datafusion-federation" -# derive_builder = "0.13.0" + +connectorx = { version = "0.3.3" , features = ["dst_arrow", "src_sqlite"] } futures = "0.3.30" -tokio = "1.35.1" +tokio = "1.39" diff --git a/sources/sql/src/connectorx/executor.rs b/sources/sql/src/connectorx/executor.rs index fc5ea3d..033f39c 100644 --- a/sources/sql/src/connectorx/executor.rs +++ b/sources/sql/src/connectorx/executor.rs @@ -53,6 +53,7 @@ impl SQLExecutor for CXExecutor { fn compute_context(&self) -> Option { Some(self.context.clone()) } + fn execute(&self, sql: &str, schema: SchemaRef) -> Result { let conn = self.conn.clone(); let query: CXQuery = sql.into(); diff --git a/sources/sql/src/lib.rs b/sources/sql/src/lib.rs index 7619186..217d3d7 100644 --- a/sources/sql/src/lib.rs +++ b/sources/sql/src/lib.rs @@ -162,6 +162,10 @@ impl DisplayAs for VirtualExecutionPlan { } impl ExecutionPlan for VirtualExecutionPlan { + fn name(&self) -> 
&str { + "sql_federation_exec" + } + fn as_any(&self) -> &dyn Any { self } @@ -170,7 +174,7 @@ impl ExecutionPlan for VirtualExecutionPlan { self.schema() } - fn children(&self) -> Vec> { + fn children(&self) -> Vec<&Arc> { vec![] } diff --git a/sources/sql/src/schema.rs b/sources/sql/src/schema.rs index c780f23..83d6e08 100644 --- a/sources/sql/src/schema.rs +++ b/sources/sql/src/schema.rs @@ -2,8 +2,7 @@ use async_trait::async_trait; use datafusion::logical_expr::{TableSource, TableType}; use datafusion::{ - arrow::datatypes::SchemaRef, catalog::schema::SchemaProvider, datasource::TableProvider, - error::Result, + arrow::datatypes::SchemaRef, catalog::SchemaProvider, datasource::TableProvider, error::Result, }; use futures::future::join_all; use std::{any::Any, sync::Arc}; From 05519e8e38f9d3543d53bfe9bb6095f27b2907fb Mon Sep 17 00:00:00 2001 From: hozan23 Date: Thu, 22 Aug 2024 16:26:41 +0200 Subject: [PATCH 05/31] remove connectorx from sources/sql crate --- sources/sql/README.md | 4 + sources/sql/src/connectorx/executor.rs | 126 ------------------------- sources/sql/src/connectorx/mod.rs | 2 - sources/sql/src/lib.rs | 1 - 4 files changed, 4 insertions(+), 129 deletions(-) create mode 100644 sources/sql/README.md delete mode 100644 sources/sql/src/connectorx/executor.rs delete mode 100644 sources/sql/src/connectorx/mod.rs diff --git a/sources/sql/README.md b/sources/sql/README.md new file mode 100644 index 0000000..def0668 --- /dev/null +++ b/sources/sql/README.md @@ -0,0 +1,4 @@ + + +This will be move to +[datafusion-table-providers](https://github.com/datafusion-contrib/datafusion-table-providers) repository diff --git a/sources/sql/src/connectorx/executor.rs b/sources/sql/src/connectorx/executor.rs deleted file mode 100644 index 033f39c..0000000 --- a/sources/sql/src/connectorx/executor.rs +++ /dev/null @@ -1,126 +0,0 @@ -use async_trait::async_trait; -use connectorx::{ - destinations::arrow::ArrowDestinationError, - errors::{ConnectorXError, 
ConnectorXOutError}, - prelude::{get_arrow, CXQuery, SourceConn, SourceType}, -}; -use datafusion::{ - arrow::datatypes::{Field, Schema, SchemaRef}, - error::{DataFusionError, Result}, - physical_plan::{ - stream::RecordBatchStreamAdapter, EmptyRecordBatchStream, SendableRecordBatchStream, - }, - sql::sqlparser::dialect::{Dialect, GenericDialect, PostgreSqlDialect, SQLiteDialect}, -}; -use futures::executor::block_on; -use std::sync::Arc; -use tokio::task; - -use crate::executor::SQLExecutor; - -pub struct CXExecutor { - context: String, - conn: SourceConn, -} - -impl CXExecutor { - pub fn new(dsn: String) -> Result { - let conn = SourceConn::try_from(dsn.as_str()).map_err(cx_error_to_df)?; - Ok(Self { context: dsn, conn }) - } - - pub fn new_with_conn(conn: SourceConn) -> Self { - Self { - context: conn.conn.to_string(), - conn, - } - } - - pub fn context(&mut self, context: String) { - self.context = context; - } -} - -fn cx_error_to_df(err: ConnectorXError) -> DataFusionError { - DataFusionError::External(format!("ConnectorX: {err:?}").into()) -} - -#[async_trait] -impl SQLExecutor for CXExecutor { - fn name(&self) -> &str { - "connector_x_executor" - } - fn compute_context(&self) -> Option { - Some(self.context.clone()) - } - - fn execute(&self, sql: &str, schema: SchemaRef) -> Result { - let conn = self.conn.clone(); - let query: CXQuery = sql.into(); - - let mut dst = block_on(task::spawn_blocking(move || -> Result<_, _> { - get_arrow(&conn, None, &[query.clone()]).map_err(cx_out_error_to_df) - })) - .map_err(|err| DataFusionError::External(err.to_string().into()))??; - let stream = if let Some(batch) = dst.record_batch().map_err(cx_dst_error_to_df)? 
{ - futures::stream::once(async move { Ok(batch) }) - } else { - return Ok(Box::pin(EmptyRecordBatchStream::new(Arc::new( - Schema::empty(), - )))); - }; - - Ok(Box::pin(RecordBatchStreamAdapter::new(schema, stream))) - } - - async fn table_names(&self) -> Result> { - Err(DataFusionError::NotImplemented( - "connector_x source: table inference not implemented".to_string(), - )) - } - - async fn get_table_schema(&self, table_name: &str) -> Result { - let conn = self.conn.clone(); - let query: CXQuery = format!("select * from {table_name} limit 1") - .as_str() - .into(); - - let dst = get_arrow(&conn, None, &[query.clone()]).map_err(cx_out_error_to_df)?; - let schema = schema_to_lowercase(dst.arrow_schema()); - Ok(schema) - } - - fn dialect(&self) -> Arc { - match &self.conn.ty { - SourceType::Postgres => Arc::new(PostgreSqlDialect {}), - SourceType::SQLite => Arc::new(SQLiteDialect {}), - _ => Arc::new(GenericDialect {}), - } - } -} - -fn cx_dst_error_to_df(err: ArrowDestinationError) -> DataFusionError { - DataFusionError::External(format!("ConnectorX failed to run query: {err:?}").into()) -} - -/// Get the schema with lowercase field names -fn schema_to_lowercase(schema: SchemaRef) -> SchemaRef { - // DF needs lower case schema - let lower_fields: Vec<_> = schema - .fields - .iter() - .map(|f| { - Field::new( - f.name().to_ascii_lowercase(), - f.data_type().clone(), - f.is_nullable(), - ) - }) - .collect(); - - Arc::new(Schema::new(lower_fields)) -} - -fn cx_out_error_to_df(err: ConnectorXOutError) -> DataFusionError { - DataFusionError::External(format!("ConnectorX failed to run query: {err:?}").into()) -} diff --git a/sources/sql/src/connectorx/mod.rs b/sources/sql/src/connectorx/mod.rs deleted file mode 100644 index 600069a..0000000 --- a/sources/sql/src/connectorx/mod.rs +++ /dev/null @@ -1,2 +0,0 @@ -mod executor; -pub use executor::*; diff --git a/sources/sql/src/lib.rs b/sources/sql/src/lib.rs index 217d3d7..917fa8d 100644 --- a/sources/sql/src/lib.rs +++ 
b/sources/sql/src/lib.rs @@ -20,7 +20,6 @@ use datafusion_federation::{FederatedPlanNode, FederationPlanner, FederationProv mod schema; pub use schema::*; -pub mod connectorx; mod executor; pub use executor::*; From 3408e797b90eefbce0bf2f6e03a74e717cb75ef0 Mon Sep 17 00:00:00 2001 From: hozan23 Date: Thu, 22 Aug 2024 17:09:50 +0200 Subject: [PATCH 06/31] flight-sql: fix up dependencie issues --- sources/flight-sql/Cargo.toml | 6 +-- sources/flight-sql/src/server/service.rs | 47 ++++++++++++------------ sources/flight-sql/src/server/state.rs | 3 +- 3 files changed, 29 insertions(+), 27 deletions(-) diff --git a/sources/flight-sql/Cargo.toml b/sources/flight-sql/Cargo.toml index a7ed673..07c39e7 100644 --- a/sources/flight-sql/Cargo.toml +++ b/sources/flight-sql/Cargo.toml @@ -18,8 +18,8 @@ datafusion-federation.path = "../../datafusion-federation" datafusion-federation-sql.path = "../sql" futures = "0.3.30" -tonic = {version="0.12.0", features=["tls"] } -prost = "0.13" +tonic = {version="0.11.0", features=["tls", "transport", "codegen", "prost"] } +prost = "0.12.3" arrow = "52.0.0" -arrow-flight = { version = "52.0.0", features = ["flight-sql-experimental"] } +arrow-flight = { version = "52.2.0", features = ["flight-sql-experimental"] } log = "0.4.22" diff --git a/sources/flight-sql/src/server/service.rs b/sources/flight-sql/src/server/service.rs index c49cf32..afa4a5f 100644 --- a/sources/flight-sql/src/server/service.rs +++ b/sources/flight-sql/src/server/service.rs @@ -1,14 +1,11 @@ -use arrow::datatypes::SchemaRef; -use arrow::error::ArrowError; -use arrow::ipc::writer::IpcWriteOptions; -use arrow_flight::encode::FlightDataEncoderBuilder; -use arrow_flight::error::FlightError; -use arrow_flight::flight_service_server::{FlightService, FlightServiceServer}; -use arrow_flight::sql::server::{ - FlightSqlService as ArrowFlightSqlService, PeekableFlightDataStream, -}; +use std::pin::Pin; +use std::sync::Arc; + +use arrow::{datatypes::SchemaRef, error::ArrowError, 
ipc::writer::IpcWriteOptions}; use arrow_flight::sql::{ - self, ActionBeginSavepointRequest, ActionBeginSavepointResult, ActionBeginTransactionRequest, + self, + server::{FlightSqlService as ArrowFlightSqlService, PeekableFlightDataStream}, + ActionBeginSavepointRequest, ActionBeginSavepointResult, ActionBeginTransactionRequest, ActionBeginTransactionResult, ActionCancelQueryRequest, ActionCancelQueryResult, ActionClosePreparedStatementRequest, ActionCreatePreparedStatementRequest, ActionCreatePreparedStatementResult, ActionCreatePreparedSubstraitPlanRequest, @@ -16,26 +13,30 @@ use arrow_flight::sql::{ CommandGetCrossReference, CommandGetDbSchemas, CommandGetExportedKeys, CommandGetImportedKeys, CommandGetPrimaryKeys, CommandGetSqlInfo, CommandGetTableTypes, CommandGetTables, CommandGetXdbcTypeInfo, CommandPreparedStatementQuery, CommandPreparedStatementUpdate, - CommandStatementQuery, CommandStatementSubstraitPlan, CommandStatementUpdate, SqlInfo, - TicketStatementQuery, + CommandStatementQuery, CommandStatementSubstraitPlan, CommandStatementUpdate, + DoPutPreparedStatementResult, SqlInfo, TicketStatementQuery, }; use arrow_flight::{ + encode::FlightDataEncoderBuilder, + error::FlightError, + flight_service_server::{FlightService, FlightServiceServer}, Action, FlightDescriptor, FlightEndpoint, FlightInfo, HandshakeRequest, HandshakeResponse, IpcMessage, SchemaAsIpc, Ticket, }; -use datafusion::common::arrow::datatypes::Schema; -use datafusion::dataframe::DataFrame; -use datafusion::error::{DataFusionError, Result as DataFusionResult}; -use datafusion::execution::context::{SQLOptions, SessionContext, SessionState}; -use datafusion::logical_expr::LogicalPlan; -use datafusion::physical_plan::SendableRecordBatchStream; -use datafusion_substrait::logical_plan::consumer::from_substrait_plan; -use datafusion_substrait::serializer::deserialize_bytes; +use datafusion::{ + common::arrow::datatypes::Schema, + dataframe::DataFrame, + error::{DataFusionError, Result as 
DataFusionResult}, + execution::context::{SQLOptions, SessionContext, SessionState}, + logical_expr::LogicalPlan, + physical_plan::SendableRecordBatchStream, +}; +use datafusion_substrait::{ + logical_plan::consumer::from_substrait_plan, serializer::deserialize_bytes, +}; use futures::{Stream, StreamExt, TryStreamExt}; use log::info; use prost::bytes::Bytes; -use std::pin::Pin; -use std::sync::Arc; use tonic::transport::Server; use tonic::{Request, Response, Status, Streaming}; @@ -601,7 +602,7 @@ impl ArrowFlightSqlService for FlightSqlService { &self, _query: CommandPreparedStatementQuery, request: Request, - ) -> Result::DoPutStream>> { + ) -> Result { info!("do_put_prepared_statement_query"); let (_, _) = self.new_context(request)?; diff --git a/sources/flight-sql/src/server/state.rs b/sources/flight-sql/src/server/state.rs index 0b4ec09..e28ffdd 100644 --- a/sources/flight-sql/src/server/state.rs +++ b/sources/flight-sql/src/server/state.rs @@ -1,9 +1,10 @@ +use std::fmt::Display; + use arrow_flight::{ error::FlightError, sql::{self, Any, Command}, }; use prost::{bytes::Bytes, Message}; -use std::fmt::Display; pub type Result = std::result::Result; From a514a6cd4734de1ab1a0529deacc2eab1b21eb4d Mon Sep 17 00:00:00 2001 From: hozan23 Date: Fri, 23 Aug 2024 09:45:32 +0200 Subject: [PATCH 07/31] move sources/sql code to datafusion-federation crate and add sql feature --- Cargo.toml | 3 +-- datafusion-federation/Cargo.toml | 5 +++++ datafusion-federation/src/lib.rs | 18 ++++++++++-------- .../src/sql}/executor.rs | 0 .../src/sql/mod.rs | 15 +++++++-------- .../src/sql}/schema.rs | 11 +++++------ 6 files changed, 28 insertions(+), 24 deletions(-) rename {sources/sql/src => datafusion-federation/src/sql}/executor.rs (100%) rename sources/sql/src/lib.rs => datafusion-federation/src/sql/mod.rs (95%) rename {sources/sql/src => datafusion-federation/src/sql}/schema.rs (96%) diff --git a/Cargo.toml b/Cargo.toml index 09d77a8..f149159 100644 --- a/Cargo.toml +++ 
b/Cargo.toml @@ -3,8 +3,7 @@ resolver = "2" members = [ "datafusion-federation", - "sources/sql", - "sources/flight-sql", + # "sources/flight-sql", ] [workspace.package] diff --git a/datafusion-federation/Cargo.toml b/datafusion-federation/Cargo.toml index 83e2723..fa6bd74 100644 --- a/datafusion-federation/Cargo.toml +++ b/datafusion-federation/Cargo.toml @@ -10,10 +10,15 @@ description = "Datafusion federation." name = "datafusion_federation" path = "src/lib.rs" +[features] +sql = ["futures"] + [dependencies] async-trait.workspace = true datafusion.workspace = true +futures = { version = "0.3.30", optional = true } + [package.metadata.docs.rs] # Whether to pass `--all-features` to Cargo (default: false) diff --git a/datafusion-federation/src/lib.rs b/datafusion-federation/src/lib.rs index 81de605..012c62d 100644 --- a/datafusion-federation/src/lib.rs +++ b/datafusion-federation/src/lib.rs @@ -1,5 +1,11 @@ -use core::fmt; +mod optimizer; +mod plan_node; +mod table_provider; +#[cfg(feature = "sql")] +pub mod sql; + use std::{ + fmt, hash::{Hash, Hasher}, sync::Arc, }; @@ -9,13 +15,9 @@ use datafusion::{ optimizer::{optimizer::Optimizer, OptimizerRule}, }; -mod optimizer; -pub use optimizer::*; -mod table_provider; -pub use table_provider::*; - -mod plan_node; -pub use plan_node::*; +pub use optimizer::{get_table_source, FederationOptimizerRule}; +pub use plan_node::{FederatedPlanNode, FederatedQueryPlanner, FederationPlanner}; +pub use table_provider::{FederatedTableProviderAdaptor, FederatedTableSource}; pub fn default_session_state() -> SessionState { let rules = default_optimizer_rules(); diff --git a/sources/sql/src/executor.rs b/datafusion-federation/src/sql/executor.rs similarity index 100% rename from sources/sql/src/executor.rs rename to datafusion-federation/src/sql/executor.rs diff --git a/sources/sql/src/lib.rs b/datafusion-federation/src/sql/mod.rs similarity index 95% rename from sources/sql/src/lib.rs rename to datafusion-federation/src/sql/mod.rs 
index 917fa8d..d37f2b5 100644 --- a/sources/sql/src/lib.rs +++ b/datafusion-federation/src/sql/mod.rs @@ -1,6 +1,9 @@ -use core::fmt; -use std::{any::Any, sync::Arc, vec}; +mod executor; +mod schema; + +use std::{any::Any, fmt, sync::Arc, vec}; +use crate::{FederatedPlanNode, FederationPlanner, FederationProvider}; use async_trait::async_trait; use datafusion::{ arrow::datatypes::{Schema, SchemaRef}, @@ -15,13 +18,9 @@ use datafusion::{ }, sql::unparser::plan_to_sql, }; -use datafusion_federation::{FederatedPlanNode, FederationPlanner, FederationProvider}; -mod schema; -pub use schema::*; - -mod executor; -pub use executor::*; +pub use executor::{SQLExecutor, SQLExecutorRef}; +pub use schema::{MultiSchemaProvider, SQLSchemaProvider, SQLTableSource}; // #[macro_use] // extern crate derive_builder; diff --git a/sources/sql/src/schema.rs b/datafusion-federation/src/sql/schema.rs similarity index 96% rename from sources/sql/src/schema.rs rename to datafusion-federation/src/sql/schema.rs index 83d6e08..cb35ee6 100644 --- a/sources/sql/src/schema.rs +++ b/datafusion-federation/src/sql/schema.rs @@ -1,18 +1,17 @@ -use async_trait::async_trait; +use std::{any::Any, sync::Arc}; +use async_trait::async_trait; use datafusion::logical_expr::{TableSource, TableType}; use datafusion::{ arrow::datatypes::SchemaRef, catalog::SchemaProvider, datasource::TableProvider, error::Result, }; use futures::future::join_all; -use std::{any::Any, sync::Arc}; -use datafusion_federation::{ - FederatedTableProviderAdaptor, FederatedTableSource, FederationProvider, +use crate::{ + sql::SQLFederationProvider, FederatedTableProviderAdaptor, FederatedTableSource, + FederationProvider, }; -use crate::SQLFederationProvider; - pub struct SQLSchemaProvider { // provider: Arc, tables: Vec>, From b699eeca1ec71a07b86b9ab1f8b7f1211e5badc8 Mon Sep 17 00:00:00 2001 From: hozan23 Date: Fri, 23 Aug 2024 10:00:34 +0200 Subject: [PATCH 08/31] move sources/flight-sql to datafusion-flight-sql-server --- Cargo.toml 
| 2 +- datafusion-federation/src/lib.rs | 2 +- datafusion-federation/src/sql/mod.rs | 3 ++- .../Cargo.toml | 17 ++++++++++------ .../src/executor/mod.rs | 6 +++--- .../src/lib.rs | 0 .../src/server/mod.rs | 0 .../src/server/service.rs | 0 .../src/server/session.rs | 0 .../src/server/state.rs | 0 sources/sql/Cargo.toml | 20 ------------------- sources/sql/README.md | 4 ---- 12 files changed, 18 insertions(+), 36 deletions(-) rename {sources/flight-sql => datafusion-flight-sql-server}/Cargo.toml (60%) rename {sources/flight-sql => datafusion-flight-sql-server}/src/executor/mod.rs (98%) rename {sources/flight-sql => datafusion-flight-sql-server}/src/lib.rs (100%) rename {sources/flight-sql => datafusion-flight-sql-server}/src/server/mod.rs (100%) rename {sources/flight-sql => datafusion-flight-sql-server}/src/server/service.rs (100%) rename {sources/flight-sql => datafusion-flight-sql-server}/src/server/session.rs (100%) rename {sources/flight-sql => datafusion-flight-sql-server}/src/server/state.rs (100%) delete mode 100644 sources/sql/Cargo.toml delete mode 100644 sources/sql/README.md diff --git a/Cargo.toml b/Cargo.toml index f149159..53ee52c 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -3,7 +3,7 @@ resolver = "2" members = [ "datafusion-federation", - # "sources/flight-sql", + "datafusion-flight-sql-server", ] [workspace.package] diff --git a/datafusion-federation/src/lib.rs b/datafusion-federation/src/lib.rs index 012c62d..b3eaff3 100644 --- a/datafusion-federation/src/lib.rs +++ b/datafusion-federation/src/lib.rs @@ -1,8 +1,8 @@ mod optimizer; mod plan_node; -mod table_provider; #[cfg(feature = "sql")] pub mod sql; +mod table_provider; use std::{ fmt, diff --git a/datafusion-federation/src/sql/mod.rs b/datafusion-federation/src/sql/mod.rs index d37f2b5..e19950d 100644 --- a/datafusion-federation/src/sql/mod.rs +++ b/datafusion-federation/src/sql/mod.rs @@ -3,7 +3,6 @@ mod schema; use std::{any::Any, fmt, sync::Arc, vec}; -use crate::{FederatedPlanNode, 
FederationPlanner, FederationProvider}; use async_trait::async_trait; use datafusion::{ arrow::datatypes::{Schema, SchemaRef}, @@ -22,6 +21,8 @@ use datafusion::{ pub use executor::{SQLExecutor, SQLExecutorRef}; pub use schema::{MultiSchemaProvider, SQLSchemaProvider, SQLTableSource}; +use crate::{FederatedPlanNode, FederationPlanner, FederationProvider}; + // #[macro_use] // extern crate derive_builder; diff --git a/sources/flight-sql/Cargo.toml b/datafusion-flight-sql-server/Cargo.toml similarity index 60% rename from sources/flight-sql/Cargo.toml rename to datafusion-flight-sql-server/Cargo.toml index 07c39e7..8e466e2 100644 --- a/sources/flight-sql/Cargo.toml +++ b/datafusion-flight-sql-server/Cargo.toml @@ -1,24 +1,29 @@ [package] -name = "datafusion-federation-flight-sql" +name = "datafusion-flight-sql-server" version.workspace = true edition.workspace = true license.workspace = true [lib] -name = "datafusion_federation_flight_sql" +name = "datafusion_flight_sql_server" path = "src/lib.rs" [dependencies] async-trait.workspace = true datafusion.workspace = true datafusion-substrait.workspace = true - # XXX use the release verion on crates.io -datafusion-federation.path = "../../datafusion-federation" -datafusion-federation-sql.path = "../sql" +datafusion-federation = { path = "../datafusion-federation", features = [ + "sql", +] } futures = "0.3.30" -tonic = {version="0.11.0", features=["tls", "transport", "codegen", "prost"] } +tonic = { version = "0.11.0", features = [ + "tls", + "transport", + "codegen", + "prost", +] } prost = "0.12.3" arrow = "52.0.0" arrow-flight = { version = "52.2.0", features = ["flight-sql-experimental"] } diff --git a/sources/flight-sql/src/executor/mod.rs b/datafusion-flight-sql-server/src/executor/mod.rs similarity index 98% rename from sources/flight-sql/src/executor/mod.rs rename to datafusion-flight-sql-server/src/executor/mod.rs index a5c5a38..930939d 100644 --- a/sources/flight-sql/src/executor/mod.rs +++ 
b/datafusion-flight-sql-server/src/executor/mod.rs @@ -1,3 +1,5 @@ +use std::sync::Arc; + use arrow::{datatypes::SchemaRef, error::ArrowError}; use arrow_flight::sql::client::FlightSqlServiceClient; use async_trait::async_trait; @@ -6,10 +8,8 @@ use datafusion::{ physical_plan::{stream::RecordBatchStreamAdapter, SendableRecordBatchStream}, sql::sqlparser::dialect::{Dialect, GenericDialect}, }; -use datafusion_federation_sql::SQLExecutor; +use datafusion_federation::sql::SQLExecutor; use futures::TryStreamExt; - -use std::sync::Arc; use tonic::transport::Channel; pub struct FlightSQLExecutor { diff --git a/sources/flight-sql/src/lib.rs b/datafusion-flight-sql-server/src/lib.rs similarity index 100% rename from sources/flight-sql/src/lib.rs rename to datafusion-flight-sql-server/src/lib.rs diff --git a/sources/flight-sql/src/server/mod.rs b/datafusion-flight-sql-server/src/server/mod.rs similarity index 100% rename from sources/flight-sql/src/server/mod.rs rename to datafusion-flight-sql-server/src/server/mod.rs diff --git a/sources/flight-sql/src/server/service.rs b/datafusion-flight-sql-server/src/server/service.rs similarity index 100% rename from sources/flight-sql/src/server/service.rs rename to datafusion-flight-sql-server/src/server/service.rs diff --git a/sources/flight-sql/src/server/session.rs b/datafusion-flight-sql-server/src/server/session.rs similarity index 100% rename from sources/flight-sql/src/server/session.rs rename to datafusion-flight-sql-server/src/server/session.rs diff --git a/sources/flight-sql/src/server/state.rs b/datafusion-flight-sql-server/src/server/state.rs similarity index 100% rename from sources/flight-sql/src/server/state.rs rename to datafusion-flight-sql-server/src/server/state.rs diff --git a/sources/sql/Cargo.toml b/sources/sql/Cargo.toml deleted file mode 100644 index 31d8218..0000000 --- a/sources/sql/Cargo.toml +++ /dev/null @@ -1,20 +0,0 @@ -[package] -name = "datafusion-federation-sql" -version.workspace = true 
-edition.workspace = true -license.workspace = true - -[lib] -name = "datafusion_federation_sql" -path = "src/lib.rs" - -[dependencies] -async-trait.workspace = true -datafusion.workspace = true - -# XXX use the release verion on crates.io -datafusion-federation.path = "../../datafusion-federation" - -connectorx = { version = "0.3.3" , features = ["dst_arrow", "src_sqlite"] } -futures = "0.3.30" -tokio = "1.39" diff --git a/sources/sql/README.md b/sources/sql/README.md deleted file mode 100644 index def0668..0000000 --- a/sources/sql/README.md +++ /dev/null @@ -1,4 +0,0 @@ - - -This will be move to -[datafusion-table-providers](https://github.com/datafusion-contrib/datafusion-table-providers) repository From d9449e5ea0cc13afdf71118216a2141188566bec Mon Sep 17 00:00:00 2001 From: hozan23 Date: Fri, 23 Aug 2024 10:59:20 +0200 Subject: [PATCH 09/31] "create datafusion-flight-sql-table-provider crate & move the executor from datafusion-flight-sql-server" --- Cargo.toml | 1 + README.md | 3 +++ datafusion-flight-sql-server/Cargo.toml | 5 +++- .../examples/flight-sql.rs | 13 ++++------ .../examples/test.csv | 4 ++++ datafusion-flight-sql-server/src/lib.rs | 5 ++-- .../src/server/mod.rs | 6 ----- .../src/{server => }/service.rs | 5 ++-- .../src/{server => }/session.rs | 0 .../src/{server => }/state.rs | 0 .../Cargo.toml | 24 +++++++++++++++++++ .../src/lib.rs | 0 12 files changed, 46 insertions(+), 20 deletions(-) rename {examples => datafusion-flight-sql-server}/examples/flight-sql.rs (88%) create mode 100644 datafusion-flight-sql-server/examples/test.csv delete mode 100644 datafusion-flight-sql-server/src/server/mod.rs rename datafusion-flight-sql-server/src/{server => }/service.rs (99%) rename datafusion-flight-sql-server/src/{server => }/session.rs (100%) rename datafusion-flight-sql-server/src/{server => }/state.rs (100%) create mode 100644 datafusion-flight-sql-table-provider/Cargo.toml rename datafusion-flight-sql-server/src/executor/mod.rs => 
datafusion-flight-sql-table-provider/src/lib.rs (100%) diff --git a/Cargo.toml b/Cargo.toml index 53ee52c..83ca62e 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -4,6 +4,7 @@ resolver = "2" members = [ "datafusion-federation", "datafusion-flight-sql-server", + "datafusion-flight-sql-table-provider", ] [workspace.package] diff --git a/README.md b/README.md index 028db87..ed7fdef 100644 --- a/README.md +++ b/README.md @@ -5,6 +5,9 @@ The goal of this repo is to allow [DataFusion](https://github.com/apache/arrow-datafusion) to resolve queries across remote query engines while pushing down as much compute as possible down. + +> :warning: **All the examples are deprecated for now** + Check out [the examples](./examples/) to get a feel for how it works. Potential use-cases: diff --git a/datafusion-flight-sql-server/Cargo.toml b/datafusion-flight-sql-server/Cargo.toml index 8e466e2..ede6979 100644 --- a/datafusion-flight-sql-server/Cargo.toml +++ b/datafusion-flight-sql-server/Cargo.toml @@ -9,7 +9,6 @@ name = "datafusion_flight_sql_server" path = "src/lib.rs" [dependencies] -async-trait.workspace = true datafusion.workspace = true datafusion-substrait.workspace = true # XXX use the release verion on crates.io @@ -28,3 +27,7 @@ prost = "0.12.3" arrow = "52.0.0" arrow-flight = { version = "52.2.0", features = ["flight-sql-experimental"] } log = "0.4.22" + +[dev-dependencies] +tokio = { version = "1.39.3", features = ["full"] } +datafusion-flight-sql-table-provider = { path = "../datafusion-flight-sql-table-provider" } diff --git a/examples/examples/flight-sql.rs b/datafusion-flight-sql-server/examples/flight-sql.rs similarity index 88% rename from examples/examples/flight-sql.rs rename to datafusion-flight-sql-server/examples/flight-sql.rs index e0899cb..021227a 100644 --- a/examples/examples/flight-sql.rs +++ b/datafusion-flight-sql-server/examples/flight-sql.rs @@ -2,15 +2,16 @@ use std::{sync::Arc, time::Duration}; use arrow_flight::sql::client::FlightSqlServiceClient; use 
datafusion::{ - catalog::schema::SchemaProvider, + catalog::SchemaProvider, error::{DataFusionError, Result}, execution::{ context::{SessionContext, SessionState}, options::CsvReadOptions, }, }; -use datafusion_federation_flight_sql::{executor::FlightSQLExecutor, server::FlightSqlService}; -use datafusion_federation_sql::{SQLFederationProvider, SQLSchemaProvider}; +use datafusion_federation::sql::{SQLFederationProvider, SQLSchemaProvider}; +use datafusion_flight_sql_server::service::FlightSqlService; +use datafusion_flight_sql_table_provider::FlightSQLExecutor; use tokio::time::sleep; use tonic::transport::Endpoint; @@ -19,11 +20,7 @@ async fn main() -> Result<()> { let dsn: String = "0.0.0.0:50051".to_string(); let remote_ctx = SessionContext::new(); remote_ctx - .register_csv( - "test", - "./examples/examples/test.csv", - CsvReadOptions::new(), - ) + .register_csv("test", "./examples/test.csv", CsvReadOptions::new()) .await?; // Remote context diff --git a/datafusion-flight-sql-server/examples/test.csv b/datafusion-flight-sql-server/examples/test.csv new file mode 100644 index 0000000..811d276 --- /dev/null +++ b/datafusion-flight-sql-server/examples/test.csv @@ -0,0 +1,4 @@ +foo,bar +a,1 +b,2 +c,3 \ No newline at end of file diff --git a/datafusion-flight-sql-server/src/lib.rs b/datafusion-flight-sql-server/src/lib.rs index a8795c9..f7f6eac 100644 --- a/datafusion-flight-sql-server/src/lib.rs +++ b/datafusion-flight-sql-server/src/lib.rs @@ -1,2 +1,3 @@ -pub mod executor; -pub mod server; +pub mod service; +pub mod session; +pub mod state; diff --git a/datafusion-flight-sql-server/src/server/mod.rs b/datafusion-flight-sql-server/src/server/mod.rs deleted file mode 100644 index 0c054b9..0000000 --- a/datafusion-flight-sql-server/src/server/mod.rs +++ /dev/null @@ -1,6 +0,0 @@ -mod service; -pub use service::*; -mod state; -pub use state::*; -mod session; -pub use session::*; diff --git a/datafusion-flight-sql-server/src/server/service.rs 
b/datafusion-flight-sql-server/src/service.rs similarity index 99% rename from datafusion-flight-sql-server/src/server/service.rs rename to datafusion-flight-sql-server/src/service.rs index afa4a5f..8a85132 100644 --- a/datafusion-flight-sql-server/src/server/service.rs +++ b/datafusion-flight-sql-server/src/service.rs @@ -1,5 +1,4 @@ -use std::pin::Pin; -use std::sync::Arc; +use std::{pin::Pin, sync::Arc}; use arrow::{datatypes::SchemaRef, error::ArrowError, ipc::writer::IpcWriteOptions}; use arrow_flight::sql::{ @@ -40,8 +39,8 @@ use prost::bytes::Bytes; use tonic::transport::Server; use tonic::{Request, Response, Status, Streaming}; +use super::session::{SessionStateProvider, StaticSessionStateProvider}; use super::state::{CommandTicket, QueryHandle}; -use super::{SessionStateProvider, StaticSessionStateProvider}; type Result = std::result::Result; diff --git a/datafusion-flight-sql-server/src/server/session.rs b/datafusion-flight-sql-server/src/session.rs similarity index 100% rename from datafusion-flight-sql-server/src/server/session.rs rename to datafusion-flight-sql-server/src/session.rs diff --git a/datafusion-flight-sql-server/src/server/state.rs b/datafusion-flight-sql-server/src/state.rs similarity index 100% rename from datafusion-flight-sql-server/src/server/state.rs rename to datafusion-flight-sql-server/src/state.rs diff --git a/datafusion-flight-sql-table-provider/Cargo.toml b/datafusion-flight-sql-table-provider/Cargo.toml new file mode 100644 index 0000000..4302133 --- /dev/null +++ b/datafusion-flight-sql-table-provider/Cargo.toml @@ -0,0 +1,24 @@ +[package] +name = "datafusion-flight-sql-table-provider" +version.workspace = true +edition.workspace = true +license.workspace = true +readme.workspace = true + +[dependencies] +async-trait.workspace = true +datafusion.workspace = true +# XXX use the release verion on crates.io +datafusion-federation = { path = "../datafusion-federation", features = [ + "sql", +] } + +futures = "0.3.30" +tonic = { 
version = "0.11.0", features = [ + "tls", + "transport", + "codegen", + "prost", +] } +arrow = "52.0.0" +arrow-flight = { version = "52.2.0", features = ["flight-sql-experimental"] } diff --git a/datafusion-flight-sql-server/src/executor/mod.rs b/datafusion-flight-sql-table-provider/src/lib.rs similarity index 100% rename from datafusion-flight-sql-server/src/executor/mod.rs rename to datafusion-flight-sql-table-provider/src/lib.rs From ac84a5023b0cf71ec1be5675e2adcf027085fe82 Mon Sep 17 00:00:00 2001 From: hozan23 Date: Fri, 23 Aug 2024 11:27:17 +0200 Subject: [PATCH 10/31] override supports_rewrite for FederationOptimizerRule and SQLFederationOptimizerRule --- datafusion-federation/src/optimizer.rs | 6 ++++++ datafusion-federation/src/sql/mod.rs | 6 ++++++ 2 files changed, 12 insertions(+) diff --git a/datafusion-federation/src/optimizer.rs b/datafusion-federation/src/optimizer.rs index d0fc24b..8f4cf04 100644 --- a/datafusion-federation/src/optimizer.rs +++ b/datafusion-federation/src/optimizer.rs @@ -35,6 +35,12 @@ impl OptimizerRule for FederationOptimizerRule { fn name(&self) -> &str { "federation_optimizer_rule" } + + /// XXX + /// Does this rule support rewriting owned plans (rather than by reference)? + fn supports_rewrite(&self) -> bool { + false + } } enum ScanResult { diff --git a/datafusion-federation/src/sql/mod.rs b/datafusion-federation/src/sql/mod.rs index e19950d..e68f8fa 100644 --- a/datafusion-federation/src/sql/mod.rs +++ b/datafusion-federation/src/sql/mod.rs @@ -93,6 +93,12 @@ impl OptimizerRule for SQLFederationOptimizerRule { fn name(&self) -> &str { "federate_sql" } + + /// XXX + /// Does this rule support rewriting owned plans (rather than by reference)? 
+ fn supports_rewrite(&self) -> bool { + false + } } struct SQLFederationPlanner { executor: Arc, From 5d240d7584fc0ee7f1f305e5d732abc9c49c586e Mon Sep 17 00:00:00 2001 From: Michiel De Backker Date: Fri, 23 Aug 2024 14:56:37 +0200 Subject: [PATCH 11/31] Improve project overview --- README.md | 79 ++++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 78 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index ed7fdef..9d39ffa 100644 --- a/README.md +++ b/README.md @@ -3,8 +3,20 @@ [![crates.io](https://img.shields.io/crates/v/datafusion-federation.svg)](https://crates.io/crates/datafusion-federation) [![docs.rs](https://docs.rs/datafusion-federation/badge.svg)](https://docs.rs/datafusion-federation) -The goal of this repo is to allow [DataFusion](https://github.com/apache/arrow-datafusion) to resolve queries across remote query engines while pushing down as much compute as possible down. +DataFusion Federation allows [DataFusion](https://github.com/apache/arrow-datafusion) to execute (part of) a query plan by a remote execution engine. + ┌────────────────┐ + ┌────────────┐ │ Remote DBMS(s) │ + SQL Query ───> │ DataFusion │ ───> │ ( execution │ + └────────────┘ │ happens here ) │ + └────────────────┘ + +The goal is to allow resolving queries across remote query engines while pushing down as much compute as possible to the remote database(s). This allows execution to happen as close to the storage as possible. This concept is referred to as 'query federation'. + +> [!TIP] +> This repository implements the federation framework itself. If you want to connect to a specific database, check out the compatible providers available in [datafusion-contrib/datafusion-table-providers](https://github.com/datafusion-contrib/datafusion-table-providers/). + +#### Usage > :warning: **All the examples are deprecated for now** @@ -17,6 +29,71 @@ Potential use-cases: - DataFusion -> Flight SQL -> DataFusion - .. 
+#### Design concept + +Say you have a query plan as follows: + + ┌────────────┐ + │ Join │ + └────────────┘ + ▲ + ┌───────┴────────┐ + ┌────────────┐ ┌────────────┐ + │ Scan A │ │ Join │ + └────────────┘ └────────────┘ + ▲ + ┌───────┴────────┐ + ┌────────────┐ ┌────────────┐ + │ Scan B │ │ Scan C │ + └────────────┘ └────────────┘ + +DataFusion Federation will identify the largest possible sub-plans that +can be executed by an external database: + + ┌────────────┐ Optimizer pass + │ Join │ recognizes B and C + └────────────┘ are available in an + ▲ external database + ┌──────────────┴────────┐ + │ ┌ ─ ─ ─ ─ ─ ─ ┴ ─ ── ─ ─ ─ ─ ─┐ + ┌────────────┐ ┌────────────┐ │ + │ Scan A │ │ │ Join │ + └────────────┘ └────────────┘ │ + │ ▲ + ┌───────┴────────┐ │ + ┌────────────┐ ┌────────────┐ │ + ││ Scan B │ │ Scan C │ + └────────────┘ └────────────┘ │ + ─ ── ─ ─ ── ─ ─ ─ ─ ─ ─ ─ ── ─ ┘ + +The sub-plans are cut out and replaced by an opaque federation node in the plan: + + ┌────────────┐ + │ Join │ + └────────────┘ Rewritten Plan + ▲ + ┌────────┴───────────┐ + │ │ + ┌────────────┐ ┏━━━━━━━━━━━━━━━━━━┓ + │ Scan A │ ┃ Scan B+C ┃ + └────────────┘ ┃ (TableProvider ┃ + ┃ that can execute ┃ + ┃ sub-plan in an ┃ + ┃external database)┃ + ┗━━━━━━━━━━━━━━━━━━┛ + +Different databases may have different query languages and execution capabilities. To accommodate for this, we allow each 'federation provider' to self-determine what part of a sub-plan it will actually federate. This is done by letting each federation provider define its own optimizer rule. When a sub-plan is 'cut out' of the overall plan, it is first passed the federation provider's optimizer rule. This optimizer rule determines the part of the plan that is cut out, based based on the execution capabilities of the database it represents. + +#### Implementation + +A remote database is represented by the `FederationProvider` trait. 
To identify table scans that are available in the same database, they implement `FederatedTableSource` trait. This trait allows lookup of the corresponding `FederationProvider`. + +Identifying sub-plans to federate is done by the `FederationOptimizerRule`. This rule needs to be registered in your DataFusion SessionState. One easy way to do this is using `default_session_state`. To do its job, the `FederationOptimizerRule` currently requires that all TableProviders that need to be federated are `FederatedTableProviderAdaptor`s. The `FederatedTableProviderAdaptor` also has a fallback mechanism that allows implementations to fallback to a 'vanilla' TableProvider in case the `FederationOptimizerRule` isn't registered. + +The `FederationProvider` can provide a `compute_context`. This allows it to differentiate between multiple remote execution context of the same type. For example two different mysql instances, database schemas, access level, etc. The `FederationProvider` also returns the `Optimizer` that is allows it to self-determine what part of a sub-plan it can federate. + +The `sql` module implements a generic `FederationProvider` for SQL execution engines. A specific SQL engine implements the `SQLExecutor` trait for its engine specific execution. There are a number of compatible providers available in [datafusion-contrib/datafusion-table-providers](https://github.com/datafusion-contrib/datafusion-table-providers/). + #### Status The project is in alpha status. Contributions welcome; land a PR = commit access. 
From f4babeeac31d23f8316195e6132fa39ab6e01683 Mon Sep 17 00:00:00 2001 From: hozan23 <119854621+hozan23@users.noreply.github.com> Date: Sat, 24 Aug 2024 21:06:53 +0200 Subject: [PATCH 12/31] update README.md (#48) --- README.md | 84 ++++++++++++++++++++++++++++++++++++++----------------- 1 file changed, 59 insertions(+), 25 deletions(-) diff --git a/README.md b/README.md index 9d39ffa..0553c83 100644 --- a/README.md +++ b/README.md @@ -1,9 +1,11 @@ -## DataFusion Federation +# DataFusion Federation [![crates.io](https://img.shields.io/crates/v/datafusion-federation.svg)](https://crates.io/crates/datafusion-federation) [![docs.rs](https://docs.rs/datafusion-federation/badge.svg)](https://docs.rs/datafusion-federation) -DataFusion Federation allows [DataFusion](https://github.com/apache/arrow-datafusion) to execute (part of) a query plan by a remote execution engine. +DataFusion Federation allows +[DataFusion](https://github.com/apache/arrow-datafusion) to execute (part of) a +query plan by a remote execution engine. ┌────────────────┐ ┌────────────┐ │ Remote DBMS(s) │ @@ -11,25 +13,31 @@ DataFusion Federation allows [DataFusion](https://github.com/apache/arrow-datafu └────────────┘ │ happens here ) │ └────────────────┘ -The goal is to allow resolving queries across remote query engines while pushing down as much compute as possible to the remote database(s). This allows execution to happen as close to the storage as possible. This concept is referred to as 'query federation'. +The goal is to allow resolving queries across remote query engines while +pushing down as much compute as possible to the remote database(s). This allows +execution to happen as close to the storage as possible. This concept is +referred to as 'query federation'. > [!TIP] -> This repository implements the federation framework itself. 
If you want to connect to a specific database, check out the compatible providers available in [datafusion-contrib/datafusion-table-providers](https://github.com/datafusion-contrib/datafusion-table-providers/). +> This repository implements the federation framework itself. If you want to +> connect to a specific database, check out the compatible providers available +> in +> [datafusion-contrib/datafusion-table-providers](https://github.com/datafusion-contrib/datafusion-table-providers/). -#### Usage +## Usage > :warning: **All the examples are deprecated for now** Check out [the examples](./examples/) to get a feel for how it works. -Potential use-cases: +## Potential use-cases: - Querying across SQLite, MySQL, PostgreSQL, ... - Pushing down SQL or [Substrait](https://substrait.io/) plans. - DataFusion -> Flight SQL -> DataFusion - .. -#### Design concept +## Design concept Say you have a query plan as follows: @@ -50,9 +58,9 @@ Say you have a query plan as follows: DataFusion Federation will identify the largest possible sub-plans that can be executed by an external database: - ┌────────────┐ Optimizer pass - │ Join │ recognizes B and C - └────────────┘ are available in an + ┌────────────┐ Optimizer recognizes + │ Join │ that B and C are + └────────────┘ available in an ▲ external database ┌──────────────┴────────┐ │ ┌ ─ ─ ─ ─ ─ ─ ┴ ─ ── ─ ─ ─ ─ ─┐ @@ -82,21 +90,47 @@ The sub-plans are cut out and replaced by an opaque federation node in the plan: ┃external database)┃ ┗━━━━━━━━━━━━━━━━━━┛ -Different databases may have different query languages and execution capabilities. To accommodate for this, we allow each 'federation provider' to self-determine what part of a sub-plan it will actually federate. This is done by letting each federation provider define its own optimizer rule. When a sub-plan is 'cut out' of the overall plan, it is first passed the federation provider's optimizer rule. 
This optimizer rule determines the part of the plan that is cut out, based based on the execution capabilities of the database it represents. - -#### Implementation - -A remote database is represented by the `FederationProvider` trait. To identify table scans that are available in the same database, they implement `FederatedTableSource` trait. This trait allows lookup of the corresponding `FederationProvider`. - -Identifying sub-plans to federate is done by the `FederationOptimizerRule`. This rule needs to be registered in your DataFusion SessionState. One easy way to do this is using `default_session_state`. To do its job, the `FederationOptimizerRule` currently requires that all TableProviders that need to be federated are `FederatedTableProviderAdaptor`s. The `FederatedTableProviderAdaptor` also has a fallback mechanism that allows implementations to fallback to a 'vanilla' TableProvider in case the `FederationOptimizerRule` isn't registered. - -The `FederationProvider` can provide a `compute_context`. This allows it to differentiate between multiple remote execution context of the same type. For example two different mysql instances, database schemas, access level, etc. The `FederationProvider` also returns the `Optimizer` that is allows it to self-determine what part of a sub-plan it can federate. - -The `sql` module implements a generic `FederationProvider` for SQL execution engines. A specific SQL engine implements the `SQLExecutor` trait for its engine specific execution. There are a number of compatible providers available in [datafusion-contrib/datafusion-table-providers](https://github.com/datafusion-contrib/datafusion-table-providers/). - -#### Status - -The project is in alpha status. Contributions welcome; land a PR = commit access. +Different databases may have different query languages and execution +capabilities. To accommodate for this, we allow each 'federation provider' to +self-determine what part of a sub-plan it will actually federate. 
This is done +by letting each federation provider define its own optimizer rule. When a +sub-plan is 'cut out' of the overall plan, it is first passed the federation +provider's optimizer rule. This optimizer rule determines the part of the plan +that is cut out, based on the execution capabilities of the database it +represents. + +## Implementation + +A remote database is represented by the `FederationProvider` trait. To identify +table scans that are available in the same database, they implement +`FederatedTableSource` trait. This trait allows lookup of the corresponding +`FederationProvider`. + +Identifying sub-plans to federate is done by the `FederationOptimizerRule`. +This rule needs to be registered in your DataFusion SessionState. One easy way +to do this is using `default_session_state`. To do its job, the +`FederationOptimizerRule` currently requires that all TableProviders that need +to be federated are `FederatedTableProviderAdaptor`s. The +`FederatedTableProviderAdaptor` also has a fallback mechanism that allows +implementations to fallback to a 'vanilla' TableProvider in case the +`FederationOptimizerRule` isn't registered. + +The `FederationProvider` can provide a `compute_context`. This allows it to +differentiate between multiple remote execution context of the same type. For +example two different mysql instances, database schemas, access level, etc. The +`FederationProvider` also returns the `Optimizer` that is allows it to +self-determine what part of a sub-plan it can federate. + +The `sql` module implements a generic `FederationProvider` for SQL execution +engines. A specific SQL engine implements the `SQLExecutor` trait for its +engine specific execution. There are a number of compatible providers available +in +[datafusion-contrib/datafusion-table-providers](https://github.com/datafusion-contrib/datafusion-table-providers/). + +## Status + +The project is in alpha status. Contributions welcome; land a PR = commit +access. 
- [Docs (release)](https://docs.rs/datafusion-federation) - [Docs (main)](https://datafusion-contrib.github.io/datafusion-federation/) From 48dbea52e626d0ac316a747d07b75cf4d29c989d Mon Sep 17 00:00:00 2001 From: hozan23 <119854621+hozan23@users.noreply.github.com> Date: Mon, 26 Aug 2024 09:27:43 +0200 Subject: [PATCH 13/31] datafusion-fedeartion: Remove assert_eq macros and handle errors properly (#49) --- datafusion-federation/src/plan_node.rs | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/datafusion-federation/src/plan_node.rs b/datafusion-federation/src/plan_node.rs index c81b152..5a8cd00 100644 --- a/datafusion-federation/src/plan_node.rs +++ b/datafusion-federation/src/plan_node.rs @@ -8,7 +8,7 @@ use std::{ use async_trait::async_trait; use datafusion::{ common::DFSchemaRef, - error::Result, + error::{DataFusionError, Result}, execution::context::{QueryPlanner, SessionState}, logical_expr::{Expr, LogicalPlan, UserDefinedLogicalNode, UserDefinedLogicalNodeCore}, physical_plan::ExecutionPlan, @@ -57,17 +57,14 @@ impl UserDefinedLogicalNodeCore for FederatedPlanNode { write!(f, "Federated\n {:?}", self.plan) } - fn from_template(&self, exprs: &[Expr], inputs: &[LogicalPlan]) -> Self { - assert_eq!(inputs.len(), 0, "input size inconsistent"); - assert_eq!(exprs.len(), 0, "expression size inconsistent"); - Self { - plan: self.plan.clone(), - planner: self.planner.clone(), + fn with_exprs_and_inputs(&self, exprs: Vec, inputs: Vec) -> Result { + if !inputs.is_empty() { + return Err(DataFusionError::Plan("input size inconsistent".into())); + } + if !exprs.is_empty() { + return Err(DataFusionError::Plan("expression size inconsistent".into())); } - } - /// XXX should consider something else here ? 
- fn with_exprs_and_inputs(&self, _exprs: Vec, _inputs: Vec) -> Result { Ok(Self { plan: self.plan.clone(), planner: self.planner.clone(), @@ -149,8 +146,11 @@ impl ExtensionPlanner for FederatedPlanner { ) -> Result>> { let dc_node = node.as_any().downcast_ref::(); if let Some(fed_node) = dc_node { - assert_eq!(logical_inputs.len(), 0, "Inconsistent number of inputs"); - assert_eq!(physical_inputs.len(), 0, "Inconsistent number of inputs"); + if !logical_inputs.is_empty() || !physical_inputs.is_empty() { + return Err(DataFusionError::Plan( + "Inconsistent number of inputs".into(), + )); + } let fed_planner = fed_node.planner.clone(); let exec_plan = fed_planner.plan_federation(fed_node, session_state).await?; From c8fa466b011c5b59f440bb08ec80b676737f7bf4 Mon Sep 17 00:00:00 2001 From: hozan23 <119854621+hozan23@users.noreply.github.com> Date: Mon, 26 Aug 2024 17:59:24 +0200 Subject: [PATCH 14/31] Add example in datafusion-federation crate (#50) --- README.md | 5 +- datafusion-federation/Cargo.toml | 18 ++-- datafusion-federation/examples/df-csv.rs | 115 +++++++++++++++++++++++ datafusion-federation/examples/test.csv | 4 + 4 files changed, 133 insertions(+), 9 deletions(-) create mode 100644 datafusion-federation/examples/df-csv.rs create mode 100644 datafusion-federation/examples/test.csv diff --git a/README.md b/README.md index 0553c83..dcfeaa4 100644 --- a/README.md +++ b/README.md @@ -26,9 +26,8 @@ referred to as 'query federation'. ## Usage -> :warning: **All the examples are deprecated for now** - -Check out [the examples](./examples/) to get a feel for how it works. +Check out the [examples](./datafusion-federation/examples/) to get a feel for +how it works. ## Potential use-cases: diff --git a/datafusion-federation/Cargo.toml b/datafusion-federation/Cargo.toml index fa6bd74..6af448c 100644 --- a/datafusion-federation/Cargo.toml +++ b/datafusion-federation/Cargo.toml @@ -10,6 +10,12 @@ description = "Datafusion federation." 
name = "datafusion_federation" path = "src/lib.rs" +[package.metadata.docs.rs] +# Whether to pass `--all-features` to Cargo (default: false) +all-features = true +# Whether to pass `--no-default-features` to Cargo (default: false) +no-default-features = true + [features] sql = ["futures"] @@ -19,10 +25,10 @@ datafusion.workspace = true futures = { version = "0.3.30", optional = true } -[package.metadata.docs.rs] - -# Whether to pass `--all-features` to Cargo (default: false) -all-features = true +[dev-dependencies] +tokio = { version = "1.39.3", features = ["full"] } -# Whether to pass `--no-default-features` to Cargo (default: false) -no-default-features = true +[[example]] +name = "df-csv" +path = "examples/df-csv.rs" +required-features = ["sql"] diff --git a/datafusion-federation/examples/df-csv.rs b/datafusion-federation/examples/df-csv.rs new file mode 100644 index 0000000..24ac495 --- /dev/null +++ b/datafusion-federation/examples/df-csv.rs @@ -0,0 +1,115 @@ +use std::sync::Arc; + +use async_trait::async_trait; +use datafusion::{ + arrow::datatypes::SchemaRef, + catalog::SchemaProvider, + error::{DataFusionError, Result}, + execution::{ + context::{SessionContext, SessionState}, + options::CsvReadOptions, + }, + physical_plan::{stream::RecordBatchStreamAdapter, SendableRecordBatchStream}, + sql::sqlparser::dialect::{Dialect, GenericDialect}, +}; +use datafusion_federation::sql::{SQLExecutor, SQLFederationProvider, SQLSchemaProvider}; +use futures::TryStreamExt; + +const CSV_PATH: &str = "./examples/test.csv"; +const TABLE_NAME: &str = "test"; + +#[tokio::main] +async fn main() -> Result<()> { + // Create a remote context + let remote_ctx = Arc::new(SessionContext::new()); + + // Registers a CSV file + remote_ctx + .register_csv(TABLE_NAME, CSV_PATH, CsvReadOptions::new()) + .await?; + let known_tables: Vec = [TABLE_NAME].iter().map(|&x| x.into()).collect(); + + // Register schema + let executor = Arc::new(InMemorySQLExecutor::new(remote_ctx)); + let provider 
= Arc::new(SQLFederationProvider::new(executor)); + let schema_provider = + Arc::new(SQLSchemaProvider::new_with_tables(provider, known_tables).await?); + + // Local context + let state = datafusion_federation::default_session_state(); + overwrite_default_schema(&state, schema_provider)?; + let ctx = SessionContext::new_with_state(state); + + // Run query + let query = r#"SELECT * from test"#; + let df = ctx.sql(query).await?; + + // let explain = df.clone().explain(true, false)?; + // explain.show().await?; + + df.show().await +} + +fn overwrite_default_schema(state: &SessionState, schema: Arc) -> Result<()> { + let options = &state.config().options().catalog; + let catalog = state + .catalog_list() + .catalog(options.default_catalog.as_str()) + .unwrap(); + + catalog.register_schema(options.default_schema.as_str(), schema)?; + + Ok(()) +} + +pub struct InMemorySQLExecutor { + session: Arc, +} + +impl InMemorySQLExecutor { + pub fn new(session: Arc) -> Self { + Self { session } + } +} + +#[async_trait] +impl SQLExecutor for InMemorySQLExecutor { + fn name(&self) -> &str { + "in_memory_sql_executor" + } + + fn compute_context(&self) -> Option { + None + } + + fn execute(&self, sql: &str, schema: SchemaRef) -> Result { + // Execute it using the remote datafusion session context + let future_stream = _execute(self.session.clone(), sql.to_string()); + let stream = futures::stream::once(future_stream).try_flatten(); + Ok(Box::pin(RecordBatchStreamAdapter::new( + schema.clone(), + stream, + ))) + } + + async fn table_names(&self) -> Result> { + Err(DataFusionError::NotImplemented( + "table inference not implemented".to_string(), + )) + } + + async fn get_table_schema(&self, table_name: &str) -> Result { + let sql = format!("select * from {table_name} limit 1"); + let df = self.session.sql(&sql).await?; + let schema = df.schema().as_arrow().clone(); + Ok(Arc::new(schema)) + } + + fn dialect(&self) -> Arc { + Arc::new(GenericDialect {}) + } +} + +async fn _execute(ctx: 
Arc, sql: String) -> Result { + ctx.sql(&sql).await?.execute_stream().await +} diff --git a/datafusion-federation/examples/test.csv b/datafusion-federation/examples/test.csv new file mode 100644 index 0000000..811d276 --- /dev/null +++ b/datafusion-federation/examples/test.csv @@ -0,0 +1,4 @@ +foo,bar +a,1 +b,2 +c,3 \ No newline at end of file From 1033cfffa2cc961fd5c8f8e360717a00237186a3 Mon Sep 17 00:00:00 2001 From: Michiel De Backker Date: Wed, 22 May 2024 08:54:11 +0200 Subject: [PATCH 15/31] feat: initial subquery support (#37) Co-authored-by: Suriya Kandaswamy Co-authored-by: Suriya Kandaswamy --- datafusion-federation/src/lib.rs | 44 ++- datafusion-federation/src/optimizer.rs | 429 +++++++++++++++++++++++++ examples/examples/flight-sql.rs | 9 +- examples/examples/postgres-partial.rs | 9 +- examples/examples/sqlite-partial.rs | 11 +- examples/examples/sqlite-subquery.rs | 90 ++++++ examples/examples/sqlite.rs | 10 +- sources/sql/src/lib.rs | 47 +-- 8 files changed, 588 insertions(+), 61 deletions(-) create mode 100644 datafusion-federation/src/optimizer.rs create mode 100644 examples/examples/sqlite-subquery.rs diff --git a/datafusion-federation/src/lib.rs b/datafusion-federation/src/lib.rs index 41ce145..bdcb03d 100644 --- a/datafusion-federation/src/lib.rs +++ b/datafusion-federation/src/lib.rs @@ -4,10 +4,13 @@ use std::{ sync::Arc, }; -use datafusion::optimizer::analyzer::Analyzer; +use datafusion::{ + execution::context::{SessionContext, SessionState}, + optimizer::{optimizer::Optimizer, OptimizerRule}, +}; -mod analyzer; -pub use analyzer::*; +mod optimizer; +pub use optimizer::*; mod table_provider; pub use table_provider::*; @@ -15,6 +18,37 @@ mod plan_node; pub use plan_node::*; pub mod schema_cast; +pub fn default_session_state() -> SessionState { + let df_state = SessionContext::new().state(); + + let rules = default_optimizer_rules(); + df_state + .with_optimizer_rules(rules) + .with_query_planner(Arc::new(FederatedQueryPlanner::new())) +} + +pub 
fn default_optimizer_rules() -> Vec> { + // Get the default optimizer + let df_default = Optimizer::new(); + let mut default_rules = df_default.rules; + + // Insert the FederationOptimizerRule after the ScalarSubqueryToJoin. + // This ensures ScalarSubquery are replaced before we try to federate. + let Some(pos) = default_rules + .iter() + .position(|x| x.name() == "scalar_subquery_to_join") + else { + panic!("Could not locate ScalarSubqueryToJoin"); + }; + + // TODO: check if we should allow other optimizers to run before the federation rule. + + let federation_rule = Arc::new(FederationOptimizerRule::new()); + default_rules.insert(pos + 1, federation_rule); + + default_rules +} + pub type FederationProviderRef = Arc; pub trait FederationProvider: Send + Sync { // Returns the name of the provider, used for comparison. @@ -24,9 +58,9 @@ pub trait FederationProvider: Send + Sync { // will execute a query. For example: database instance & catalog. fn compute_context(&self) -> Option; - // Returns an analyzer that can cut out part of the plan + // Returns an optimizer that can cut out part of the plan // to federate it. 
- fn analyzer(&self) -> Option>; + fn optimizer(&self) -> Option>; } impl fmt::Display for dyn FederationProvider { diff --git a/datafusion-federation/src/optimizer.rs b/datafusion-federation/src/optimizer.rs new file mode 100644 index 0000000..4afac32 --- /dev/null +++ b/datafusion-federation/src/optimizer.rs @@ -0,0 +1,429 @@ +use std::sync::Arc; + +use datafusion::common::not_impl_err; +use datafusion::common::tree_node::{Transformed, TreeNode, TreeNodeRecursion}; +use datafusion::logical_expr::Extension; +use datafusion::optimizer::optimizer::Optimizer; +use datafusion::optimizer::{OptimizerConfig, OptimizerRule}; +use datafusion::{ + datasource::source_as_provider, + error::Result, + logical_expr::{Expr, LogicalPlan, Projection, TableScan, TableSource}, +}; + +use crate::{ + FederatedTableProviderAdaptor, FederatedTableSource, FederationProvider, FederationProviderRef, +}; + +#[derive(Default)] +pub struct FederationOptimizerRule {} + +impl OptimizerRule for FederationOptimizerRule { + // Walk over the plan, look for the largest subtrees that only have + // TableScans from the same FederationProvider. + // These 'largest sub-trees' are passed to their respective FederationProvider.optimizer.
+ fn try_optimize( + &self, + plan: &LogicalPlan, + config: &dyn OptimizerConfig, + ) -> Result> { + let (optimized, _) = self.optimize_plan_recursively(plan, true, config)?; + Ok(optimized) + } + + /// A human readable name for this optimizer rule + fn name(&self) -> &str { + "federation_optimizer_rule" + } +} + +enum ScanResult { + None, + Distinct(FederationProviderRef), + Ambiguous, +} + +impl ScanResult { + fn merge(&mut self, other: Self) { + match (&self, &other) { + (_, ScanResult::None) => {} + (ScanResult::None, _) => *self = other, + (ScanResult::Ambiguous, _) | (_, ScanResult::Ambiguous) => { + *self = ScanResult::Ambiguous + } + (ScanResult::Distinct(provider), ScanResult::Distinct(other_provider)) => { + if provider != other_provider { + *self = ScanResult::Ambiguous + } + } + } + } + fn add(&mut self, provider: Option) { + self.merge(ScanResult::from(provider)) + } + fn is_ambiguous(&self) -> bool { + matches!(self, ScanResult::Ambiguous) + } + fn is_none(&self) -> bool { + matches!(self, ScanResult::None) + } + fn is_some(&self) -> bool { + !self.is_none() + } + fn unwrap(self) -> Option { + match self { + ScanResult::None => None, + ScanResult::Distinct(provider) => Some(provider), + ScanResult::Ambiguous => panic!("called `ScanResult::unwrap()` on a `Ambiguous` value"), + } + } + fn check_recursion(&self) -> TreeNodeRecursion { + if self.is_ambiguous() { + TreeNodeRecursion::Stop + } else { + TreeNodeRecursion::Continue + } + } +} + +impl From> for ScanResult { + fn from(provider: Option) -> Self { + match provider { + Some(provider) => ScanResult::Distinct(provider), + None => ScanResult::None, + } + } +} + +impl PartialEq> for ScanResult { + fn eq(&self, other: &Option) -> bool { + match (self, other) { + (ScanResult::None, None) => true, + (ScanResult::Distinct(provider), Some(other_provider)) => provider == other_provider, + _ => false, + } + } +} + +impl Clone for ScanResult { + fn clone(&self) -> Self { + match self { + ScanResult::None => 
ScanResult::None, + ScanResult::Distinct(provider) => ScanResult::Distinct(provider.clone()), + ScanResult::Ambiguous => ScanResult::Ambiguous, + } + } +} + +impl FederationOptimizerRule { + pub fn new() -> Self { + Self::default() + } + + // scans a plan to see if it belongs to a single FederationProvider + fn scan_plan_recursively(&self, plan: &LogicalPlan) -> Result { + let mut sole_provider: ScanResult = ScanResult::None; + + plan.apply(&mut |p: &LogicalPlan| -> Result { + let exprs_provider = self.scan_plan_exprs(p)?; + sole_provider.merge(exprs_provider); + + if sole_provider.is_ambiguous() { + return Ok(TreeNodeRecursion::Stop); + } + + let sub_provider = get_leaf_provider(p)?; + sole_provider.add(sub_provider); + + Ok(sole_provider.check_recursion()) + })?; + + Ok(sole_provider) + } + + // scans a plan's expressions to see if it belongs to a single FederationProvider + fn scan_plan_exprs(&self, plan: &LogicalPlan) -> Result { + let mut sole_provider: ScanResult = ScanResult::None; + + let exprs = plan.expressions(); + for expr in &exprs { + let expr_result = self.scan_expr_recursively(expr)?; + sole_provider.merge(expr_result); + + if sole_provider.is_ambiguous() { + return Ok(sole_provider); + } + } + + Ok(sole_provider) + } + + // scans an expression to see if it belongs to a single FederationProvider + fn scan_expr_recursively(&self, expr: &Expr) -> Result { + let mut sole_provider: ScanResult = ScanResult::None; + + expr.apply(&mut |e: &Expr| -> Result { + // TODO: Support other types of sub-queries + match e { + Expr::ScalarSubquery(ref subquery) => { + let plan_result = self.scan_plan_recursively(&subquery.subquery)?; + + sole_provider.merge(plan_result); + Ok(sole_provider.check_recursion()) + } + Expr::InSubquery(_) => not_impl_err!("InSubquery"), + Expr::OuterReferenceColumn(..) => { + // Subqueries that reference outer columns are not supported + // for now. We handle this here as ambiguity to force + // federation lower in the plan tree. 
+ sole_provider = ScanResult::Ambiguous; + Ok(TreeNodeRecursion::Stop) + } + _ => Ok(TreeNodeRecursion::Continue), + } + })?; + + Ok(sole_provider) + } + + // optimize_recursively recursively finds the largest sub-plans that can be federated + // to a single FederationProvider. + // Returns a plan if a sub-tree was federated, otherwise None. + // Returns a ScanResult of all FederationProviders in the subtree. + fn optimize_plan_recursively( + &self, + plan: &LogicalPlan, + is_root: bool, + _config: &dyn OptimizerConfig, + ) -> Result<(Option, ScanResult)> { + // Used to track if all sources, including tableScan, plan inputs and + // expressions, represents an un-ambiguous or 'sole' FederationProvider + let mut sole_provider: ScanResult = ScanResult::None; + + if let LogicalPlan::Extension(Extension { ref node }) = plan { + if node.name() == "Federated" { + // Avoid attempting double federation + return Ok((None, ScanResult::Ambiguous)); + } + } + + // Check if this plan node is a leaf that determines the FederationProvider + let leaf_provider = get_leaf_provider(plan)?; + + // Check if the expressions contain, a potentially different, FederationProvider + let exprs_result = self.scan_plan_exprs(plan)?; + let optimize_expressions = exprs_result.is_some(); + + // Return early if this is a leaf and there is no ambiguity with the expressions. + if leaf_provider.is_some() && (exprs_result.is_none() || exprs_result == leaf_provider) { + return Ok((None, leaf_provider.into())); + } + // Aggregate leaf & expression providers + sole_provider.add(leaf_provider); + sole_provider.merge(exprs_result); + + let inputs = plan.inputs(); + // Return early if there are no sources. 
+ if inputs.is_empty() && sole_provider.is_none() { + return Ok((None, ScanResult::None)); + } + + // Recursively optimize inputs + let input_results = inputs + .iter() + .map(|i| self.optimize_plan_recursively(i, false, _config)) + .collect::>>()?; + + // Aggregate the input providers + input_results.iter().for_each(|(_, scan_result)| { + sole_provider.merge(scan_result.clone()); + }); + + if sole_provider.is_none() { + // No providers found + // TODO: Is/should this be reachable? + return Ok((None, ScanResult::None)); + } + + // If all sources are federated to the same provider + if let ScanResult::Distinct(provider) = sole_provider { + if !is_root { + // The largest sub-plan is higher up. + return Ok((None, ScanResult::Distinct(provider))); + } + + let Some(optimizer) = provider.optimizer() else { + // No optimizer provided + return Ok((None, ScanResult::None)); + }; + + // If this is the root plan node; federate the entire plan + let optimized = optimizer.optimize(plan, _config, |_, _| {})?; + return Ok((Some(optimized), ScanResult::None)); + } + + // The plan is ambiguous; any input that is not yet optimized and has a + // sole provider represents a largest sub-plan and should be federated. + // + // We loop over the input optimization results, federate where needed and + // return a complete list of new inputs for the optimized plan. + let new_inputs = input_results + .into_iter() + .enumerate() + .map(|(i, (input_plan, input_result))| { + if let Some(federated_plan) = input_plan { + // Already federated deeper in the plan tree + return Ok(federated_plan); + } + + let original_input = (*inputs.get(i).unwrap()).clone(); + if input_result.is_ambiguous() { + // Can happen if the input is already federated, so use + // the original input. + return Ok(original_input); + } + + let provider = input_result.unwrap(); + let Some(provider) = provider else { + // No provider for this input; use the original input. 
+ return Ok(original_input); + }; + + let Some(optimizer) = provider.optimizer() else { + // No optimizer for this input; use the original input. + return Ok(original_input); + }; + + // Replace the input with the federated counterpart + let wrapped = wrap_projection(original_input)?; + let optimized = optimizer.optimize(&wrapped, _config, |_, _| {})?; + + Ok(optimized) + }) + .collect::>>()?; + + // Optimize expressions if needed + let new_expressions = if optimize_expressions { + self.optimize_plan_exprs(plan, _config)? + } else { + plan.expressions() + }; + + // Construct the optimized plan + let new_plan = plan.with_new_exprs(new_expressions, new_inputs)?; + + // Return the federated plan + Ok((Some(new_plan), ScanResult::Ambiguous)) + } + + // Optimize all exprs of a plan + fn optimize_plan_exprs( + &self, + plan: &LogicalPlan, + _config: &dyn OptimizerConfig, + ) -> Result> { + plan.expressions() + .iter() + .map(|expr| { + let transformed = expr + .clone() + .transform(&|e| self.optimize_expr_recursively(e, _config))?; + Ok(transformed.data) + }) + .collect::>>() + } + + // recursively optimize expressions + // Current logic: individually federate every sub-query. + fn optimize_expr_recursively( + &self, + expr: Expr, + _config: &dyn OptimizerConfig, + ) -> Result> { + match expr { + Expr::ScalarSubquery(ref subquery) => { + // Optimize as root to force federating the sub-query + let (new_subquery, _) = + self.optimize_plan_recursively(&subquery.subquery, true, _config)?; + let Some(new_subquery) = new_subquery else { + return Ok(Transformed::no(expr)); + }; + Ok(Transformed::yes(Expr::ScalarSubquery( + subquery.with_plan(new_subquery.into()), + ))) + } + Expr::InSubquery(_) => not_impl_err!("InSubquery"), + _ => Ok(Transformed::no(expr)), + } + } +} + +// NopFederationProvider is used to represent tables that are not federated, but +// are resolved by DataFusion. This simplifies the logic of the optimizer rule. 
+struct NopFederationProvider {} + +impl FederationProvider for NopFederationProvider { + fn name(&self) -> &str { + "nop" + } + + fn compute_context(&self) -> Option { + None + } + + fn optimizer(&self) -> Option> { + None + } +} + +fn get_leaf_provider(plan: &LogicalPlan) -> Result> { + match plan { + LogicalPlan::TableScan(TableScan { ref source, .. }) => { + let Some(federated_source) = get_table_source(source)? else { + // Table is not federated but provided by a standard table provider. + // We use a placeholder federation provider to simplify the logic. + return Ok(Some(Arc::new(NopFederationProvider {}))); + }; + let provider = federated_source.federation_provider(); + Ok(Some(provider)) + } + _ => Ok(None), + } +} + +fn wrap_projection(plan: LogicalPlan) -> Result { + // TODO: minimize requested columns + match plan { + LogicalPlan::Projection(_) => Ok(plan), + _ => { + let expr = plan + .schema() + .fields() + .iter() + .map(|f| Expr::Column(f.qualified_column())) + .collect::>(); + Ok(LogicalPlan::Projection(Projection::try_new( + expr, + Arc::new(plan), + )?)) + } + } +} + +pub fn get_table_source( + source: &Arc, +) -> Result>> { + // Unwrap TableSource + let source = source_as_provider(source)?; + + // Get FederatedTableProviderAdaptor + let Some(wrapper) = source + .as_any() + .downcast_ref::() + else { + return Ok(None); + }; + + // Return original FederatedTableSource + Ok(Some(Arc::clone(&wrapper.source))) +} diff --git a/examples/examples/flight-sql.rs b/examples/examples/flight-sql.rs index 7a32e29..e0899cb 100644 --- a/examples/examples/flight-sql.rs +++ b/examples/examples/flight-sql.rs @@ -9,7 +9,6 @@ use datafusion::{ options::CsvReadOptions, }, }; -use datafusion_federation::{FederatedQueryPlanner, FederationAnalyzerRule}; use datafusion_federation_flight_sql::{executor::FlightSQLExecutor, server::FlightSqlService}; use datafusion_federation_sql::{SQLFederationProvider, SQLSchemaProvider}; use tokio::time::sleep; @@ -39,15 +38,9 @@ async fn 
main() -> Result<()> { sleep(Duration::from_secs(3)).await; // Local context - let state = SessionContext::new().state(); + let state = datafusion_federation::default_session_state(); let known_tables: Vec = ["test"].iter().map(|&x| x.into()).collect(); - // Register FederationAnalyzer - // TODO: Interaction with other analyzers & optimizers. - let state = state - .add_analyzer_rule(Arc::new(FederationAnalyzerRule::new())) - .with_query_planner(Arc::new(FederatedQueryPlanner::new())); - // Register schema // TODO: table inference let dsn: String = "http://localhost:50051".to_string(); diff --git a/examples/examples/postgres-partial.rs b/examples/examples/postgres-partial.rs index 873dd40..fab31a4 100644 --- a/examples/examples/postgres-partial.rs +++ b/examples/examples/postgres-partial.rs @@ -6,19 +6,12 @@ use datafusion::{ error::Result, execution::context::{SessionContext, SessionState}, }; -use datafusion_federation::{FederatedQueryPlanner, FederationAnalyzerRule}; use datafusion_federation_sql::connectorx::CXExecutor; use datafusion_federation_sql::{MultiSchemaProvider, SQLFederationProvider, SQLSchemaProvider}; #[tokio::main] async fn main() -> Result<()> { - let state = SessionContext::new().state(); - // Register FederationAnalyzer - // TODO: Interaction with other analyzers & optimizers. 
- let state = state - .add_analyzer_rule(Arc::new(FederationAnalyzerRule::new())) - .with_query_planner(Arc::new(FederatedQueryPlanner::new())); - + let state = datafusion_federation::default_session_state(); let df = task::spawn_blocking(move || { // Register schema let pg_provider_1 = async_std::task::block_on(create_postgres_provider(vec!["class"], "conn1")).unwrap(); diff --git a/examples/examples/sqlite-partial.rs b/examples/examples/sqlite-partial.rs index 780462b..2b6f5b9 100644 --- a/examples/examples/sqlite-partial.rs +++ b/examples/examples/sqlite-partial.rs @@ -7,24 +7,15 @@ use datafusion::{ error::Result, execution::context::{SessionContext, SessionState}, }; -use datafusion_federation::{FederatedQueryPlanner, FederationAnalyzerRule}; use datafusion_federation_sql::{connectorx::CXExecutor, SQLFederationProvider, SQLSchemaProvider}; #[tokio::main] async fn main() -> Result<()> { - let state = SessionContext::new().state(); - // Register FederationAnalyzer - // TODO: Interaction with other analyzers & optimizers. 
- let state = state - .add_analyzer_rule(Arc::new(FederationAnalyzerRule::new())) - .with_query_planner(Arc::new(FederatedQueryPlanner::new())); - - // Register schema + let state = datafusion_federation::default_session_state(); let provider = MultiSchemaProvider::new(vec![ create_sqlite_provider(vec!["Artist"], "conn1").await?, create_sqlite_provider(vec!["Track", "Album"], "conn2").await?, ]); - overwrite_default_schema(&state, Arc::new(provider))?; // Run query diff --git a/examples/examples/sqlite-subquery.rs b/examples/examples/sqlite-subquery.rs new file mode 100644 index 0000000..8f505bc --- /dev/null +++ b/examples/examples/sqlite-subquery.rs @@ -0,0 +1,90 @@ +use std::{any::Any, sync::Arc}; + +use async_trait::async_trait; +use datafusion::{ + catalog::schema::SchemaProvider, + datasource::TableProvider, + error::Result, + execution::context::{SessionContext, SessionState}, +}; +use datafusion_federation_sql::{connectorx::CXExecutor, SQLFederationProvider, SQLSchemaProvider}; + +#[tokio::main] +async fn main() -> Result<()> { + let state = datafusion_federation::default_session_state(); + let provider = MultiSchemaProvider::new(vec![ + create_sqlite_provider(vec!["Artist"], "conn1").await?, + create_sqlite_provider(vec!["Track", "Album"], "conn2").await?, + ]); + overwrite_default_schema(&state, Arc::new(provider))?; + + // Run query + let ctx = SessionContext::new_with_state(state); + let query = r#"SELECT Name, (SELECT Title FROM Album limit 1) FROM Artist limit 1"#; + let df = ctx.sql(query).await?; + + // let explain = df.clone().explain(true, false)?; + // explain.show().await?; + + df.show().await +} + +async fn create_sqlite_provider( + known_tables: Vec<&str>, + context: &str, +) -> Result> { + let dsn = "sqlite://./examples/examples/chinook.sqlite".to_string(); + let known_tables: Vec = known_tables.iter().map(|&x| x.into()).collect(); + let mut executor = CXExecutor::new(dsn)?; + executor.context(context.to_string()); + let provider = 
Arc::new(SQLFederationProvider::new(Arc::new(executor))); + Ok(Arc::new( + SQLSchemaProvider::new_with_tables(provider, known_tables).await?, + )) +} + +struct MultiSchemaProvider { + children: Vec>, +} + +impl MultiSchemaProvider { + pub fn new(children: Vec>) -> Self { + Self { children } + } +} + +fn overwrite_default_schema(state: &SessionState, schema: Arc) -> Result<()> { + let options = &state.config().options().catalog; + let catalog = state + .catalog_list() + .catalog(options.default_catalog.as_str()) + .unwrap(); + + catalog.register_schema(options.default_schema.as_str(), schema)?; + + Ok(()) +} + +#[async_trait] +impl SchemaProvider for MultiSchemaProvider { + fn as_any(&self) -> &dyn Any { + self + } + + fn table_names(&self) -> Vec { + self.children.iter().flat_map(|p| p.table_names()).collect() + } + + async fn table(&self, name: &str) -> Result>> { + for child in &self.children { + if let Ok(Some(table)) = child.table(name).await { + return Ok(Some(table)); + } + } + Ok(None) + } + + fn table_exist(&self, name: &str) -> bool { + self.children.iter().any(|p| p.table_exist(name)) + } +} diff --git a/examples/examples/sqlite.rs b/examples/examples/sqlite.rs index 74e5371..a43c7da 100644 --- a/examples/examples/sqlite.rs +++ b/examples/examples/sqlite.rs @@ -5,7 +5,6 @@ use datafusion::{ error::Result, execution::context::{SessionContext, SessionState}, }; -use datafusion_federation::{FederatedQueryPlanner, FederationAnalyzerRule}; use datafusion_federation_sql::{connectorx::CXExecutor, SQLFederationProvider, SQLSchemaProvider}; #[tokio::main] @@ -16,13 +15,7 @@ async fn main() -> datafusion::error::Result<()> { .map(|&x| x.into()) .collect(); - let state = SessionContext::new().state(); - - // Register FederationAnalyzer - // TODO: Interaction with other analyzers & optimizers. 
- let state = state - .add_analyzer_rule(Arc::new(FederationAnalyzerRule::new())) - .with_query_planner(Arc::new(FederatedQueryPlanner::new())); + let state = datafusion_federation::default_session_state(); // Register schema // TODO: table inference @@ -43,6 +36,7 @@ async fn main() -> datafusion::error::Result<()> { JOIN Album a ON t.AlbumId = a.AlbumId JOIN Artist ar ON a.ArtistId = ar.ArtistId limit 10"#; + let df = ctx.sql(query).await?; df.show().await?; diff --git a/sources/sql/src/lib.rs b/sources/sql/src/lib.rs index 496bfbf..e3cea89 100644 --- a/sources/sql/src/lib.rs +++ b/sources/sql/src/lib.rs @@ -4,19 +4,10 @@ use std::{any::Any, collections::HashMap, sync::Arc, vec}; use async_trait::async_trait; use datafusion::{ arrow::datatypes::{Schema, SchemaRef}, - common::Column, - config::ConfigOptions, error::Result, execution::{context::SessionState, TaskContext}, - logical_expr::{ - expr::{ - AggregateFunction, Alias, Exists, InList, InSubquery, ScalarFunction, Sort, Unnest, - WindowFunction, - }, - Between, BinaryExpr, Case, Cast, Expr, Extension, GroupingSet, Like, LogicalPlan, Subquery, - TryCast, - }, - optimizer::analyzer::{Analyzer, AnalyzerRule}, + logical_expr::{Extension, LogicalPlan}, + optimizer::{optimizer::Optimizer, OptimizerConfig, OptimizerRule}, physical_expr::EquivalenceProperties, physical_plan::{ DisplayAs, DisplayFormatType, ExecutionMode, ExecutionPlan, Partitioning, PlanProperties, @@ -44,15 +35,15 @@ pub use executor::*; // SQLFederationProvider provides federation to SQL DMBSs. 
pub struct SQLFederationProvider { - analyzer: Arc, + optimizer: Arc, executor: Arc, } impl SQLFederationProvider { pub fn new(executor: Arc) -> Self { Self { - analyzer: Arc::new(Analyzer::with_rules(vec![Arc::new( - SQLFederationAnalyzerRule::new(Arc::clone(&executor)), + optimizer: Arc::new(Optimizer::with_rules(vec![Arc::new( + SQLFederationOptimizerRule::new(executor.clone()), )])), executor, } @@ -68,16 +59,17 @@ impl FederationProvider for SQLFederationProvider { self.executor.compute_context() } - fn analyzer(&self) -> Option> { - Some(Arc::clone(&self.analyzer)) + + fn optimizer(&self) -> Option> { + Some(self.optimizer.clone()) } } -struct SQLFederationAnalyzerRule { +struct SQLFederationOptimizerRule { planner: Arc, } -impl SQLFederationAnalyzerRule { +impl SQLFederationOptimizerRule { pub fn new(executor: Arc) -> Self { Self { planner: Arc::new(SQLFederationPlanner::new(Arc::clone(&executor))), @@ -85,13 +77,24 @@ impl SQLFederationAnalyzerRule { } } -impl AnalyzerRule for SQLFederationAnalyzerRule { - fn analyze(&self, plan: LogicalPlan, _config: &ConfigOptions) -> Result { - let fed_plan = FederatedPlanNode::new(plan.clone(), Arc::clone(&self.planner)); +impl OptimizerRule for SQLFederationOptimizerRule { + fn try_optimize( + &self, + plan: &LogicalPlan, + _config: &dyn OptimizerConfig, + ) -> Result> { + if let LogicalPlan::Extension(Extension { ref node }) = plan { + if node.name() == "Federated" { + // Avoid attempting double federation + return Ok(None); + } + } + // Simply accept the entire plan for now + let fed_plan = FederatedPlanNode::new(plan.clone(), self.planner.clone()); let ext_node = Extension { node: Arc::new(fed_plan), }; - Ok(LogicalPlan::Extension(ext_node)) + Ok(Some(LogicalPlan::Extension(ext_node))) } /// A human readable name for this analyzer rule From 7dfb653570997e60a9133caccf9edc7fdc6adf37 Mon Sep 17 00:00:00 2001 From: hozan23 Date: Fri, 23 Aug 2024 10:03:27 +0200 Subject: [PATCH 16/31] delete redundant github workflows & 
commitlint --- .github/workflows/check.yml | 13 ------------- .github/workflows/pull-request.yml | 30 ------------------------------ .github/workflows/test.yml | 4 ++-- commitlint.config.js | 8 -------- 4 files changed, 2 insertions(+), 53 deletions(-) delete mode 100644 .github/workflows/check.yml delete mode 100644 .github/workflows/pull-request.yml delete mode 100644 commitlint.config.js diff --git a/.github/workflows/check.yml b/.github/workflows/check.yml deleted file mode 100644 index fe66d27..0000000 --- a/.github/workflows/check.yml +++ /dev/null @@ -1,13 +0,0 @@ -name: Check - -on: [push, pull_request] - -jobs: - formatting: - name: Formatting - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v4 - - uses: actions/setup-node@v4 - - run: npm install prettier prettier-plugin-toml - - run: npx prettier --check --no-config . diff --git a/.github/workflows/pull-request.yml b/.github/workflows/pull-request.yml deleted file mode 100644 index 62da82f..0000000 --- a/.github/workflows/pull-request.yml +++ /dev/null @@ -1,30 +0,0 @@ -name: Pull Request - -on: - pull_request_target: - types: - - opened - - reopened - - edited - - synchronize - -jobs: - conventional-commits: - name: Conventional Commits - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v4 - - uses: actions/setup-node@v4 - - run: npm install @commitlint/config-conventional - - run: npx commitlint <<< $CONVENTIONAL_COMMIT - env: - CONVENTIONAL_COMMIT: | - ${{ github.event.pull_request.title }} - - ${{ github.event.pull_request.body }} - - if: failure() - run: - echo "Datafusion-federation follows the [Conventional Commits specification](https://www.conventionalcommits.org/en/v1.0.0/) for release automation. - The PR title and body are used as the merge commit message. - - Please update your PR title to match the specification." 
>> $GITHUB_STEP_SUMMARY diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 8e548ee..0792434 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -55,7 +55,7 @@ jobs: - uses: arduino/setup-protoc@v3 with: repo-token: ${{ secrets.GITHUB_TOKEN }} - - run: cargo clippy -- -Dwarnings + - run: cargo clippy -- -D warnings package: name: Package @@ -69,5 +69,5 @@ jobs: - uses: arduino/setup-protoc@v3 with: repo-token: ${{ secrets.GITHUB_TOKEN }} - - run: cargo build + - run: cargo build --all - run: cargo package -p datafusion-federation --allow-dirty diff --git a/commitlint.config.js b/commitlint.config.js deleted file mode 100644 index 0a2216d..0000000 --- a/commitlint.config.js +++ /dev/null @@ -1,8 +0,0 @@ -module.exports = { - extends: ["@commitlint/config-conventional"], - rules: { - "body-max-line-length": [0, "always", Infinity], - "footer-max-line-length": [0, "always", Infinity], - "header-max-length": [0, "always", Infinity], - }, -}; From 8968619b5d0ee644b365449107662300d069c703 Mon Sep 17 00:00:00 2001 From: hozan23 Date: Thu, 22 Aug 2024 16:13:52 +0200 Subject: [PATCH 17/31] update deps && use datafusion 41 --- Cargo.toml | 9 ++------- datafusion-federation/src/lib.rs | 7 +++---- datafusion-federation/src/optimizer.rs | 8 ++++---- datafusion-federation/src/plan_node.rs | 8 ++++++++ sources/flight-sql/Cargo.toml | 4 +++- sources/sql/Cargo.toml | 5 +++-- sources/sql/src/connectorx/executor.rs | 1 + sources/sql/src/lib.rs | 1 - 8 files changed, 24 insertions(+), 19 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index bced60f..7d9b5fc 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -3,26 +3,21 @@ resolver = "2" members = [ "datafusion-federation", - "examples", "sources/sql", "sources/flight-sql", ] -[patch.crates-io] -# connectorx = { path = "../connector-x/connectorx" } -# datafusion = { path = "../arrow-datafusion/datafusion/core" } - [workspace.package] version = "0.1.6" edition = "2021" license = "Apache-2.0" 
readme = "README.md" - [workspace.dependencies] -async-trait = "0.1.77" +async-trait = "0.1.81" async-stream = "0.3.5" futures = "0.3.30" datafusion = "41.0.0" datafusion-substrait = "41.0.0" arrow-json = "52.2.0" + diff --git a/datafusion-federation/src/lib.rs b/datafusion-federation/src/lib.rs index bdcb03d..fad97de 100644 --- a/datafusion-federation/src/lib.rs +++ b/datafusion-federation/src/lib.rs @@ -5,7 +5,7 @@ use std::{ }; use datafusion::{ - execution::context::{SessionContext, SessionState}, + execution::session_state::{SessionState, SessionStateBuilder}, optimizer::{optimizer::Optimizer, OptimizerRule}, }; @@ -19,12 +19,11 @@ pub use plan_node::*; pub mod schema_cast; pub fn default_session_state() -> SessionState { - let df_state = SessionContext::new().state(); - let rules = default_optimizer_rules(); - df_state + SessionStateBuilder::new() .with_optimizer_rules(rules) .with_query_planner(Arc::new(FederatedQueryPlanner::new())) + .build() } pub fn default_optimizer_rules() -> Vec> { diff --git a/datafusion-federation/src/optimizer.rs b/datafusion-federation/src/optimizer.rs index 4afac32..d0fc24b 100644 --- a/datafusion-federation/src/optimizer.rs +++ b/datafusion-federation/src/optimizer.rs @@ -258,7 +258,7 @@ impl FederationOptimizerRule { }; // If this is the root plan node; federate the entire plan - let optimized = optimizer.optimize(plan, _config, |_, _| {})?; + let optimized = optimizer.optimize(plan.clone(), _config, |_, _| {})?; return Ok((Some(optimized), ScanResult::None)); } @@ -296,7 +296,7 @@ impl FederationOptimizerRule { // Replace the input with the federated counterpart let wrapped = wrap_projection(original_input)?; - let optimized = optimizer.optimize(&wrapped, _config, |_, _| {})?; + let optimized = optimizer.optimize(wrapped, _config, |_, _| {})?; Ok(optimized) }) @@ -398,9 +398,9 @@ fn wrap_projection(plan: LogicalPlan) -> Result { _ => { let expr = plan .schema() - .fields() + .columns() .iter() - .map(|f| 
Expr::Column(f.qualified_column())) + .map(|c| Expr::Column(c.clone())) .collect::>(); Ok(LogicalPlan::Projection(Projection::try_new( expr, diff --git a/datafusion-federation/src/plan_node.rs b/datafusion-federation/src/plan_node.rs index c99204f..535ee4e 100644 --- a/datafusion-federation/src/plan_node.rs +++ b/datafusion-federation/src/plan_node.rs @@ -65,6 +65,14 @@ impl UserDefinedLogicalNodeCore for FederatedPlanNode { planner: Arc::clone(&self.planner), }) } + + /// XXX should consider something else here ? + fn with_exprs_and_inputs(&self, _exprs: Vec, _inputs: Vec) -> Result { + Ok(Self { + plan: self.plan.clone(), + planner: self.planner.clone(), + }) + } } #[derive(Default)] diff --git a/sources/flight-sql/Cargo.toml b/sources/flight-sql/Cargo.toml index d9c1a1d..7ba4d72 100644 --- a/sources/flight-sql/Cargo.toml +++ b/sources/flight-sql/Cargo.toml @@ -3,7 +3,6 @@ name = "datafusion-federation-flight-sql" version.workspace = true edition.workspace = true license.workspace = true -readme.workspace = true [lib] name = "datafusion_federation_flight_sql" @@ -13,8 +12,11 @@ path = "src/lib.rs" async-trait.workspace = true datafusion.workspace = true datafusion-substrait.workspace = true + +# XXX use the release version on crates.io datafusion-federation.path = "../../datafusion-federation" datafusion-federation-sql.path = "../sql" + futures = "0.3.30" tonic = {version="0.11.0", features=["tls"] } prost = "0.12.3" diff --git a/sources/sql/Cargo.toml b/sources/sql/Cargo.toml index 5d1d030..b5938c7 100644 --- a/sources/sql/Cargo.toml +++ b/sources/sql/Cargo.toml @@ -3,7 +3,6 @@ name = "datafusion-federation-sql" version.workspace = true edition.workspace = true license.workspace = true -readme.workspace = true [lib] name = "datafusion_federation_sql" @@ -18,8 +17,10 @@ connectorx = { git = "https://github.com/devinjdangelo/connector-x.git", feature "src_sqlite" ], optional = true } datafusion.workspace = true + +# XXX use the release version on crates.io
datafusion-federation.path = "../../datafusion-federation" -# derive_builder = "0.13.0" + futures = "0.3.30" tokio = "1.35.1" tracing = "0.1.40" diff --git a/sources/sql/src/connectorx/executor.rs b/sources/sql/src/connectorx/executor.rs index b5964ce..38a8f4e 100644 --- a/sources/sql/src/connectorx/executor.rs +++ b/sources/sql/src/connectorx/executor.rs @@ -53,6 +53,7 @@ impl SQLExecutor for CXExecutor { fn compute_context(&self) -> Option { Some(self.context.clone()) } + fn execute(&self, sql: &str, schema: SchemaRef) -> Result { let conn = self.conn.clone(); let query: CXQuery = sql.into(); diff --git a/sources/sql/src/lib.rs b/sources/sql/src/lib.rs index e3cea89..3dc0f8c 100644 --- a/sources/sql/src/lib.rs +++ b/sources/sql/src/lib.rs @@ -59,7 +59,6 @@ impl FederationProvider for SQLFederationProvider { self.executor.compute_context() } - fn optimizer(&self) -> Option> { Some(self.optimizer.clone()) } From 5741fd6e45d174f35be89ee1f641d349d7edd2b8 Mon Sep 17 00:00:00 2001 From: hozan23 Date: Thu, 22 Aug 2024 16:26:41 +0200 Subject: [PATCH 18/31] remove connectorx from sources/sql crate --- sources/sql/README.md | 4 ++++ sources/sql/src/connectorx/mod.rs | 2 -- sources/sql/src/lib.rs | 1 + 3 files changed, 5 insertions(+), 2 deletions(-) create mode 100644 sources/sql/README.md delete mode 100644 sources/sql/src/connectorx/mod.rs diff --git a/sources/sql/README.md b/sources/sql/README.md new file mode 100644 index 0000000..def0668 --- /dev/null +++ b/sources/sql/README.md @@ -0,0 +1,4 @@ + + +This will be moved to the +[datafusion-table-providers](https://github.com/datafusion-contrib/datafusion-table-providers) repository. diff --git a/sources/sql/src/connectorx/mod.rs b/sources/sql/src/connectorx/mod.rs deleted file mode 100644 index 600069a..0000000 --- a/sources/sql/src/connectorx/mod.rs +++ /dev/null @@ -1,2 +0,0 @@ -mod executor; -pub use executor::*; diff --git a/sources/sql/src/lib.rs b/sources/sql/src/lib.rs index 3dc0f8c..438acfa 100644 ---
a/sources/sql/src/lib.rs +++ b/sources/sql/src/lib.rs @@ -27,6 +27,7 @@ pub use schema::*; #[cfg(feature = "connectorx")] pub mod connectorx; + mod executor; pub use executor::*; From 82f3dfad67568f15d1127278a2e1a659614bc5bd Mon Sep 17 00:00:00 2001 From: hozan23 Date: Thu, 22 Aug 2024 17:09:50 +0200 Subject: [PATCH 19/31] flight-sql: fix up dependencie issues --- sources/flight-sql/Cargo.toml | 1 + sources/flight-sql/src/server/service.rs | 43 ++++++++++++------------ sources/flight-sql/src/server/state.rs | 3 +- 3 files changed, 25 insertions(+), 22 deletions(-) diff --git a/sources/flight-sql/Cargo.toml b/sources/flight-sql/Cargo.toml index 7ba4d72..572d26b 100644 --- a/sources/flight-sql/Cargo.toml +++ b/sources/flight-sql/Cargo.toml @@ -23,3 +23,4 @@ prost = "0.12.3" arrow = "52.0.0" arrow-flight = { version = "52.0.0", features = ["flight-sql-experimental"] } log = "0.4.20" + diff --git a/sources/flight-sql/src/server/service.rs b/sources/flight-sql/src/server/service.rs index 6bc77ff..afa4a5f 100644 --- a/sources/flight-sql/src/server/service.rs +++ b/sources/flight-sql/src/server/service.rs @@ -1,14 +1,11 @@ -use arrow::datatypes::SchemaRef; -use arrow::error::ArrowError; -use arrow::ipc::writer::IpcWriteOptions; -use arrow_flight::encode::FlightDataEncoderBuilder; -use arrow_flight::error::FlightError; -use arrow_flight::flight_service_server::{FlightService, FlightServiceServer}; -use arrow_flight::sql::server::{ - FlightSqlService as ArrowFlightSqlService, PeekableFlightDataStream, -}; +use std::pin::Pin; +use std::sync::Arc; + +use arrow::{datatypes::SchemaRef, error::ArrowError, ipc::writer::IpcWriteOptions}; use arrow_flight::sql::{ - self, ActionBeginSavepointRequest, ActionBeginSavepointResult, ActionBeginTransactionRequest, + self, + server::{FlightSqlService as ArrowFlightSqlService, PeekableFlightDataStream}, + ActionBeginSavepointRequest, ActionBeginSavepointResult, ActionBeginTransactionRequest, ActionBeginTransactionResult, 
ActionCancelQueryRequest, ActionCancelQueryResult, ActionClosePreparedStatementRequest, ActionCreatePreparedStatementRequest, ActionCreatePreparedStatementResult, ActionCreatePreparedSubstraitPlanRequest, @@ -20,22 +17,26 @@ use arrow_flight::sql::{ DoPutPreparedStatementResult, SqlInfo, TicketStatementQuery, }; use arrow_flight::{ + encode::FlightDataEncoderBuilder, + error::FlightError, + flight_service_server::{FlightService, FlightServiceServer}, Action, FlightDescriptor, FlightEndpoint, FlightInfo, HandshakeRequest, HandshakeResponse, IpcMessage, SchemaAsIpc, Ticket, }; -use datafusion::common::arrow::datatypes::Schema; -use datafusion::dataframe::DataFrame; -use datafusion::error::{DataFusionError, Result as DataFusionResult}; -use datafusion::execution::context::{SQLOptions, SessionContext, SessionState}; -use datafusion::logical_expr::LogicalPlan; -use datafusion::physical_plan::SendableRecordBatchStream; -use datafusion_substrait::logical_plan::consumer::from_substrait_plan; -use datafusion_substrait::serializer::deserialize_bytes; +use datafusion::{ + common::arrow::datatypes::Schema, + dataframe::DataFrame, + error::{DataFusionError, Result as DataFusionResult}, + execution::context::{SQLOptions, SessionContext, SessionState}, + logical_expr::LogicalPlan, + physical_plan::SendableRecordBatchStream, +}; +use datafusion_substrait::{ + logical_plan::consumer::from_substrait_plan, serializer::deserialize_bytes, +}; use futures::{Stream, StreamExt, TryStreamExt}; use log::info; use prost::bytes::Bytes; -use std::pin::Pin; -use std::sync::Arc; use tonic::transport::Server; use tonic::{Request, Response, Status, Streaming}; @@ -601,7 +602,7 @@ impl ArrowFlightSqlService for FlightSqlService { &self, _query: CommandPreparedStatementQuery, request: Request, - ) -> Result { + ) -> Result { info!("do_put_prepared_statement_query"); let (_, _) = self.new_context(request)?; diff --git a/sources/flight-sql/src/server/state.rs b/sources/flight-sql/src/server/state.rs 
index 0b4ec09..e28ffdd 100644 --- a/sources/flight-sql/src/server/state.rs +++ b/sources/flight-sql/src/server/state.rs @@ -1,9 +1,10 @@ +use std::fmt::Display; + use arrow_flight::{ error::FlightError, sql::{self, Any, Command}, }; use prost::{bytes::Bytes, Message}; -use std::fmt::Display; pub type Result = std::result::Result; From 3e5996d04b2f4c4dc1abc49060d8c40280ab6f8c Mon Sep 17 00:00:00 2001 From: hozan23 Date: Fri, 23 Aug 2024 09:45:32 +0200 Subject: [PATCH 20/31] move sources/sql code to datafusion-federation crate and add sql feature --- Cargo.toml | 3 +-- datafusion-federation/Cargo.toml | 5 +++++ datafusion-federation/src/lib.rs | 17 ++++++++++------- .../src/sql}/executor.rs | 0 .../src/sql/mod.rs | 12 ++++++++---- .../src/sql}/schema.rs | 11 +++++------ 6 files changed, 29 insertions(+), 19 deletions(-) rename {sources/sql/src => datafusion-federation/src/sql}/executor.rs (100%) rename sources/sql/src/lib.rs => datafusion-federation/src/sql/mod.rs (99%) rename {sources/sql/src => datafusion-federation/src/sql}/schema.rs (96%) diff --git a/Cargo.toml b/Cargo.toml index 7d9b5fc..f582ab5 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -3,8 +3,7 @@ resolver = "2" members = [ "datafusion-federation", - "sources/sql", - "sources/flight-sql", + # "sources/flight-sql", ] [workspace.package] diff --git a/datafusion-federation/Cargo.toml b/datafusion-federation/Cargo.toml index f344a71..336c256 100644 --- a/datafusion-federation/Cargo.toml +++ b/datafusion-federation/Cargo.toml @@ -10,6 +10,9 @@ description = "Datafusion federation." 
name = "datafusion_federation" path = "src/lib.rs" +[features] +sql = ["futures"] + [dependencies] async-trait.workspace = true datafusion.workspace = true @@ -17,6 +20,8 @@ async-stream.workspace = true futures.workspace = true arrow-json.workspace = true +futures = { version = "0.3.30", optional = true } + [package.metadata.docs.rs] # Whether to pass `--all-features` to Cargo (default: false) diff --git a/datafusion-federation/src/lib.rs b/datafusion-federation/src/lib.rs index fad97de..0d43954 100644 --- a/datafusion-federation/src/lib.rs +++ b/datafusion-federation/src/lib.rs @@ -1,5 +1,11 @@ -use core::fmt; +mod optimizer; +mod plan_node; +mod table_provider; +#[cfg(feature = "sql")] +pub mod sql; + use std::{ + fmt, hash::{Hash, Hasher}, sync::Arc, }; @@ -9,13 +15,10 @@ use datafusion::{ optimizer::{optimizer::Optimizer, OptimizerRule}, }; -mod optimizer; -pub use optimizer::*; -mod table_provider; -pub use table_provider::*; -mod plan_node; -pub use plan_node::*; +pub use optimizer::{get_table_source, FederationOptimizerRule}; +pub use plan_node::{FederatedPlanNode, FederatedQueryPlanner, FederationPlanner}; +pub use table_provider::{FederatedTableProviderAdaptor, FederatedTableSource}; pub mod schema_cast; pub fn default_session_state() -> SessionState { diff --git a/sources/sql/src/executor.rs b/datafusion-federation/src/sql/executor.rs similarity index 100% rename from sources/sql/src/executor.rs rename to datafusion-federation/src/sql/executor.rs diff --git a/sources/sql/src/lib.rs b/datafusion-federation/src/sql/mod.rs similarity index 99% rename from sources/sql/src/lib.rs rename to datafusion-federation/src/sql/mod.rs index 438acfa..cc42f30 100644 --- a/sources/sql/src/lib.rs +++ b/datafusion-federation/src/sql/mod.rs @@ -1,6 +1,10 @@ +mod executor; +mod schema; + use core::fmt; use std::{any::Any, collections::HashMap, sync::Arc, vec}; +use crate::{FederatedPlanNode, FederationPlanner, FederationProvider}; use async_trait::async_trait; use 
datafusion::{ arrow::datatypes::{Schema, SchemaRef}, @@ -22,14 +26,14 @@ use datafusion_federation::{ get_table_source, schema_cast, FederatedPlanNode, FederationPlanner, FederationProvider, }; -mod schema; -pub use schema::*; + #[cfg(feature = "connectorx")] pub mod connectorx; -mod executor; -pub use executor::*; + +pub use executor::{SQLExecutor, SQLExecutorRef}; +pub use schema::{MultiSchemaProvider, SQLSchemaProvider, SQLTableSource}; // #[macro_use] // extern crate derive_builder; diff --git a/sources/sql/src/schema.rs b/datafusion-federation/src/sql/schema.rs similarity index 96% rename from sources/sql/src/schema.rs rename to datafusion-federation/src/sql/schema.rs index aa23fd0..8e9cf25 100644 --- a/sources/sql/src/schema.rs +++ b/datafusion-federation/src/sql/schema.rs @@ -1,18 +1,17 @@ -use async_trait::async_trait; +use std::{any::Any, sync::Arc}; +use async_trait::async_trait; use datafusion::logical_expr::{TableSource, TableType}; use datafusion::{ arrow::datatypes::SchemaRef, catalog::SchemaProvider, datasource::TableProvider, error::Result, }; use futures::future::join_all; -use std::{any::Any, sync::Arc}; -use datafusion_federation::{ - FederatedTableProviderAdaptor, FederatedTableSource, FederationProvider, +use crate::{ + sql::SQLFederationProvider, FederatedTableProviderAdaptor, FederatedTableSource, + FederationProvider, }; -use crate::SQLFederationProvider; - pub struct SQLSchemaProvider { // provider: Arc, tables: Vec>, From 6e6acd5d54d6a4aa1afeef801f9ea7df0836508e Mon Sep 17 00:00:00 2001 From: hozan23 Date: Fri, 23 Aug 2024 10:00:34 +0200 Subject: [PATCH 21/31] move sources/flight-sql to datafusion-flight-sql-server --- Cargo.toml | 2 +- datafusion-federation/src/lib.rs | 2 +- datafusion-federation/src/sql/mod.rs | 3 ++- .../Cargo.toml | 17 +++++++++++------ .../src/executor/mod.rs | 6 +++--- .../src/lib.rs | 0 .../src/server/mod.rs | 0 .../src/server/service.rs | 0 .../src/server/session.rs | 0 .../src/server/state.rs | 0 
sources/sql/README.md | 4 ---- 11 files changed, 18 insertions(+), 16 deletions(-) rename {sources/flight-sql => datafusion-flight-sql-server}/Cargo.toml (60%) rename {sources/flight-sql => datafusion-flight-sql-server}/src/executor/mod.rs (98%) rename {sources/flight-sql => datafusion-flight-sql-server}/src/lib.rs (100%) rename {sources/flight-sql => datafusion-flight-sql-server}/src/server/mod.rs (100%) rename {sources/flight-sql => datafusion-flight-sql-server}/src/server/service.rs (100%) rename {sources/flight-sql => datafusion-flight-sql-server}/src/server/session.rs (100%) rename {sources/flight-sql => datafusion-flight-sql-server}/src/server/state.rs (100%) delete mode 100644 sources/sql/README.md diff --git a/Cargo.toml b/Cargo.toml index f582ab5..af44c42 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -3,7 +3,7 @@ resolver = "2" members = [ "datafusion-federation", - # "sources/flight-sql", + "datafusion-flight-sql-server", ] [workspace.package] diff --git a/datafusion-federation/src/lib.rs b/datafusion-federation/src/lib.rs index 0d43954..abb729a 100644 --- a/datafusion-federation/src/lib.rs +++ b/datafusion-federation/src/lib.rs @@ -1,8 +1,8 @@ mod optimizer; mod plan_node; -mod table_provider; #[cfg(feature = "sql")] pub mod sql; +mod table_provider; use std::{ fmt, diff --git a/datafusion-federation/src/sql/mod.rs b/datafusion-federation/src/sql/mod.rs index cc42f30..0055ee3 100644 --- a/datafusion-federation/src/sql/mod.rs +++ b/datafusion-federation/src/sql/mod.rs @@ -4,7 +4,6 @@ mod schema; use core::fmt; use std::{any::Any, collections::HashMap, sync::Arc, vec}; -use crate::{FederatedPlanNode, FederationPlanner, FederationProvider}; use async_trait::async_trait; use datafusion::{ arrow::datatypes::{Schema, SchemaRef}, @@ -35,6 +34,8 @@ pub mod connectorx; pub use executor::{SQLExecutor, SQLExecutorRef}; pub use schema::{MultiSchemaProvider, SQLSchemaProvider, SQLTableSource}; +use crate::{FederatedPlanNode, FederationPlanner, FederationProvider}; + // 
#[macro_use] // extern crate derive_builder; diff --git a/sources/flight-sql/Cargo.toml b/datafusion-flight-sql-server/Cargo.toml similarity index 60% rename from sources/flight-sql/Cargo.toml rename to datafusion-flight-sql-server/Cargo.toml index 572d26b..3ee581d 100644 --- a/sources/flight-sql/Cargo.toml +++ b/datafusion-flight-sql-server/Cargo.toml @@ -1,24 +1,29 @@ [package] -name = "datafusion-federation-flight-sql" +name = "datafusion-flight-sql-server" version.workspace = true edition.workspace = true license.workspace = true [lib] -name = "datafusion_federation_flight_sql" +name = "datafusion_flight_sql_server" path = "src/lib.rs" [dependencies] async-trait.workspace = true datafusion.workspace = true datafusion-substrait.workspace = true - # XXX use the release verion on crates.io -datafusion-federation.path = "../../datafusion-federation" -datafusion-federation-sql.path = "../sql" +datafusion-federation = { path = "../datafusion-federation", features = [ + "sql", +] } futures = "0.3.30" -tonic = {version="0.11.0", features=["tls"] } +tonic = { version = "0.11.0", features = [ + "tls", + "transport", + "codegen", + "prost", +] } prost = "0.12.3" arrow = "52.0.0" arrow-flight = { version = "52.0.0", features = ["flight-sql-experimental"] } diff --git a/sources/flight-sql/src/executor/mod.rs b/datafusion-flight-sql-server/src/executor/mod.rs similarity index 98% rename from sources/flight-sql/src/executor/mod.rs rename to datafusion-flight-sql-server/src/executor/mod.rs index c4f1225..0bcf432 100644 --- a/sources/flight-sql/src/executor/mod.rs +++ b/datafusion-flight-sql-server/src/executor/mod.rs @@ -1,3 +1,5 @@ +use std::sync::Arc; + use arrow::{datatypes::SchemaRef, error::ArrowError}; use arrow_flight::sql::client::FlightSqlServiceClient; use async_trait::async_trait; @@ -6,10 +8,8 @@ use datafusion::{ physical_plan::{stream::RecordBatchStreamAdapter, SendableRecordBatchStream}, sql::unparser::dialect::{DefaultDialect, Dialect}, }; -use 
datafusion_federation_sql::SQLExecutor; +use datafusion_federation::sql::SQLExecutor; use futures::TryStreamExt; - -use std::sync::Arc; use tonic::transport::Channel; pub struct FlightSQLExecutor { diff --git a/sources/flight-sql/src/lib.rs b/datafusion-flight-sql-server/src/lib.rs similarity index 100% rename from sources/flight-sql/src/lib.rs rename to datafusion-flight-sql-server/src/lib.rs diff --git a/sources/flight-sql/src/server/mod.rs b/datafusion-flight-sql-server/src/server/mod.rs similarity index 100% rename from sources/flight-sql/src/server/mod.rs rename to datafusion-flight-sql-server/src/server/mod.rs diff --git a/sources/flight-sql/src/server/service.rs b/datafusion-flight-sql-server/src/server/service.rs similarity index 100% rename from sources/flight-sql/src/server/service.rs rename to datafusion-flight-sql-server/src/server/service.rs diff --git a/sources/flight-sql/src/server/session.rs b/datafusion-flight-sql-server/src/server/session.rs similarity index 100% rename from sources/flight-sql/src/server/session.rs rename to datafusion-flight-sql-server/src/server/session.rs diff --git a/sources/flight-sql/src/server/state.rs b/datafusion-flight-sql-server/src/server/state.rs similarity index 100% rename from sources/flight-sql/src/server/state.rs rename to datafusion-flight-sql-server/src/server/state.rs diff --git a/sources/sql/README.md b/sources/sql/README.md deleted file mode 100644 index def0668..0000000 --- a/sources/sql/README.md +++ /dev/null @@ -1,4 +0,0 @@ - - -This will be move to -[datafusion-table-providers](https://github.com/datafusion-contrib/datafusion-table-providers) repository From 0cb719da0007d7504b20173a24679a083542ba5b Mon Sep 17 00:00:00 2001 From: hozan23 Date: Fri, 23 Aug 2024 10:59:20 +0200 Subject: [PATCH 22/31] "create datafusion-flight-sql-table-provider crate & move the executor from datafusion-flight-sql-server" --- Cargo.toml | 1 + README.md | 3 +++ datafusion-flight-sql-server/Cargo.toml | 8 ++++--- 
.../examples/flight-sql.rs | 13 ++++------ .../examples/test.csv | 4 ++++ datafusion-flight-sql-server/src/lib.rs | 5 ++-- .../src/server/mod.rs | 6 ----- .../src/{server => }/service.rs | 5 ++-- .../src/{server => }/session.rs | 0 .../src/{server => }/state.rs | 0 .../Cargo.toml | 24 +++++++++++++++++++ .../src/lib.rs | 0 12 files changed, 47 insertions(+), 22 deletions(-) rename {examples => datafusion-flight-sql-server}/examples/flight-sql.rs (88%) create mode 100644 datafusion-flight-sql-server/examples/test.csv delete mode 100644 datafusion-flight-sql-server/src/server/mod.rs rename datafusion-flight-sql-server/src/{server => }/service.rs (99%) rename datafusion-flight-sql-server/src/{server => }/session.rs (100%) rename datafusion-flight-sql-server/src/{server => }/state.rs (100%) create mode 100644 datafusion-flight-sql-table-provider/Cargo.toml rename datafusion-flight-sql-server/src/executor/mod.rs => datafusion-flight-sql-table-provider/src/lib.rs (100%) diff --git a/Cargo.toml b/Cargo.toml index af44c42..44e9361 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -4,6 +4,7 @@ resolver = "2" members = [ "datafusion-federation", "datafusion-flight-sql-server", + "datafusion-flight-sql-table-provider", ] [workspace.package] diff --git a/README.md b/README.md index 028db87..ed7fdef 100644 --- a/README.md +++ b/README.md @@ -5,6 +5,9 @@ The goal of this repo is to allow [DataFusion](https://github.com/apache/arrow-datafusion) to resolve queries across remote query engines while pushing down as much compute as possible down. + +> :warning: **All the examples are deprecated for now** + Check out [the examples](./examples/) to get a feel for how it works. 
Potential use-cases: diff --git a/datafusion-flight-sql-server/Cargo.toml b/datafusion-flight-sql-server/Cargo.toml index 3ee581d..ede6979 100644 --- a/datafusion-flight-sql-server/Cargo.toml +++ b/datafusion-flight-sql-server/Cargo.toml @@ -9,7 +9,6 @@ name = "datafusion_flight_sql_server" path = "src/lib.rs" [dependencies] -async-trait.workspace = true datafusion.workspace = true datafusion-substrait.workspace = true # XXX use the release verion on crates.io @@ -26,6 +25,9 @@ tonic = { version = "0.11.0", features = [ ] } prost = "0.12.3" arrow = "52.0.0" -arrow-flight = { version = "52.0.0", features = ["flight-sql-experimental"] } -log = "0.4.20" +arrow-flight = { version = "52.2.0", features = ["flight-sql-experimental"] } +log = "0.4.22" +[dev-dependencies] +tokio = { version = "1.39.3", features = ["full"] } +datafusion-flight-sql-table-provider = { path = "../datafusion-flight-sql-table-provider" } diff --git a/examples/examples/flight-sql.rs b/datafusion-flight-sql-server/examples/flight-sql.rs similarity index 88% rename from examples/examples/flight-sql.rs rename to datafusion-flight-sql-server/examples/flight-sql.rs index e0899cb..021227a 100644 --- a/examples/examples/flight-sql.rs +++ b/datafusion-flight-sql-server/examples/flight-sql.rs @@ -2,15 +2,16 @@ use std::{sync::Arc, time::Duration}; use arrow_flight::sql::client::FlightSqlServiceClient; use datafusion::{ - catalog::schema::SchemaProvider, + catalog::SchemaProvider, error::{DataFusionError, Result}, execution::{ context::{SessionContext, SessionState}, options::CsvReadOptions, }, }; -use datafusion_federation_flight_sql::{executor::FlightSQLExecutor, server::FlightSqlService}; -use datafusion_federation_sql::{SQLFederationProvider, SQLSchemaProvider}; +use datafusion_federation::sql::{SQLFederationProvider, SQLSchemaProvider}; +use datafusion_flight_sql_server::service::FlightSqlService; +use datafusion_flight_sql_table_provider::FlightSQLExecutor; use tokio::time::sleep; use 
tonic::transport::Endpoint; @@ -19,11 +20,7 @@ async fn main() -> Result<()> { let dsn: String = "0.0.0.0:50051".to_string(); let remote_ctx = SessionContext::new(); remote_ctx - .register_csv( - "test", - "./examples/examples/test.csv", - CsvReadOptions::new(), - ) + .register_csv("test", "./examples/test.csv", CsvReadOptions::new()) .await?; // Remote context diff --git a/datafusion-flight-sql-server/examples/test.csv b/datafusion-flight-sql-server/examples/test.csv new file mode 100644 index 0000000..811d276 --- /dev/null +++ b/datafusion-flight-sql-server/examples/test.csv @@ -0,0 +1,4 @@ +foo,bar +a,1 +b,2 +c,3 \ No newline at end of file diff --git a/datafusion-flight-sql-server/src/lib.rs b/datafusion-flight-sql-server/src/lib.rs index a8795c9..f7f6eac 100644 --- a/datafusion-flight-sql-server/src/lib.rs +++ b/datafusion-flight-sql-server/src/lib.rs @@ -1,2 +1,3 @@ -pub mod executor; -pub mod server; +pub mod service; +pub mod session; +pub mod state; diff --git a/datafusion-flight-sql-server/src/server/mod.rs b/datafusion-flight-sql-server/src/server/mod.rs deleted file mode 100644 index 0c054b9..0000000 --- a/datafusion-flight-sql-server/src/server/mod.rs +++ /dev/null @@ -1,6 +0,0 @@ -mod service; -pub use service::*; -mod state; -pub use state::*; -mod session; -pub use session::*; diff --git a/datafusion-flight-sql-server/src/server/service.rs b/datafusion-flight-sql-server/src/service.rs similarity index 99% rename from datafusion-flight-sql-server/src/server/service.rs rename to datafusion-flight-sql-server/src/service.rs index afa4a5f..8a85132 100644 --- a/datafusion-flight-sql-server/src/server/service.rs +++ b/datafusion-flight-sql-server/src/service.rs @@ -1,5 +1,4 @@ -use std::pin::Pin; -use std::sync::Arc; +use std::{pin::Pin, sync::Arc}; use arrow::{datatypes::SchemaRef, error::ArrowError, ipc::writer::IpcWriteOptions}; use arrow_flight::sql::{ @@ -40,8 +39,8 @@ use prost::bytes::Bytes; use tonic::transport::Server; use tonic::{Request, 
Response, Status, Streaming}; +use super::session::{SessionStateProvider, StaticSessionStateProvider}; use super::state::{CommandTicket, QueryHandle}; -use super::{SessionStateProvider, StaticSessionStateProvider}; type Result = std::result::Result; diff --git a/datafusion-flight-sql-server/src/server/session.rs b/datafusion-flight-sql-server/src/session.rs similarity index 100% rename from datafusion-flight-sql-server/src/server/session.rs rename to datafusion-flight-sql-server/src/session.rs diff --git a/datafusion-flight-sql-server/src/server/state.rs b/datafusion-flight-sql-server/src/state.rs similarity index 100% rename from datafusion-flight-sql-server/src/server/state.rs rename to datafusion-flight-sql-server/src/state.rs diff --git a/datafusion-flight-sql-table-provider/Cargo.toml b/datafusion-flight-sql-table-provider/Cargo.toml new file mode 100644 index 0000000..4302133 --- /dev/null +++ b/datafusion-flight-sql-table-provider/Cargo.toml @@ -0,0 +1,24 @@ +[package] +name = "datafusion-flight-sql-table-provider" +version.workspace = true +edition.workspace = true +license.workspace = true +readme.workspace = true + +[dependencies] +async-trait.workspace = true +datafusion.workspace = true +# XXX use the release verion on crates.io +datafusion-federation = { path = "../datafusion-federation", features = [ + "sql", +] } + +futures = "0.3.30" +tonic = { version = "0.11.0", features = [ + "tls", + "transport", + "codegen", + "prost", +] } +arrow = "52.0.0" +arrow-flight = { version = "52.2.0", features = ["flight-sql-experimental"] } diff --git a/datafusion-flight-sql-server/src/executor/mod.rs b/datafusion-flight-sql-table-provider/src/lib.rs similarity index 100% rename from datafusion-flight-sql-server/src/executor/mod.rs rename to datafusion-flight-sql-table-provider/src/lib.rs From 66fb74789c5514d61dbd22143754b9b69d3cf66a Mon Sep 17 00:00:00 2001 From: hozan23 Date: Fri, 23 Aug 2024 11:27:17 +0200 Subject: [PATCH 23/31] override supports_rewrite for 
FederationOptimizerRule and SQLFederationOptimizerRule --- datafusion-federation/src/optimizer.rs | 6 ++++++ datafusion-federation/src/sql/mod.rs | 6 ++++++ 2 files changed, 12 insertions(+) diff --git a/datafusion-federation/src/optimizer.rs b/datafusion-federation/src/optimizer.rs index d0fc24b..8f4cf04 100644 --- a/datafusion-federation/src/optimizer.rs +++ b/datafusion-federation/src/optimizer.rs @@ -35,6 +35,12 @@ impl OptimizerRule for FederationOptimizerRule { fn name(&self) -> &str { "federation_optimizer_rule" } + + /// XXX + /// Does this rule support rewriting owned plans (rather than by reference)? + fn supports_rewrite(&self) -> bool { + false + } } enum ScanResult { diff --git a/datafusion-federation/src/sql/mod.rs b/datafusion-federation/src/sql/mod.rs index 0055ee3..e224483 100644 --- a/datafusion-federation/src/sql/mod.rs +++ b/datafusion-federation/src/sql/mod.rs @@ -106,6 +106,12 @@ impl OptimizerRule for SQLFederationOptimizerRule { fn name(&self) -> &str { "federate_sql" } + + /// XXX + /// Does this rule support rewriting owned plans (rather than by reference)? + fn supports_rewrite(&self) -> bool { + false + } } /// Rewrite table scans to use the original federated table name. 
From 5e7bba44eee77c49ef39cd1d09390811708c181d Mon Sep 17 00:00:00 2001 From: Michiel De Backker Date: Fri, 23 Aug 2024 14:56:37 +0200 Subject: [PATCH 24/31] Improve project overview --- README.md | 79 ++++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 78 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index ed7fdef..9d39ffa 100644 --- a/README.md +++ b/README.md @@ -3,8 +3,20 @@ [![crates.io](https://img.shields.io/crates/v/datafusion-federation.svg)](https://crates.io/crates/datafusion-federation) [![docs.rs](https://docs.rs/datafusion-federation/badge.svg)](https://docs.rs/datafusion-federation) -The goal of this repo is to allow [DataFusion](https://github.com/apache/arrow-datafusion) to resolve queries across remote query engines while pushing down as much compute as possible down. +DataFusion Federation allows [DataFusion](https://github.com/apache/arrow-datafusion) to execute (part of) a query plan by a remote execution engine. + ┌────────────────┐ + ┌────────────┐ │ Remote DBMS(s) │ + SQL Query ───> │ DataFusion │ ───> │ ( execution │ + └────────────┘ │ happens here ) │ + └────────────────┘ + +The goal is to allow resolving queries across remote query engines while pushing down as much compute as possible to the remote database(s). This allows execution to happen as close to the storage as possible. This concept is referred to as 'query federation'. + +> [!TIP] +> This repository implements the federation framework itself. If you want to connect to a specific database, check out the compatible providers available in [datafusion-contrib/datafusion-table-providers](https://github.com/datafusion-contrib/datafusion-table-providers/). + +#### Usage > :warning: **All the examples are deprecated for now** @@ -17,6 +29,71 @@ Potential use-cases: - DataFusion -> Flight SQL -> DataFusion - .. 
+#### Design concept + +Say you have a query plan as follows: + + ┌────────────┐ + │ Join │ + └────────────┘ + ▲ + ┌───────┴────────┐ + ┌────────────┐ ┌────────────┐ + │ Scan A │ │ Join │ + └────────────┘ └────────────┘ + ▲ + ┌───────┴────────┐ + ┌────────────┐ ┌────────────┐ + │ Scan B │ │ Scan C │ + └────────────┘ └────────────┘ + +DataFusion Federation will identify the largest possible sub-plans that +can be executed by an external database: + + ┌────────────┐ Optimizer pass + │ Join │ recognizes B and C + └────────────┘ are available in an + ▲ external database + ┌──────────────┴────────┐ + │ ┌ ─ ─ ─ ─ ─ ─ ┴ ─ ── ─ ─ ─ ─ ─┐ + ┌────────────┐ ┌────────────┐ │ + │ Scan A │ │ │ Join │ + └────────────┘ └────────────┘ │ + │ ▲ + ┌───────┴────────┐ │ + ┌────────────┐ ┌────────────┐ │ + ││ Scan B │ │ Scan C │ + └────────────┘ └────────────┘ │ + ─ ── ─ ─ ── ─ ─ ─ ─ ─ ─ ─ ── ─ ┘ + +The sub-plans are cut out and replaced by an opaque federation node in the plan: + + ┌────────────┐ + │ Join │ + └────────────┘ Rewritten Plan + ▲ + ┌────────┴───────────┐ + │ │ + ┌────────────┐ ┏━━━━━━━━━━━━━━━━━━┓ + │ Scan A │ ┃ Scan B+C ┃ + └────────────┘ ┃ (TableProvider ┃ + ┃ that can execute ┃ + ┃ sub-plan in an ┃ + ┃external database)┃ + ┗━━━━━━━━━━━━━━━━━━┛ + +Different databases may have different query languages and execution capabilities. To accommodate for this, we allow each 'federation provider' to self-determine what part of a sub-plan it will actually federate. This is done by letting each federation provider define its own optimizer rule. When a sub-plan is 'cut out' of the overall plan, it is first passed the federation provider's optimizer rule. This optimizer rule determines the part of the plan that is cut out, based based on the execution capabilities of the database it represents. + +#### Implementation + +A remote database is represented by the `FederationProvider` trait. 
To identify table scans that are available in the same database, they implement `FederatedTableSource` trait. This trait allows lookup of the corresponding `FederationProvider`. + +Identifying sub-plans to federate is done by the `FederationOptimizerRule`. This rule needs to be registered in your DataFusion SessionState. One easy way to do this is using `default_session_state`. To do its job, the `FederationOptimizerRule` currently requires that all TableProviders that need to be federated are `FederatedTableProviderAdaptor`s. The `FederatedTableProviderAdaptor` also has a fallback mechanism that allows implementations to fallback to a 'vanilla' TableProvider in case the `FederationOptimizerRule` isn't registered. + +The `FederationProvider` can provide a `compute_context`. This allows it to differentiate between multiple remote execution context of the same type. For example two different mysql instances, database schemas, access level, etc. The `FederationProvider` also returns the `Optimizer` that is allows it to self-determine what part of a sub-plan it can federate. + +The `sql` module implements a generic `FederationProvider` for SQL execution engines. A specific SQL engine implements the `SQLExecutor` trait for its engine specific execution. There are a number of compatible providers available in [datafusion-contrib/datafusion-table-providers](https://github.com/datafusion-contrib/datafusion-table-providers/). + #### Status The project is in alpha status. Contributions welcome; land a PR = commit access. 
From b80203a420c1f35cea937ae2be3a7fe20b0e92a1 Mon Sep 17 00:00:00 2001 From: hozan23 <119854621+hozan23@users.noreply.github.com> Date: Sat, 24 Aug 2024 21:06:53 +0200 Subject: [PATCH 25/31] update README.md (#48) --- README.md | 84 ++++++++++++++++++++++++++++++++++++++----------------- 1 file changed, 59 insertions(+), 25 deletions(-) diff --git a/README.md b/README.md index 9d39ffa..0553c83 100644 --- a/README.md +++ b/README.md @@ -1,9 +1,11 @@ -## DataFusion Federation +# DataFusion Federation [![crates.io](https://img.shields.io/crates/v/datafusion-federation.svg)](https://crates.io/crates/datafusion-federation) [![docs.rs](https://docs.rs/datafusion-federation/badge.svg)](https://docs.rs/datafusion-federation) -DataFusion Federation allows [DataFusion](https://github.com/apache/arrow-datafusion) to execute (part of) a query plan by a remote execution engine. +DataFusion Federation allows +[DataFusion](https://github.com/apache/arrow-datafusion) to execute (part of) a +query plan by a remote execution engine. ┌────────────────┐ ┌────────────┐ │ Remote DBMS(s) │ @@ -11,25 +13,31 @@ DataFusion Federation allows [DataFusion](https://github.com/apache/arrow-datafu └────────────┘ │ happens here ) │ └────────────────┘ -The goal is to allow resolving queries across remote query engines while pushing down as much compute as possible to the remote database(s). This allows execution to happen as close to the storage as possible. This concept is referred to as 'query federation'. +The goal is to allow resolving queries across remote query engines while +pushing down as much compute as possible to the remote database(s). This allows +execution to happen as close to the storage as possible. This concept is +referred to as 'query federation'. > [!TIP] -> This repository implements the federation framework itself. 
If you want to connect to a specific database, check out the compatible providers available in [datafusion-contrib/datafusion-table-providers](https://github.com/datafusion-contrib/datafusion-table-providers/). +> This repository implements the federation framework itself. If you want to +> connect to a specific database, check out the compatible providers available +> in +> [datafusion-contrib/datafusion-table-providers](https://github.com/datafusion-contrib/datafusion-table-providers/). -#### Usage +## Usage > :warning: **All the examples are deprecated for now** Check out [the examples](./examples/) to get a feel for how it works. -Potential use-cases: +## Potential use-cases: - Querying across SQLite, MySQL, PostgreSQL, ... - Pushing down SQL or [Substrait](https://substrait.io/) plans. - DataFusion -> Flight SQL -> DataFusion - .. -#### Design concept +## Design concept Say you have a query plan as follows: @@ -50,9 +58,9 @@ Say you have a query plan as follows: DataFusion Federation will identify the largest possible sub-plans that can be executed by an external database: - ┌────────────┐ Optimizer pass - │ Join │ recognizes B and C - └────────────┘ are available in an + ┌────────────┐ Optimizer recognizes + │ Join │ that B and C are + └────────────┘ available in an ▲ external database ┌──────────────┴────────┐ │ ┌ ─ ─ ─ ─ ─ ─ ┴ ─ ── ─ ─ ─ ─ ─┐ @@ -82,21 +90,47 @@ The sub-plans are cut out and replaced by an opaque federation node in the plan: ┃external database)┃ ┗━━━━━━━━━━━━━━━━━━┛ -Different databases may have different query languages and execution capabilities. To accommodate for this, we allow each 'federation provider' to self-determine what part of a sub-plan it will actually federate. This is done by letting each federation provider define its own optimizer rule. When a sub-plan is 'cut out' of the overall plan, it is first passed the federation provider's optimizer rule. 
This optimizer rule determines the part of the plan that is cut out, based based on the execution capabilities of the database it represents. - -#### Implementation - -A remote database is represented by the `FederationProvider` trait. To identify table scans that are available in the same database, they implement `FederatedTableSource` trait. This trait allows lookup of the corresponding `FederationProvider`. - -Identifying sub-plans to federate is done by the `FederationOptimizerRule`. This rule needs to be registered in your DataFusion SessionState. One easy way to do this is using `default_session_state`. To do its job, the `FederationOptimizerRule` currently requires that all TableProviders that need to be federated are `FederatedTableProviderAdaptor`s. The `FederatedTableProviderAdaptor` also has a fallback mechanism that allows implementations to fallback to a 'vanilla' TableProvider in case the `FederationOptimizerRule` isn't registered. - -The `FederationProvider` can provide a `compute_context`. This allows it to differentiate between multiple remote execution context of the same type. For example two different mysql instances, database schemas, access level, etc. The `FederationProvider` also returns the `Optimizer` that is allows it to self-determine what part of a sub-plan it can federate. - -The `sql` module implements a generic `FederationProvider` for SQL execution engines. A specific SQL engine implements the `SQLExecutor` trait for its engine specific execution. There are a number of compatible providers available in [datafusion-contrib/datafusion-table-providers](https://github.com/datafusion-contrib/datafusion-table-providers/). - -#### Status - -The project is in alpha status. Contributions welcome; land a PR = commit access. +Different databases may have different query languages and execution +capabilities. To accommodate for this, we allow each 'federation provider' to +self-determine what part of a sub-plan it will actually federate. 
This is done +by letting each federation provider define its own optimizer rule. When a +sub-plan is 'cut out' of the overall plan, it is first passed the federation +provider's optimizer rule. This optimizer rule determines the part of the plan +that is cut out, based on the execution capabilities of the database it +represents. + +## Implementation + +A remote database is represented by the `FederationProvider` trait. To identify +table scans that are available in the same database, they implement the +`FederatedTableSource` trait. This trait allows lookup of the corresponding +`FederationProvider`. + +Identifying sub-plans to federate is done by the `FederationOptimizerRule`. +This rule needs to be registered in your DataFusion SessionState. One easy way +to do this is using `default_session_state`. To do its job, the +`FederationOptimizerRule` currently requires that all TableProviders that need +to be federated are `FederatedTableProviderAdaptor`s. The +`FederatedTableProviderAdaptor` also has a fallback mechanism that allows +implementations to fall back to a 'vanilla' TableProvider in case the +`FederationOptimizerRule` isn't registered. + +The `FederationProvider` can provide a `compute_context`. This allows it to +differentiate between multiple remote execution contexts of the same type. For +example, two different MySQL instances, database schemas, access level, etc. The +`FederationProvider` also returns the `Optimizer` that allows it to +self-determine what part of a sub-plan it can federate. + +The `sql` module implements a generic `FederationProvider` for SQL execution +engines. A specific SQL engine implements the `SQLExecutor` trait for its +engine-specific execution. There are a number of compatible providers available +in +[datafusion-contrib/datafusion-table-providers](https://github.com/datafusion-contrib/datafusion-table-providers/). + +## Status + +The project is in alpha status. Contributions welcome; land a PR = commit +access.
- [Docs (release)](https://docs.rs/datafusion-federation) - [Docs (main)](https://datafusion-contrib.github.io/datafusion-federation/) From 95d3416c2ea73a3e038ff9dad2a0f002ed180b21 Mon Sep 17 00:00:00 2001 From: hozan23 <119854621+hozan23@users.noreply.github.com> Date: Mon, 26 Aug 2024 09:27:43 +0200 Subject: [PATCH 26/31] datafusion-fedeartion: Remove assert_eq macros and handle errors properly (#49) --- datafusion-federation/src/plan_node.rs | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/datafusion-federation/src/plan_node.rs b/datafusion-federation/src/plan_node.rs index 535ee4e..6473aca 100644 --- a/datafusion-federation/src/plan_node.rs +++ b/datafusion-federation/src/plan_node.rs @@ -8,7 +8,7 @@ use std::{ use async_trait::async_trait; use datafusion::{ common::DFSchemaRef, - error::Result, + error::{DataFusionError, Result}, execution::context::{QueryPlanner, SessionState}, logical_expr::{Expr, LogicalPlan, UserDefinedLogicalNode, UserDefinedLogicalNodeCore}, physical_plan::ExecutionPlan, @@ -58,16 +58,13 @@ impl UserDefinedLogicalNodeCore for FederatedPlanNode { } fn with_exprs_and_inputs(&self, exprs: Vec, inputs: Vec) -> Result { - assert_eq!(inputs.len(), 0, "input size inconsistent"); - assert_eq!(exprs.len(), 0, "expression size inconsistent"); - Ok(Self { - plan: self.plan.clone(), - planner: Arc::clone(&self.planner), - }) - } + if !inputs.is_empty() { + return Err(DataFusionError::Plan("input size inconsistent".into())); + } + if !exprs.is_empty() { + return Err(DataFusionError::Plan("expression size inconsistent".into())); + } - /// XXX should consider something else here ? 
- fn with_exprs_and_inputs(&self, _exprs: Vec, _inputs: Vec) -> Result { Ok(Self { plan: self.plan.clone(), planner: self.planner.clone(), @@ -149,8 +146,11 @@ impl ExtensionPlanner for FederatedPlanner { ) -> Result>> { let dc_node = node.as_any().downcast_ref::(); if let Some(fed_node) = dc_node { - assert_eq!(logical_inputs.len(), 0, "Inconsistent number of inputs"); - assert_eq!(physical_inputs.len(), 0, "Inconsistent number of inputs"); + if !logical_inputs.is_empty() || !physical_inputs.is_empty() { + return Err(DataFusionError::Plan( + "Inconsistent number of inputs".into(), + )); + } let fed_planner = Arc::clone(&fed_node.planner); let exec_plan = fed_planner.plan_federation(fed_node, session_state).await?; From f7891873c17373556be4473d0c70315d235b7906 Mon Sep 17 00:00:00 2001 From: hozan23 <119854621+hozan23@users.noreply.github.com> Date: Mon, 26 Aug 2024 17:59:24 +0200 Subject: [PATCH 27/31] Add example in datafusion-federation crate (#50) --- README.md | 5 +- datafusion-federation/Cargo.toml | 18 ++-- datafusion-federation/examples/df-csv.rs | 115 +++++++++++++++++++++++ datafusion-federation/examples/test.csv | 4 + 4 files changed, 133 insertions(+), 9 deletions(-) create mode 100644 datafusion-federation/examples/df-csv.rs create mode 100644 datafusion-federation/examples/test.csv diff --git a/README.md b/README.md index 0553c83..dcfeaa4 100644 --- a/README.md +++ b/README.md @@ -26,9 +26,8 @@ referred to as 'query federation'. ## Usage -> :warning: **All the examples are deprecated for now** - -Check out [the examples](./examples/) to get a feel for how it works. +Check out the [examples](./datafusion-federation/examples/) to get a feel for +how it works. ## Potential use-cases: diff --git a/datafusion-federation/Cargo.toml b/datafusion-federation/Cargo.toml index 336c256..2b9a2b9 100644 --- a/datafusion-federation/Cargo.toml +++ b/datafusion-federation/Cargo.toml @@ -10,6 +10,12 @@ description = "Datafusion federation." 
name = "datafusion_federation" path = "src/lib.rs" +[package.metadata.docs.rs] +# Whether to pass `--all-features` to Cargo (default: false) +all-features = true +# Whether to pass `--no-default-features` to Cargo (default: false) +no-default-features = true + [features] sql = ["futures"] @@ -22,10 +28,10 @@ arrow-json.workspace = true futures = { version = "0.3.30", optional = true } -[package.metadata.docs.rs] - -# Whether to pass `--all-features` to Cargo (default: false) -all-features = true +[dev-dependencies] +tokio = { version = "1.39.3", features = ["full"] } -# Whether to pass `--no-default-features` to Cargo (default: false) -no-default-features = true +[[example]] +name = "df-csv" +path = "examples/df-csv.rs" +required-features = ["sql"] diff --git a/datafusion-federation/examples/df-csv.rs b/datafusion-federation/examples/df-csv.rs new file mode 100644 index 0000000..24ac495 --- /dev/null +++ b/datafusion-federation/examples/df-csv.rs @@ -0,0 +1,115 @@ +use std::sync::Arc; + +use async_trait::async_trait; +use datafusion::{ + arrow::datatypes::SchemaRef, + catalog::SchemaProvider, + error::{DataFusionError, Result}, + execution::{ + context::{SessionContext, SessionState}, + options::CsvReadOptions, + }, + physical_plan::{stream::RecordBatchStreamAdapter, SendableRecordBatchStream}, + sql::sqlparser::dialect::{Dialect, GenericDialect}, +}; +use datafusion_federation::sql::{SQLExecutor, SQLFederationProvider, SQLSchemaProvider}; +use futures::TryStreamExt; + +const CSV_PATH: &str = "./examples/test.csv"; +const TABLE_NAME: &str = "test"; + +#[tokio::main] +async fn main() -> Result<()> { + // Create a remote context + let remote_ctx = Arc::new(SessionContext::new()); + + // Registers a CSV file + remote_ctx + .register_csv(TABLE_NAME, CSV_PATH, CsvReadOptions::new()) + .await?; + let known_tables: Vec = [TABLE_NAME].iter().map(|&x| x.into()).collect(); + + // Register schema + let executor = Arc::new(InMemorySQLExecutor::new(remote_ctx)); + let provider 
= Arc::new(SQLFederationProvider::new(executor)); + let schema_provider = + Arc::new(SQLSchemaProvider::new_with_tables(provider, known_tables).await?); + + // Local context + let state = datafusion_federation::default_session_state(); + overwrite_default_schema(&state, schema_provider)?; + let ctx = SessionContext::new_with_state(state); + + // Run query + let query = r#"SELECT * from test"#; + let df = ctx.sql(query).await?; + + // let explain = df.clone().explain(true, false)?; + // explain.show().await?; + + df.show().await +} + +fn overwrite_default_schema(state: &SessionState, schema: Arc) -> Result<()> { + let options = &state.config().options().catalog; + let catalog = state + .catalog_list() + .catalog(options.default_catalog.as_str()) + .unwrap(); + + catalog.register_schema(options.default_schema.as_str(), schema)?; + + Ok(()) +} + +pub struct InMemorySQLExecutor { + session: Arc, +} + +impl InMemorySQLExecutor { + pub fn new(session: Arc) -> Self { + Self { session } + } +} + +#[async_trait] +impl SQLExecutor for InMemorySQLExecutor { + fn name(&self) -> &str { + "in_memory_sql_executor" + } + + fn compute_context(&self) -> Option { + None + } + + fn execute(&self, sql: &str, schema: SchemaRef) -> Result { + // Execute it using the remote datafusion session context + let future_stream = _execute(self.session.clone(), sql.to_string()); + let stream = futures::stream::once(future_stream).try_flatten(); + Ok(Box::pin(RecordBatchStreamAdapter::new( + schema.clone(), + stream, + ))) + } + + async fn table_names(&self) -> Result> { + Err(DataFusionError::NotImplemented( + "table inference not implemented".to_string(), + )) + } + + async fn get_table_schema(&self, table_name: &str) -> Result { + let sql = format!("select * from {table_name} limit 1"); + let df = self.session.sql(&sql).await?; + let schema = df.schema().as_arrow().clone(); + Ok(Arc::new(schema)) + } + + fn dialect(&self) -> Arc { + Arc::new(GenericDialect {}) + } +} + +async fn _execute(ctx: 
Arc, sql: String) -> Result { + ctx.sql(&sql).await?.execute_stream().await +} diff --git a/datafusion-federation/examples/test.csv b/datafusion-federation/examples/test.csv new file mode 100644 index 0000000..811d276 --- /dev/null +++ b/datafusion-federation/examples/test.csv @@ -0,0 +1,4 @@ +foo,bar +a,1 +b,2 +c,3 \ No newline at end of file From 49f8be0d22d90f0697c0f723faf2f7a4f6cf76b3 Mon Sep 17 00:00:00 2001 From: hozan23 Date: Thu, 29 Aug 2024 11:24:03 +0200 Subject: [PATCH 28/31] fix rebase conflicts --- datafusion-federation/Cargo.toml | 7 +- datafusion-federation/examples/df-csv.rs | 4 +- datafusion-federation/src/lib.rs | 4 +- datafusion-federation/src/schema_cast.rs | 2 +- .../src/schema_cast/intervals_cast.rs | 27 +++- .../src/schema_cast/record_convert.rs | 5 +- datafusion-federation/src/sql/mod.rs | 32 +++-- sources/sql/Cargo.toml | 32 ----- sources/sql/src/connectorx/executor.rs | 126 ------------------ 9 files changed, 48 insertions(+), 191 deletions(-) delete mode 100644 sources/sql/Cargo.toml delete mode 100644 sources/sql/src/connectorx/executor.rs diff --git a/datafusion-federation/Cargo.toml b/datafusion-federation/Cargo.toml index 2b9a2b9..1d80673 100644 --- a/datafusion-federation/Cargo.toml +++ b/datafusion-federation/Cargo.toml @@ -17,19 +17,20 @@ all-features = true no-default-features = true [features] -sql = ["futures"] +sql = [] [dependencies] +futures.workspace = true async-trait.workspace = true datafusion.workspace = true async-stream.workspace = true -futures.workspace = true arrow-json.workspace = true -futures = { version = "0.3.30", optional = true } [dev-dependencies] tokio = { version = "1.39.3", features = ["full"] } +tracing-subscriber = { version = "0.3.18", features = ["env-filter"] } +tracing = "0.1.40" [[example]] name = "df-csv" diff --git a/datafusion-federation/examples/df-csv.rs b/datafusion-federation/examples/df-csv.rs index 24ac495..0d83f63 100644 --- a/datafusion-federation/examples/df-csv.rs +++ 
b/datafusion-federation/examples/df-csv.rs @@ -10,7 +10,7 @@ use datafusion::{ options::CsvReadOptions, }, physical_plan::{stream::RecordBatchStreamAdapter, SendableRecordBatchStream}, - sql::sqlparser::dialect::{Dialect, GenericDialect}, + sql::unparser::dialect::{DefaultDialect, Dialect}, }; use datafusion_federation::sql::{SQLExecutor, SQLFederationProvider, SQLSchemaProvider}; use futures::TryStreamExt; @@ -106,7 +106,7 @@ impl SQLExecutor for InMemorySQLExecutor { } fn dialect(&self) -> Arc { - Arc::new(GenericDialect {}) + Arc::new(DefaultDialect {}) } } diff --git a/datafusion-federation/src/lib.rs b/datafusion-federation/src/lib.rs index abb729a..e6ec438 100644 --- a/datafusion-federation/src/lib.rs +++ b/datafusion-federation/src/lib.rs @@ -15,10 +15,12 @@ use datafusion::{ optimizer::{optimizer::Optimizer, OptimizerRule}, }; - pub use optimizer::{get_table_source, FederationOptimizerRule}; pub use plan_node::{FederatedPlanNode, FederatedQueryPlanner, FederationPlanner}; pub use table_provider::{FederatedTableProviderAdaptor, FederatedTableSource}; + +// TODO clean up this +// TODO move schema_cast.rs to schema_cast directory pub mod schema_cast; pub fn default_session_state() -> SessionState { diff --git a/datafusion-federation/src/schema_cast.rs b/datafusion-federation/src/schema_cast.rs index d23a470..f38f65c 100644 --- a/datafusion-federation/src/schema_cast.rs +++ b/datafusion-federation/src/schema_cast.rs @@ -14,8 +14,8 @@ use std::sync::Arc; mod intervals_cast; mod lists_cast; -mod struct_cast; pub mod record_convert; +mod struct_cast; #[derive(Debug)] #[allow(clippy::module_name_repetitions)] diff --git a/datafusion-federation/src/schema_cast/intervals_cast.rs b/datafusion-federation/src/schema_cast/intervals_cast.rs index 3e445c5..5fbd806 100644 --- a/datafusion-federation/src/schema_cast/intervals_cast.rs +++ b/datafusion-federation/src/schema_cast/intervals_cast.rs @@ -50,8 +50,7 @@ pub(crate) fn cast_interval_monthdaynano_to_daytime( let 
interval_monthdaynano_array = interval_monthdaynano_array .as_any() .downcast_ref::() - .ok_or_else(|| - ArrowError::CastError("Failed to cast IntervalMonthDayNanoArray: Unable to downcast to IntervalMonthDayNanoArray".to_string()))?; + .ok_or_else(|| ArrowError::CastError("Failed to cast IntervalMonthDayNanoArray: Unable to downcast to IntervalMonthDayNanoArray".to_string()))?; let mut interval_daytime_builder = IntervalDayTimeBuilder::with_capacity(interval_monthdaynano_array.len()); @@ -78,8 +77,10 @@ pub(crate) fn cast_interval_monthdaynano_to_daytime( #[cfg(test)] mod test { use datafusion::arrow::{ - array::{RecordBatch, IntervalDayTimeArray, IntervalYearMonthArray}, - datatypes::{DataType, Field, Schema, SchemaRef, IntervalUnit, IntervalMonthDayNano, IntervalDayTime}, + array::{IntervalDayTimeArray, IntervalYearMonthArray, RecordBatch}, + datatypes::{ + DataType, Field, IntervalDayTime, IntervalMonthDayNano, IntervalUnit, Schema, SchemaRef, + }, }; use crate::schema_cast::record_convert::try_cast_to; @@ -88,9 +89,21 @@ mod test { fn input_schema() -> SchemaRef { Arc::new(Schema::new(vec![ - Field::new("interval_daytime", DataType::Interval(IntervalUnit::MonthDayNano), false), - Field::new("interval_monthday_nano", DataType::Interval(IntervalUnit::MonthDayNano), false), - Field::new("interval_yearmonth", DataType::Interval(IntervalUnit::MonthDayNano), false), + Field::new( + "interval_daytime", + DataType::Interval(IntervalUnit::MonthDayNano), + false, + ), + Field::new( + "interval_monthday_nano", + DataType::Interval(IntervalUnit::MonthDayNano), + false, + ), + Field::new( + "interval_yearmonth", + DataType::Interval(IntervalUnit::MonthDayNano), + false, + ), ])) } diff --git a/datafusion-federation/src/schema_cast/record_convert.rs b/datafusion-federation/src/schema_cast/record_convert.rs index 140ca38..a20401a 100644 --- a/datafusion-federation/src/schema_cast/record_convert.rs +++ b/datafusion-federation/src/schema_cast/record_convert.rs @@ -1,7 +1,7 @@ 
use datafusion::arrow::{ array::{Array, RecordBatch}, compute::cast, - datatypes::{DataType, IntervalUnit, SchemaRef} + datatypes::{DataType, IntervalUnit, SchemaRef}, }; use std::sync::Arc; @@ -9,7 +9,8 @@ use super::{ intervals_cast::{ cast_interval_monthdaynano_to_daytime, cast_interval_monthdaynano_to_yearmonth, }, - lists_cast::{cast_string_to_fixed_size_list, cast_string_to_large_list, cast_string_to_list}, struct_cast::cast_string_to_struct, + lists_cast::{cast_string_to_fixed_size_list, cast_string_to_large_list, cast_string_to_list}, + struct_cast::cast_string_to_struct, }; pub type Result = std::result::Result; diff --git a/datafusion-federation/src/sql/mod.rs b/datafusion-federation/src/sql/mod.rs index e224483..b305bd7 100644 --- a/datafusion-federation/src/sql/mod.rs +++ b/datafusion-federation/src/sql/mod.rs @@ -1,15 +1,22 @@ mod executor; mod schema; -use core::fmt; -use std::{any::Any, collections::HashMap, sync::Arc, vec}; +use std::{any::Any, collections::HashMap, fmt, sync::Arc, vec}; use async_trait::async_trait; use datafusion::{ arrow::datatypes::{Schema, SchemaRef}, + common::Column, error::Result, execution::{context::SessionState, TaskContext}, - logical_expr::{Extension, LogicalPlan}, + logical_expr::{ + expr::{ + AggregateFunction, Alias, Exists, InList, InSubquery, ScalarFunction, Sort, Unnest, + WindowFunction, + }, + Between, BinaryExpr, Case, Cast, Expr, Extension, GroupingSet, Like, LogicalPlan, Subquery, + TryCast, + }, optimizer::{optimizer::Optimizer, OptimizerConfig, OptimizerRule}, physical_expr::EquivalenceProperties, physical_plan::{ @@ -21,20 +28,13 @@ use datafusion::{ TableReference, }, }; -use datafusion_federation::{ - get_table_source, schema_cast, FederatedPlanNode, FederationPlanner, FederationProvider, -}; - - - -#[cfg(feature = "connectorx")] -pub mod connectorx; - pub use executor::{SQLExecutor, SQLExecutorRef}; pub use schema::{MultiSchemaProvider, SQLSchemaProvider, SQLTableSource}; -use crate::{FederatedPlanNode, 
FederationPlanner, FederationProvider}; +use crate::{ + get_table_source, schema_cast, FederatedPlanNode, FederationPlanner, FederationProvider, +}; // #[macro_use] // extern crate derive_builder; @@ -184,9 +184,7 @@ fn rewrite_column_name_in_expr( } // Find the first occurrence of table_ref_str starting from start_pos - let Some(idx) = col_name[start_pos..].find(table_ref_str) else { - return None; - }; + let idx = col_name[start_pos..].find(table_ref_str)?; // Calculate the absolute index of the occurrence in string as the index above is relative to start_pos let idx = start_pos + idx; @@ -712,6 +710,7 @@ impl ExecutionPlan for VirtualExecutionPlan { #[cfg(test)] mod tests { + use crate::FederatedTableProviderAdaptor; use datafusion::{ arrow::datatypes::{DataType, Field}, catalog::SchemaProvider, @@ -723,7 +722,6 @@ mod tests { logical_expr::LogicalPlanBuilder, sql::{unparser::dialect::DefaultDialect, unparser::dialect::Dialect}, }; - use datafusion_federation::FederatedTableProviderAdaptor; use super::*; diff --git a/sources/sql/Cargo.toml b/sources/sql/Cargo.toml deleted file mode 100644 index b5938c7..0000000 --- a/sources/sql/Cargo.toml +++ /dev/null @@ -1,32 +0,0 @@ -[package] -name = "datafusion-federation-sql" -version.workspace = true -edition.workspace = true -license.workspace = true - -[lib] -name = "datafusion_federation_sql" -path = "src/lib.rs" - -[dependencies] -async-trait.workspace = true -# connectorx = { version = "0.3.2", features = ["src_sqlite"] } -# https://github.com/sfu-db/connector-x/pull/555 -connectorx = { git = "https://github.com/devinjdangelo/connector-x.git", features = [ - "dst_arrow", - "src_sqlite" -], optional = true } -datafusion.workspace = true - -# XXX use the release verion on crates.io -datafusion-federation.path = "../../datafusion-federation" - -futures = "0.3.30" -tokio = "1.35.1" -tracing = "0.1.40" - -[features] -connectorx = ["dep:connectorx"] - -[dev-dependencies] -tracing-subscriber = { version = "0.3.18", 
features = ["env-filter"] } diff --git a/sources/sql/src/connectorx/executor.rs b/sources/sql/src/connectorx/executor.rs deleted file mode 100644 index 38a8f4e..0000000 --- a/sources/sql/src/connectorx/executor.rs +++ /dev/null @@ -1,126 +0,0 @@ -use async_trait::async_trait; -use connectorx::{ - destinations::arrow::ArrowDestinationError, - errors::{ConnectorXError, ConnectorXOutError}, - prelude::{get_arrow, CXQuery, SourceConn, SourceType}, -}; -use datafusion::{ - arrow::datatypes::{Field, Schema, SchemaRef}, - error::{DataFusionError, Result}, - physical_plan::{ - stream::RecordBatchStreamAdapter, EmptyRecordBatchStream, SendableRecordBatchStream, - }, - sql::unparser::dialect::{DefaultDialect, Dialect, PostgreSqlDialect, SqliteDialect}, -}; -use futures::executor::block_on; -use std::sync::Arc; -use tokio::task; - -use crate::executor::SQLExecutor; - -pub struct CXExecutor { - context: String, - conn: SourceConn, -} - -impl CXExecutor { - pub fn new(dsn: String) -> Result { - let conn = SourceConn::try_from(dsn.as_str()).map_err(cx_error_to_df)?; - Ok(Self { context: dsn, conn }) - } - - pub fn new_with_conn(conn: SourceConn) -> Self { - Self { - context: conn.conn.to_string(), - conn, - } - } - - pub fn context(&mut self, context: String) { - self.context = context; - } -} - -fn cx_error_to_df(err: ConnectorXError) -> DataFusionError { - DataFusionError::External(format!("ConnectorX: {err:?}").into()) -} - -#[async_trait] -impl SQLExecutor for CXExecutor { - fn name(&self) -> &str { - "connector_x_executor" - } - fn compute_context(&self) -> Option { - Some(self.context.clone()) - } - - fn execute(&self, sql: &str, schema: SchemaRef) -> Result { - let conn = self.conn.clone(); - let query: CXQuery = sql.into(); - - let mut dst = block_on(task::spawn_blocking(move || -> Result<_, _> { - get_arrow(&conn, None, &[query.clone()]).map_err(cx_out_error_to_df) - })) - .map_err(|err| DataFusionError::External(err.to_string().into()))??; - let stream = if let 
Some(batch) = dst.record_batch().map_err(cx_dst_error_to_df)? { - futures::stream::once(async move { Ok(batch) }) - } else { - return Ok(Box::pin(EmptyRecordBatchStream::new(Arc::new( - Schema::empty(), - )))); - }; - - Ok(Box::pin(RecordBatchStreamAdapter::new(schema, stream))) - } - - async fn table_names(&self) -> Result> { - Err(DataFusionError::NotImplemented( - "connector_x source: table inference not implemented".to_string(), - )) - } - - async fn get_table_schema(&self, table_name: &str) -> Result { - let conn = self.conn.clone(); - let query: CXQuery = format!("select * from {table_name} limit 1") - .as_str() - .into(); - - let dst = get_arrow(&conn, None, &[query.clone()]).map_err(cx_out_error_to_df)?; - let schema = schema_to_lowercase(dst.arrow_schema()); - Ok(schema) - } - - fn dialect(&self) -> Arc { - match &self.conn.ty { - SourceType::Postgres => Arc::new(PostgreSqlDialect {}), - SourceType::SQLite => Arc::new(SqliteDialect {}), - _ => Arc::new(DefaultDialect {}), - } - } -} - -fn cx_dst_error_to_df(err: ArrowDestinationError) -> DataFusionError { - DataFusionError::External(format!("ConnectorX failed to run query: {err:?}").into()) -} - -/// Get the schema with lowercase field names -fn schema_to_lowercase(schema: SchemaRef) -> SchemaRef { - // DF needs lower case schema - let lower_fields: Vec<_> = schema - .fields - .iter() - .map(|f| { - Field::new( - f.name().to_ascii_lowercase(), - f.data_type().clone(), - f.is_nullable(), - ) - }) - .collect(); - - Arc::new(Schema::new(lower_fields)) -} - -fn cx_out_error_to_df(err: ConnectorXOutError) -> DataFusionError { - DataFusionError::External(format!("ConnectorX failed to run query: {err:?}").into()) -} From ffdbe8705c5c255c8f8d4a537fe2e2798025f485 Mon Sep 17 00:00:00 2001 From: hozan23 Date: Thu, 29 Aug 2024 12:53:45 +0200 Subject: [PATCH 29/31] cargo fmt --- datafusion-federation/src/lib.rs | 1 - datafusion-federation/src/sql/mod.rs | 1 - 2 files changed, 2 deletions(-) diff --git 
a/datafusion-federation/src/lib.rs b/datafusion-federation/src/lib.rs index fc7eb5b..e6ec438 100644 --- a/datafusion-federation/src/lib.rs +++ b/datafusion-federation/src/lib.rs @@ -23,7 +23,6 @@ pub use table_provider::{FederatedTableProviderAdaptor, FederatedTableSource}; // TODO move schema_cast.rs to schema_cast directory pub mod schema_cast; - pub fn default_session_state() -> SessionState { let rules = default_optimizer_rules(); SessionStateBuilder::new() diff --git a/datafusion-federation/src/sql/mod.rs b/datafusion-federation/src/sql/mod.rs index c8eb8a4..63b81d3 100644 --- a/datafusion-federation/src/sql/mod.rs +++ b/datafusion-federation/src/sql/mod.rs @@ -964,4 +964,3 @@ mod tests { Ok(()) } } - From f9abf4301c9dfeb982cb793b1cf1d323d6e8a18d Mon Sep 17 00:00:00 2001 From: hozan23 Date: Thu, 29 Aug 2024 13:10:55 +0200 Subject: [PATCH 30/31] remove datafusion-federation/analyzer.rs --- datafusion-federation/src/analyzer.rs | 183 -------------------------- 1 file changed, 183 deletions(-) delete mode 100644 datafusion-federation/src/analyzer.rs diff --git a/datafusion-federation/src/analyzer.rs b/datafusion-federation/src/analyzer.rs deleted file mode 100644 index 7760734..0000000 --- a/datafusion-federation/src/analyzer.rs +++ /dev/null @@ -1,183 +0,0 @@ -use std::sync::Arc; - -use datafusion::{ - common::Column, - config::ConfigOptions, - datasource::source_as_provider, - error::Result, - logical_expr::{Expr, LogicalPlan, Projection, TableScan, TableSource}, - optimizer::analyzer::AnalyzerRule, - sql::TableReference, -}; - -use crate::{FederatedTableProviderAdaptor, FederatedTableSource, FederationProviderRef}; - -#[derive(Default)] -pub struct FederationAnalyzerRule {} - -impl AnalyzerRule for FederationAnalyzerRule { - // Walk over the plan, look for the largest subtrees that only have - // TableScans from the same FederationProvider. - // There 'largest sub-trees' are passed to their respective FederationProvider.optimizer. 
- fn analyze(&self, plan: LogicalPlan, config: &ConfigOptions) -> Result { - let (optimized, _) = self.optimize_recursively(&plan, None, config)?; - if let Some(result) = optimized { - return Ok(result); - } - Ok(plan.clone()) - } - - /// A human readable name for this optimizer rule - fn name(&self) -> &str { - "federation_optimizer_rule" - } -} - -impl FederationAnalyzerRule { - pub fn new() -> Self { - Self::default() - } - - // optimize_recursively recursively finds the largest sub-plans that can be federated - // to a single FederationProvider. - // Returns a plan if a sub-tree was federated, otherwise None. - // Returns a FederationProvider if it covers the entire sub-tree, otherwise None. - fn optimize_recursively( - &self, - plan: &LogicalPlan, - parent: Option<&LogicalPlan>, - _config: &ConfigOptions, - ) -> Result<(Option, Option)> { - // Check if this node determines the FederationProvider - let sole_provider = self.get_federation_provider(plan)?; - if sole_provider.is_some() { - return Ok((None, sole_provider)); - } - - // optimize_inputs - let inputs = plan.inputs(); - if inputs.is_empty() { - return Ok((None, None)); - } - - let (new_inputs, providers): (Vec<_>, Vec<_>) = inputs - .iter() - .map(|i| self.optimize_recursively(i, Some(plan), _config)) - .collect::>>()? - .into_iter() - .unzip(); - - // Note: assumes provider is None if ambiguous - let first_provider = providers.first().unwrap(); - let is_singular = providers.iter().all(|p| p.is_some() && p == first_provider); - - if is_singular { - if parent.is_none() { - // federate the entire plan - if let Some(provider) = first_provider { - if let Some(optimizer) = provider.analyzer() { - let optimized = - optimizer.execute_and_check(plan.clone(), _config, |_, _| {})?; - return Ok((Some(optimized), None)); - } - return Ok((None, None)); - } - return Ok((None, None)); - } - // The largest sub-plan is higher up. 
- return Ok((None, first_provider.clone())); - } - - // The plan is ambiguous, any inputs that are not federated and - // have a sole provider, should be federated. - let new_inputs = new_inputs - .into_iter() - .enumerate() - .map(|(i, new_sub_plan)| { - if let Some(sub_plan) = new_sub_plan { - // Already federated - return Ok(sub_plan); - } - let sub_plan = inputs.get(i).unwrap(); - // Check if the input has a sole provider and can be federated. - if let Some(provider) = providers.get(i).unwrap() { - if let Some(optimizer) = provider.analyzer() { - let wrapped = wrap_projection((*sub_plan).clone())?; - - let optimized = optimizer.execute_and_check(wrapped, _config, |_, _| {})?; - return Ok(optimized); - } - // No federation for this sub-plan (no analyzer) - return Ok((*sub_plan).clone()); - } - // No federation for this sub-plan (no provider) - Ok((*sub_plan).clone()) - }) - .collect::>>()?; - - let new_plan = plan.with_new_exprs(plan.expressions(), new_inputs)?; - - Ok((Some(new_plan), None)) - } - - fn get_federation_provider(&self, plan: &LogicalPlan) -> Result> { - match plan { - LogicalPlan::TableScan(TableScan { ref source, .. }) => { - let Some(federated_source) = get_table_source(source)? 
else { - return Ok(None); - }; - let provider = federated_source.federation_provider(); - Ok(Some(provider)) - } - _ => Ok(None), - } - } -} - -fn wrap_projection(plan: LogicalPlan) -> Result { - // TODO: minimize requested columns - match plan { - LogicalPlan::Projection(_) => Ok(plan), - _ => { - let expr = plan - .schema() - .fields() - .iter() - .enumerate() - .map(|(i, f)| { - Expr::Column(Column::from_qualified_name(format!( - "{}.{}", - plan.schema() - .qualified_field(i) - .0 - .map(TableReference::table) - .unwrap_or_default(), - f.name() - ))) - }) - .collect::>(); - Ok(LogicalPlan::Projection(Projection::try_new( - expr, - Arc::new(plan), - )?)) - } - } -} - -pub fn get_table_source( - source: &Arc, -) -> Result>> { - // Unwrap TableSource - let source = source_as_provider(source)?; - - // Get FederatedTableProviderAdaptor - let Some(wrapper) = source - .as_any() - .downcast_ref::() - else { - return Ok(None); - }; - - // Return original FederatedTableSource - Ok(Some(Arc::clone(&wrapper.source))) -} From 238f836f17a98802282a4c443f09fc2a39f104ca Mon Sep 17 00:00:00 2001 From: hozan23 Date: Thu, 29 Aug 2024 13:29:59 +0200 Subject: [PATCH 31/31] datafusion-federation: export FederatedPlanner --- datafusion-federation/src/lib.rs | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/datafusion-federation/src/lib.rs b/datafusion-federation/src/lib.rs index e6ec438..f7b4ab2 100644 --- a/datafusion-federation/src/lib.rs +++ b/datafusion-federation/src/lib.rs @@ -16,7 +16,9 @@ use datafusion::{ }; pub use optimizer::{get_table_source, FederationOptimizerRule}; -pub use plan_node::{FederatedPlanNode, FederatedQueryPlanner, FederationPlanner}; +pub use plan_node::{ + FederatedPlanNode, FederatedPlanner, FederatedQueryPlanner, FederationPlanner, +}; pub use table_provider::{FederatedTableProviderAdaptor, FederatedTableSource}; // TODO clean up this