// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

/// This example shows how to implement the DataFusion [`CatalogProvider`] API
/// for catalogs that are remote (require network access) and/or offer only
/// asynchronous APIs such as [Polaris], [Unity], and [Hive].
///
/// Integrating with these catalogs is a bit more complex than with local
/// catalogs because calls like `ctx.sql("SELECT * FROM db.schm.tbl")` may need
/// to perform remote network requests, but many Catalog APIs are synchronous.
/// See the documentation on [`CatalogProvider`] for more details.
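///
/// At a high level, the example proceeds in four steps: parse the SQL, resolve
/// the table references it contains, asynchronously load the referenced tables
/// into a locally cached `SchemaProvider`, and then plan and execute the query
/// as normal.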
///
/// [`CatalogProvider`]: datafusion_catalog::CatalogProvider
///
/// [Polaris]: https://github.com/apache/polaris
/// [Unity]: https://github.com/unitycatalog/unitycatalog
/// [Hive]: https://hive.apache.org/
use arrow::array::record_batch;
use arrow_schema::{Field, Fields, Schema, SchemaRef};
use async_trait::async_trait;
use datafusion::catalog::{SchemaProvider, TableProvider};
use datafusion::common::DataFusionError;
use datafusion::common::Result;
use datafusion::execution::SendableRecordBatchStream;
use datafusion::physical_plan::memory::MemoryExec;
use datafusion::physical_plan::stream::RecordBatchStreamAdapter;
use datafusion::physical_plan::ExecutionPlan;
use datafusion::prelude::{DataFrame, SessionContext};
use datafusion_catalog::Session;
use datafusion_common::{
    assert_batches_eq, internal_datafusion_err, plan_err, HashMap, TableReference,
};
use datafusion_expr::{Expr, TableType};
use futures::TryStreamExt;
use std::any::Any;
use std::sync::{Arc, Mutex};

#[tokio::main]
async fn main() -> Result<()> {
    // As always, we create a session context to interact with DataFusion
    let ctx = SessionContext::new();

    // Make a connection to the remote catalog, asynchronously, and configure it
    let remote_catalog_interface = RemoteCatalogInterface::connect().await?;

    // Register a SchemaProvider for tables in a schema named "remote_schema".
    //
    // This will let DataFusion query tables such as
    // `datafusion.remote_schema.remote_table`
    let remote_schema: Arc<dyn SchemaProvider> =
        Arc::new(RemoteSchema::new(Arc::new(remote_catalog_interface)));
    ctx.catalog("datafusion")
        .ok_or_else(|| internal_datafusion_err!("default catalog was not installed"))?
        .register_schema("remote_schema", Arc::clone(&remote_schema))?;

    // Here is a query that selects data from a table in the remote catalog.
    let sql = "SELECT * from remote_schema.remote_table";

    // The `SessionContext::sql` interface is async, but it does not
    // support asynchronous access to catalogs, so the following query errors.
    let results = ctx.sql(sql).await;
    assert_eq!(
        results.unwrap_err().to_string(),
        "Error during planning: table 'datafusion.remote_schema.remote_table' not found"
    );

    // Instead, to use a remote catalog, we must use lower level APIs on
    // SessionState (what `SessionContext::sql` does internally).
    let state = ctx.state();

    // First, parse the SQL (but don't plan it / resolve any table references)
    let dialect = state.config().options().sql_parser.dialect.as_str();
    let statement = state.sql_to_statement(sql, dialect)?;

    // Find all `TableReferences` in the parsed queries. These correspond to the
    // tables referred to by the query (in this case
    // `remote_schema.remote_table`)
    let references = state.resolve_table_references(&statement)?;
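    // Note the references come back as written in the SQL, so they may be
    // bare (`remote_table`), partially qualified (`remote_schema.remote_table`),
    // or fully qualified; the `refers_to_schema` filter below handles all
    // three forms.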

    // Call `load_tables` to load information from the remote catalog for each
    // of the referenced tables. Best practice is to fetch the information for
    // all tables required by the query at once (rather than one request per
    // table) to minimize network overhead.
    let table_names = references.iter().filter_map(|r| {
        if refers_to_schema("datafusion", "remote_schema", r) {
            Some(r.table())
        } else {
            None
        }
    });
    remote_schema
        .as_any()
        .downcast_ref::<RemoteSchema>()
        .expect("correct types")
        .load_tables(table_names)
        .await?;

    // Now that information about the remote table has been fetched, continue
    // planning the query, and it runs as normal
    let plan = state.statement_to_plan(statement).await?;
    let results = DataFrame::new(state, plan).collect().await?;
    assert_batches_eq!(
        [
            "+----+-------+",
            "| id | name  |",
            "+----+-------+",
            "| 1  | alpha |",
            "| 2  | beta  |",
            "| 3  | gamma |",
            "+----+-------+",
        ],
        &results
    );

    Ok(())
}

/// This is an example of an API that interacts with a remote catalog.
///
/// Specifically, its APIs are all `async` and thus cannot be used by
/// [`SchemaProvider`] or [`TableProvider`] directly.
#[derive(Debug)]
struct RemoteCatalogInterface {}

impl RemoteCatalogInterface {
    /// Establish a connection to the remote catalog
    pub async fn connect() -> Result<Self> {
        // In a real implementation this method might connect to a remote
        // catalog, validate credentials, cache basic information, etc
        Ok(Self {})
    }

    /// Fetches information for a specific table
    pub async fn table_info(&self, name: &str) -> Result<SchemaRef> {
        if name != "remote_table" {
            return plan_err!("Remote table not found: {}", name);
        }

        // In this example, we'll model a remote table with columns "id" and
        // "name"
        //
        // A real remote catalog would make a network call to fetch this
        // information from a remote source.
        let schema = Schema::new(Fields::from(vec![
            Field::new("id", arrow::datatypes::DataType::Int32, false),
            Field::new("name", arrow::datatypes::DataType::Utf8, false),
        ]));
        Ok(Arc::new(schema))
    }

    /// Fetches data for a table from a remote data source
    pub async fn read_data(&self, name: &str) -> Result<SendableRecordBatchStream> {
        if name != "remote_table" {
            return plan_err!("Remote table not found: {}", name);
        }

        // In a real remote catalog this call would likely perform network IO
        // to open and begin reading from a remote data source, prefetching
        // information, etc.
        //
        // In this example we are just demonstrating how the API works, so we
        // simply return some static data as a stream.
        let batch = record_batch!(
            ("id", Int32, [1, 2, 3]),
            ("name", Utf8, ["alpha", "beta", "gamma"])
        )
        .unwrap();
        let schema = batch.schema();

        let stream = futures::stream::iter([Ok(batch)]);
        Ok(Box::pin(RecordBatchStreamAdapter::new(schema, stream)))
    }
}

/// Implements the DataFusion Catalog API interface for tables
/// stored in a remote catalog.
#[derive(Debug)]
struct RemoteSchema {
    /// Connection with the remote catalog
    remote_catalog_interface: Arc<RemoteCatalogInterface>,
    /// Local cache of tables that have been preloaded from the remote
    /// catalog
    tables: Mutex<HashMap<String, Arc<dyn TableProvider>>>,
}

impl RemoteSchema {
    /// Create a new RemoteSchema
    pub fn new(remote_catalog_interface: Arc<RemoteCatalogInterface>) -> Self {
        Self {
            remote_catalog_interface,
            tables: Mutex::new(HashMap::new()),
        }
    }

    /// Load information for the specified tables from the remote source into
    /// the local cached copy.
    pub async fn load_tables(
        &self,
        references: impl IntoIterator<Item = &str>,
    ) -> Result<()> {
        for table_name in references {
            if !self.table_exist(table_name) {
                // Fetch information about the table from the remote catalog
                //
                // Note that a real remote catalog interface could return more
                // information, but at the minimum, DataFusion requires the
                // table's schema for planning.
                let schema = self.remote_catalog_interface.table_info(table_name).await?;
                let remote_table = RemoteTable::new(
                    Arc::clone(&self.remote_catalog_interface),
                    table_name,
                    schema,
                );

                // Add the table to our local cached list
                self.tables
                    .lock()
                    .expect("mutex invalid")
                    .insert(table_name.to_string(), Arc::new(remote_table));
            }
        }
        Ok(())
    }
}

/// Implement the DataFusion Catalog API for [`RemoteSchema`]
#[async_trait]
impl SchemaProvider for RemoteSchema {
    fn as_any(&self) -> &dyn Any {
        self
    }

    fn table_names(&self) -> Vec<String> {
        // Note this API is not async, so we can't call the
        // RemoteCatalogInterface directly; instead we use the cached list of
        // loaded tables
        self.tables
            .lock()
            .expect("mutex valid")
            .keys()
            .cloned()
            .collect()
    }

    // While this API is actually `async` and thus could consult the remote
    // catalog directly, it is more efficient to use a local cached copy
    // instead, which is what we model in this example
    async fn table(
        &self,
        name: &str,
    ) -> Result<Option<Arc<dyn TableProvider>>, DataFusionError> {
        // Look for any pre-loaded tables
        let table = self
            .tables
            .lock()
            .expect("mutex valid")
            .get(name)
            .map(Arc::clone);
        Ok(table)
    }

    fn table_exist(&self, name: &str) -> bool {
        // Look for any pre-loaded tables; note this function is not `async`,
        // so it can only consult the local cache
        self.tables.lock().expect("mutex valid").contains_key(name)
    }
}

/// Represents the information about a table retrieved from the remote catalog
#[derive(Debug)]
struct RemoteTable {
    /// Connection to the remote catalog
    remote_catalog_interface: Arc<RemoteCatalogInterface>,
    name: String,
    schema: SchemaRef,
}

impl RemoteTable {
    pub fn new(
        remote_catalog_interface: Arc<RemoteCatalogInterface>,
        name: impl Into<String>,
        schema: SchemaRef,
    ) -> Self {
        Self {
            remote_catalog_interface,
            name: name.into(),
            schema,
        }
    }
}

/// Implement the DataFusion Catalog API for [`RemoteTable`]
#[async_trait]
impl TableProvider for RemoteTable {
    fn as_any(&self) -> &dyn Any {
        self
    }

    fn schema(&self) -> SchemaRef {
        self.schema.clone()
    }

    fn table_type(&self) -> TableType {
        TableType::Base
    }

    async fn scan(
        &self,
        _state: &dyn Session,
        projection: Option<&Vec<usize>>,
        _filters: &[Expr],
        _limit: Option<usize>,
    ) -> Result<Arc<dyn ExecutionPlan>> {
        // Note that `scan` is called once the plan begins execution, and thus
        // is async. When interacting with remote data sources, this is the
        // place to begin establishing the remote connections and interacting
        // with the remote storage system.
        //
        // As this example is just modeling the catalog API interface, we
        // buffer the results locally in memory for simplicity.
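        //
        // A real implementation would more likely return an `ExecutionPlan`
        // whose `execute` method opens the remote stream lazily, so record
        // batches flow through the plan without being fully materialized here
        // first.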
        let batches = self
            .remote_catalog_interface
            .read_data(&self.name)
            .await?
            .try_collect()
            .await?;
        Ok(Arc::new(MemoryExec::try_new(
            &[batches],
            self.schema.clone(),
            projection.cloned(),
        )?))
    }
}

/// Return true if this `table_reference` might be for a table in the specified
/// catalog and schema.
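///
/// For example, a bare reference like `remote_table` could be in any catalog
/// and schema, so it always matches, while `other_schema.remote_table` does
/// not match when `schema_name` is `remote_schema`.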
fn refers_to_schema(
    catalog_name: &str,
    schema_name: &str,
    table_reference: &TableReference,
) -> bool {
    // Check the references are in the correct catalog and schema
    // references like foo.bar.baz
    if let Some(catalog) = table_reference.catalog() {
        if catalog != catalog_name {
            return false;
        }
    }
    // references like bar.baz
    if let Some(schema) = table_reference.schema() {
        if schema != schema_name {
            return false;
        }
    }

    true
}