Skip to content

Commit 71f157f

Browse files
alambberkaysynnadajonahgaowestonpace
authored
Add example of interacting with a remote catalog (#13722)
* Add example of interacting with a remote catalog * Update datafusion/core/src/execution/session_state.rs Co-authored-by: Berkay Şahin <[email protected]> * Apply suggestions from code review Co-authored-by: Jonah Gao <[email protected]> Co-authored-by: Weston Pace <[email protected]> * Use HashMap to hold tables --------- Co-authored-by: Berkay Şahin <[email protected]> Co-authored-by: Jonah Gao <[email protected]> Co-authored-by: Weston Pace <[email protected]>
1 parent 3467011 commit 71f157f

File tree

4 files changed

+380
-6
lines changed

4 files changed

+380
-6
lines changed

datafusion-examples/README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -77,6 +77,7 @@ cargo run --example dataframe
7777
- [`query-aws-s3.rs`](examples/external_dependency/query-aws-s3.rs): Configure `object_store` and run a query against files stored in AWS S3
7878
- [`query-http-csv.rs`](examples/query-http-csv.rs): Configure `object_store` and run a query against files vi HTTP
7979
- [`regexp.rs`](examples/regexp.rs): Examples of using regular expression functions
80+
- [`remote_catalog.rs`](examples/regexp.rs): Examples of interfacing with a remote catalog (e.g. over a network)
8081
- [`simple_udaf.rs`](examples/simple_udaf.rs): Define and invoke a User Defined Aggregate Function (UDAF)
8182
- [`simple_udf.rs`](examples/simple_udf.rs): Define and invoke a User Defined Scalar Function (UDF)
8283
- [`simple_udfw.rs`](examples/simple_udwf.rs): Define and invoke a User Defined Window Function (UDWF)
Lines changed: 369 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,369 @@
1+
// Licensed to the Apache Software Foundation (ASF) under one
2+
// or more contributor license agreements. See the NOTICE file
3+
// distributed with this work for additional information
4+
// regarding copyright ownership. The ASF licenses this file
5+
// to you under the Apache License, Version 2.0 (the
6+
// "License"); you may not use this file except in compliance
7+
// with the License. You may obtain a copy of the License at
8+
//
9+
// http://www.apache.org/licenses/LICENSE-2.0
10+
//
11+
// Unless required by applicable law or agreed to in writing,
12+
// software distributed under the License is distributed on an
13+
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
// KIND, either express or implied. See the License for the
15+
// specific language governing permissions and limitations
16+
// under the License.
17+
18+
/// This example shows how to implement the DataFusion [`CatalogProvider`] API
19+
/// for catalogs that are remote (require network access) and/or offer only
20+
/// asynchronous APIs such as [Polaris], [Unity], and [Hive].
21+
///
22+
/// Integrating with this catalogs is a bit more complex than with local
23+
/// catalogs because calls like `ctx.sql("SELECT * FROM db.schm.tbl")` may need
24+
/// to perform remote network requests, but many Catalog APIs are synchronous.
25+
/// See the documentation on [`CatalogProvider`] for more details.
26+
///
27+
/// [`CatalogProvider`]: datafusion_catalog::CatalogProvider
28+
///
29+
/// [Polaris]: https://github.com/apache/polaris
30+
/// [Unity]: https://github.com/unitycatalog/unitycatalog
31+
/// [Hive]: https://hive.apache.org/
32+
use arrow::array::record_batch;
33+
use arrow_schema::{Field, Fields, Schema, SchemaRef};
34+
use async_trait::async_trait;
35+
use datafusion::catalog::{SchemaProvider, TableProvider};
36+
use datafusion::common::DataFusionError;
37+
use datafusion::common::Result;
38+
use datafusion::execution::SendableRecordBatchStream;
39+
use datafusion::physical_plan::memory::MemoryExec;
40+
use datafusion::physical_plan::stream::RecordBatchStreamAdapter;
41+
use datafusion::physical_plan::ExecutionPlan;
42+
use datafusion::prelude::{DataFrame, SessionContext};
43+
use datafusion_catalog::Session;
44+
use datafusion_common::{
45+
assert_batches_eq, internal_datafusion_err, plan_err, HashMap, TableReference,
46+
};
47+
use datafusion_expr::{Expr, TableType};
48+
use futures::TryStreamExt;
49+
use std::any::Any;
50+
use std::sync::{Arc, Mutex};
51+
52+
#[tokio::main]
53+
async fn main() -> Result<()> {
54+
// As always, we create a session context to interact with DataFusion
55+
let ctx = SessionContext::new();
56+
57+
// Make a connection to the remote catalog, asynchronously, and configure it
58+
let remote_catalog_interface = RemoteCatalogInterface::connect().await?;
59+
60+
// Register a SchemaProvider for tables in a schema named "remote_schema".
61+
//
62+
// This will let DataFusion query tables such as
63+
// `datafusion.remote_schema.remote_table`
64+
let remote_schema: Arc<dyn SchemaProvider> =
65+
Arc::new(RemoteSchema::new(Arc::new(remote_catalog_interface)));
66+
ctx.catalog("datafusion")
67+
.ok_or_else(|| internal_datafusion_err!("default catalog was not installed"))?
68+
.register_schema("remote_schema", Arc::clone(&remote_schema))?;
69+
70+
// Here is a query that selects data from a table in the remote catalog.
71+
let sql = "SELECT * from remote_schema.remote_table";
72+
73+
// The `SessionContext::sql` interface is async, but it does not
74+
// support asynchronous access to catalogs, so the following query errors.
75+
let results = ctx.sql(sql).await;
76+
assert_eq!(
77+
results.unwrap_err().to_string(),
78+
"Error during planning: table 'datafusion.remote_schema.remote_table' not found"
79+
);
80+
81+
// Instead, to use a remote catalog, we must use lower level APIs on
82+
// SessionState (what `SessionContext::sql` does internally).
83+
let state = ctx.state();
84+
85+
// First, parse the SQL (but don't plan it / resolve any table references)
86+
let dialect = state.config().options().sql_parser.dialect.as_str();
87+
let statement = state.sql_to_statement(sql, dialect)?;
88+
89+
// Find all `TableReferences` in the parsed queries. These correspond to the
90+
// tables referred to by the query (in this case
91+
// `remote_schema.remote_table`)
92+
let references = state.resolve_table_references(&statement)?;
93+
94+
// Call `load_tables` to load information from the remote catalog for each
95+
// of the referenced tables. Best practice is to fetch the the information
96+
// for all tables required by the query once (rather than one per table) to
97+
// minimize network overhead
98+
let table_names = references.iter().filter_map(|r| {
99+
if refers_to_schema("datafusion", "remote_schema", r) {
100+
Some(r.table())
101+
} else {
102+
None
103+
}
104+
});
105+
remote_schema
106+
.as_any()
107+
.downcast_ref::<RemoteSchema>()
108+
.expect("correct types")
109+
.load_tables(table_names)
110+
.await?;
111+
112+
// Now continue planing the query after having fetched the remote table and
113+
// it can run as normal
114+
let plan = state.statement_to_plan(statement).await?;
115+
let results = DataFrame::new(state, plan).collect().await?;
116+
assert_batches_eq!(
117+
[
118+
"+----+-------+",
119+
"| id | name |",
120+
"+----+-------+",
121+
"| 1 | alpha |",
122+
"| 2 | beta |",
123+
"| 3 | gamma |",
124+
"+----+-------+",
125+
],
126+
&results
127+
);
128+
129+
Ok(())
130+
}
131+
132+
/// This is an example of an API that interacts with a remote catalog.
133+
///
134+
/// Specifically, its APIs are all `async` and thus can not be used by
135+
/// [`SchemaProvider`] or [`TableProvider`] directly.
136+
#[derive(Debug)]
137+
struct RemoteCatalogInterface {}
138+
139+
impl RemoteCatalogInterface {
140+
/// Establish a connection to the remote catalog
141+
pub async fn connect() -> Result<Self> {
142+
// In a real implementation this method might connect to a remote
143+
// catalog, validate credentials, cache basic information, etc
144+
Ok(Self {})
145+
}
146+
147+
/// Fetches information for a specific table
148+
pub async fn table_info(&self, name: &str) -> Result<SchemaRef> {
149+
if name != "remote_table" {
150+
return plan_err!("Remote table not found: {}", name);
151+
}
152+
153+
// In this example, we'll model a remote table with columns "id" and
154+
// "name"
155+
//
156+
// A real remote catalog would make a network call to fetch this
157+
// information from a remote source.
158+
let schema = Schema::new(Fields::from(vec![
159+
Field::new("id", arrow::datatypes::DataType::Int32, false),
160+
Field::new("name", arrow::datatypes::DataType::Utf8, false),
161+
]));
162+
Ok(Arc::new(schema))
163+
}
164+
165+
/// Fetches data for a table from a remote data source
166+
pub async fn read_data(&self, name: &str) -> Result<SendableRecordBatchStream> {
167+
if name != "remote_table" {
168+
return plan_err!("Remote table not found: {}", name);
169+
}
170+
171+
// In a real remote catalog this call would likely perform network IO to
172+
// open and begin reading from a remote datasource, prefetching
173+
// information, etc.
174+
//
175+
// In this example we are just demonstrating how the API works so simply
176+
// return back some static data as a stream.
177+
let batch = record_batch!(
178+
("id", Int32, [1, 2, 3]),
179+
("name", Utf8, ["alpha", "beta", "gamma"])
180+
)
181+
.unwrap();
182+
let schema = batch.schema();
183+
184+
let stream = futures::stream::iter([Ok(batch)]);
185+
Ok(Box::pin(RecordBatchStreamAdapter::new(schema, stream)))
186+
}
187+
}
188+
189+
/// Implements the DataFusion Catalog API interface for tables
190+
/// stored in a remote catalog.
191+
#[derive(Debug)]
192+
struct RemoteSchema {
193+
/// Connection with the remote catalog
194+
remote_catalog_interface: Arc<RemoteCatalogInterface>,
195+
/// Local cache of tables that have been preloaded from the remote
196+
/// catalog
197+
tables: Mutex<HashMap<String, Arc<dyn TableProvider>>>,
198+
}
199+
200+
impl RemoteSchema {
201+
/// Create a new RemoteSchema
202+
pub fn new(remote_catalog_interface: Arc<RemoteCatalogInterface>) -> Self {
203+
Self {
204+
remote_catalog_interface,
205+
tables: Mutex::new(HashMap::new()),
206+
}
207+
}
208+
209+
/// Load information for the specified tables from the remote source into
210+
/// the local cached copy.
211+
pub async fn load_tables(
212+
&self,
213+
references: impl IntoIterator<Item = &str>,
214+
) -> Result<()> {
215+
for table_name in references {
216+
if !self.table_exist(table_name) {
217+
// Fetch information about the table from the remote catalog
218+
//
219+
// Note that a real remote catalog interface could return more
220+
// information, but at the minimum, DataFusion requires the
221+
// table's schema for planing.
222+
let schema = self.remote_catalog_interface.table_info(table_name).await?;
223+
let remote_table = RemoteTable::new(
224+
Arc::clone(&self.remote_catalog_interface),
225+
table_name,
226+
schema,
227+
);
228+
229+
// Add the table to our local cached list
230+
self.tables
231+
.lock()
232+
.expect("mutex invalid")
233+
.insert(table_name.to_string(), Arc::new(remote_table));
234+
};
235+
}
236+
Ok(())
237+
}
238+
}
239+
240+
/// Implement the DataFusion Catalog API for [`RemoteSchema`]
241+
#[async_trait]
242+
impl SchemaProvider for RemoteSchema {
243+
fn as_any(&self) -> &dyn Any {
244+
self
245+
}
246+
247+
fn table_names(&self) -> Vec<String> {
248+
// Note this API is not async so we can't directly call the RemoteCatalogInterface
249+
// instead we use the cached list of loaded tables
250+
self.tables
251+
.lock()
252+
.expect("mutex valid")
253+
.keys()
254+
.cloned()
255+
.collect()
256+
}
257+
258+
// While this API is actually `async` and thus could consult a remote
259+
// catalog directly it is more efficient to use a local cached copy instead,
260+
// which is what we model in this example
261+
async fn table(
262+
&self,
263+
name: &str,
264+
) -> Result<Option<Arc<dyn TableProvider>>, DataFusionError> {
265+
// Look for any pre-loaded tables
266+
let table = self
267+
.tables
268+
.lock()
269+
.expect("mutex valid")
270+
.get(name)
271+
.map(Arc::clone);
272+
Ok(table)
273+
}
274+
275+
fn table_exist(&self, name: &str) -> bool {
276+
// Look for any pre-loaded tables, note this function is also `async`
277+
self.tables.lock().expect("mutex valid").contains_key(name)
278+
}
279+
}
280+
281+
/// Represents the information about a table retrieved from the remote catalog
282+
#[derive(Debug)]
283+
struct RemoteTable {
284+
/// connection to the remote catalog
285+
remote_catalog_interface: Arc<RemoteCatalogInterface>,
286+
name: String,
287+
schema: SchemaRef,
288+
}
289+
290+
impl RemoteTable {
291+
pub fn new(
292+
remote_catalog_interface: Arc<RemoteCatalogInterface>,
293+
name: impl Into<String>,
294+
schema: SchemaRef,
295+
) -> Self {
296+
Self {
297+
remote_catalog_interface,
298+
name: name.into(),
299+
schema,
300+
}
301+
}
302+
}
303+
304+
/// Implement the DataFusion Catalog API for [`RemoteTable`]
305+
#[async_trait]
306+
impl TableProvider for RemoteTable {
307+
fn as_any(&self) -> &dyn Any {
308+
self
309+
}
310+
311+
fn schema(&self) -> SchemaRef {
312+
self.schema.clone()
313+
}
314+
315+
fn table_type(&self) -> TableType {
316+
TableType::Base
317+
}
318+
319+
async fn scan(
320+
&self,
321+
_state: &dyn Session,
322+
projection: Option<&Vec<usize>>,
323+
_filters: &[Expr],
324+
_limit: Option<usize>,
325+
) -> Result<Arc<dyn ExecutionPlan>> {
326+
// Note that `scan` is called once the plan begin execution, and thus is
327+
// async. When interacting with remote data sources, this is the place
328+
// to begin establishing the remote connections and interacting with the
329+
// remote storage system.
330+
//
331+
// As this example is just modeling the catalog API interface, we buffer
332+
// the results locally in memory for simplicity.
333+
let batches = self
334+
.remote_catalog_interface
335+
.read_data(&self.name)
336+
.await?
337+
.try_collect()
338+
.await?;
339+
Ok(Arc::new(MemoryExec::try_new(
340+
&[batches],
341+
self.schema.clone(),
342+
projection.cloned(),
343+
)?))
344+
}
345+
}
346+
347+
/// Return true if this `table_reference` might be for a table in the specified
348+
/// catalog and schema.
349+
fn refers_to_schema(
350+
catalog_name: &str,
351+
schema_name: &str,
352+
table_reference: &TableReference,
353+
) -> bool {
354+
// Check the references are in the correct catalog and schema
355+
// references like foo.bar.baz
356+
if let Some(catalog) = table_reference.catalog() {
357+
if catalog != catalog_name {
358+
return false;
359+
}
360+
}
361+
// references like bar.baz
362+
if let Some(schema) = table_reference.schema() {
363+
if schema != schema_name {
364+
return false;
365+
}
366+
}
367+
368+
true
369+
}

0 commit comments

Comments
 (0)