Skip to content

Commit e08a27c

Browse files
committed
Add pycapsule interface to provide table provider to DataFusion
Signed-off-by: Tim Saucer <[email protected]>
1 parent bf94295 commit e08a27c

File tree

3 files changed

+54
-1
lines changed

3 files changed

+54
-1
lines changed

python/Cargo.toml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,9 @@ delta_kernel.workspace = true
2020
# arrow
2121
arrow-schema = { workspace = true, features = ["serde"] }
2222

23+
# datafusion
24+
datafusion-ffi = { workspace = true }
25+
2326
# serde
2427
serde = { workspace = true }
2528
serde_json = { workspace = true }

python/deltalake/table.py

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1423,6 +1423,42 @@ def repair(
14231423
def transaction_versions(self) -> Dict[str, Transaction]:
14241424
return self._table.transaction_versions()
14251425

1426+
def __datafusion_table_provider__(self) -> Any:
1427+
"""Return the DataFusion table provider PyCapsule interface.
1428+
1429+
To support DataFusion features such as push down filtering, this function will return a PyCapsule
1430+
interface that conforms to the FFI Table Provider required by DataFusion. From an end user perspective
1431+
you should not need to call this function directly. Instead you can use ``register_table_provider`` in
1432+
the DataFusion SessionContext.
1433+
1434+
Returns:
1435+
A PyCapsule DataFusion TableProvider interface.
1436+
1437+
Example:
1438+
```python
1439+
from deltalake import DeltaTable, write_deltalake
1440+
from datafusion import SessionContext
1441+
import pyarrow as pa
1442+
data = pa.table({"x": [1, 2, 3], "y": [4, 5, 6]})
1443+
write_deltalake("tmp", data)
1444+
dt = DeltaTable("tmp")
1445+
ctx = SessionContext()
1446+
ctx.register_table_provider("test", dt)
1447+
ctx.table("test").show()
1448+
```
1449+
Results in
1450+
```
1451+
DataFrame()
1452+
+---+---+
1453+
| x | y |
1454+
+---+---+
1455+
| 1 | 4 |
1456+
| 2 | 5 |
1457+
| 3 | 6 |
1458+
+---+---+
1459+
```
1460+
"""
1461+
return self._table.__datafusion_table_provider__()
14261462

14271463
class TableMerger:
14281464
"""API for various table `MERGE` commands."""

python/src/lib.rs

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,13 +6,16 @@ mod schema;
66
mod utils;
77

88
use std::collections::{HashMap, HashSet};
9+
use std::ffi::CString;
910
use std::future::IntoFuture;
1011
use std::str::FromStr;
12+
use std::sync::Arc;
1113
use std::time;
1214
use std::time::{SystemTime, UNIX_EPOCH};
1315

1416
use arrow::pyarrow::PyArrowType;
1517
use chrono::{DateTime, Duration, FixedOffset, Utc};
18+
use datafusion_ffi::table_provider::FFI_TableProvider;
1619
use delta_kernel::expressions::Scalar;
1720
use delta_kernel::schema::StructField;
1821
use deltalake::arrow::compute::concat_batches;
@@ -58,7 +61,7 @@ use futures::future::join_all;
5861
use pyo3::exceptions::{PyRuntimeError, PyValueError};
5962
use pyo3::prelude::*;
6063
use pyo3::pybacked::PyBackedStr;
61-
use pyo3::types::{PyDict, PyFrozenSet};
64+
use pyo3::types::{PyCapsule, PyDict, PyFrozenSet};
6265
use serde_json::{Map, Value};
6366

6467
use crate::error::DeltaProtocolError;
@@ -1240,6 +1243,17 @@ impl RawDeltaTable {
12401243
.map(|(app_id, transaction)| (app_id, PyTransaction::from(transaction)))
12411244
.collect()
12421245
}
1246+
1247+
fn __datafusion_table_provider__<'py>(
1248+
&self,
1249+
py: Python<'py>,
1250+
) -> PyResult<Bound<'py, PyCapsule>> {
1251+
let name = CString::new("datafusion_table_provider").unwrap();
1252+
1253+
let provider = FFI_TableProvider::new(Arc::new(self._table.clone()), false);
1254+
1255+
PyCapsule::new_bound(py, provider, Some(name.clone()))
1256+
}
12431257
}
12441258

12451259
fn set_post_commithook_properties(

0 commit comments

Comments
 (0)