Skip to content

Commit 76d833a

Browse files
authored Mar 2, 2025
Improve documentation for DataSourceExec, FileScanConfig, DataSource etc (apache#14941) (apache#14965)
1 parent 8b3cd7b commit 76d833a

File tree

4 files changed

+38
-12
lines changed

4 files changed

+38
-12
lines changed
 

‎datafusion/core/src/datasource/listing/table.rs

+10-3
Original file line numberDiff line numberDiff line change
@@ -616,6 +616,7 @@ impl ListingOptions {
616616
/// using an [`ObjectStore`] instance, for example from local files or objects
617617
/// from AWS S3.
618618
///
619+
/// # Reading Directories
619620
/// For example, given the `table1` directory (or object store prefix)
620621
///
621622
/// ```text
@@ -651,13 +652,19 @@ impl ListingOptions {
651652
/// If the query has a predicate like `WHERE date = '2024-06-01'`
652653
/// only the corresponding directory will be read.
653654
///
654-
/// `ListingTable` also supports filter and projection pushdown for formats that
655+
/// `ListingTable` also supports limit, filter and projection pushdown for formats that
655656
/// support it, such as Parquet.
656657
///
658+
/// # Implementation
659+
///
660+
/// `ListingTable` uses [`DataSourceExec`] to read the data. See that struct
661+
/// for more details.
662+
///
663+
/// [`DataSourceExec`]: crate::datasource::source::DataSourceExec
664+
///
657665
/// # Example
658666
///
659-
/// Here is an example of reading a directory of parquet files using a
660-
/// [`ListingTable`]:
667+
/// To read a directory of parquet files using a [`ListingTable`]:
661668
///
662669
/// ```no_run
663670
/// # use datafusion::prelude::SessionContext;

‎datafusion/core/src/lib.rs

+5-5
Original file line numberDiff line numberDiff line change
@@ -298,10 +298,10 @@
298298
//! (built in or user provided) ExecutionPlan
299299
//! ```
300300
//!
301-
//! DataFusion includes several built in data sources for common use
302-
//! cases, and can be extended by implementing the [`TableProvider`]
303-
//! trait. A [`TableProvider`] provides information for planning and
304-
//! an [`ExecutionPlan`]s for execution.
301+
//! A [`TableProvider`] provides information for planning and
302+
//! an [`ExecutionPlan`] for execution. DataFusion includes [`ListingTable`]
303+
//! which supports reading several common file formats, and you can support any
304+
//! new file format by implementing the [`TableProvider`] trait. See also:
305305
//!
306306
//! 1. [`ListingTable`]: Reads data from Parquet, JSON, CSV, or AVRO
307307
//! files. Supports single files or multiple files with HIVE style
@@ -314,7 +314,7 @@
314314
//!
315315
//! [`ListingTable`]: crate::datasource::listing::ListingTable
316316
//! [`MemTable`]: crate::datasource::memory::MemTable
317-
//! [`StreamingTable`]: datafusion_catalog::streaming::StreamingTable
317+
//! [`StreamingTable`]: crate::catalog::streaming::StreamingTable
318318
//!
319319
//! ## Plan Representations
320320
//!

‎datafusion/datasource/src/file.rs

+2-2
Original file line numberDiff line numberDiff line change
@@ -33,9 +33,9 @@ use datafusion_physical_plan::DisplayFormatType;
3333

3434
use object_store::ObjectStore;
3535

36-
/// Common behaviors that every file format needs to implement.
36+
/// Common behaviors that file formats need to implement.
3737
///
38-
/// See initialization examples on `ParquetSource`, `CsvSource`
38+
/// See implementation examples such as `ParquetSource`, `CsvSource`
3939
pub trait FileSource: Send + Sync {
4040
/// Creates a `dyn FileOpener` based on given parameters
4141
fn create_file_opener(

‎datafusion/datasource/src/source.rs

+21-2
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,8 @@
1515
// specific language governing permissions and limitations
1616
// under the License.
1717

18+
//! [`DataSource`] and [`DataSourceExec`]
19+
1820
use std::any::Any;
1921
use std::fmt;
2022
use std::fmt::{Debug, Formatter};
@@ -34,9 +36,15 @@ use datafusion_physical_expr::{EquivalenceProperties, Partitioning};
3436
use datafusion_physical_expr_common::sort_expr::LexOrdering;
3537

3638
/// Common behaviors in Data Sources for both Files and Memory.
37-
/// See `DataSourceExec` for physical plan implementation
3839
///
40+
/// # See Also
41+
/// * [`DataSourceExec`] for physical plan implementation
42+
/// * [`FileSource`] for file format implementations (Parquet, Json, etc)
43+
///
44+
/// # Notes
3945
/// Requires `Debug` to assist debugging
46+
///
47+
/// [`FileSource`]: crate::file::FileSource
4048
pub trait DataSource: Send + Sync + Debug {
4149
fn open(
4250
&self,
@@ -71,10 +79,21 @@ pub trait DataSource: Send + Sync + Debug {
7179
) -> datafusion_common::Result<Option<Arc<dyn ExecutionPlan>>>;
7280
}
7381

74-
/// Unified data source for file formats like JSON, CSV, AVRO, ARROW, PARQUET
82+
/// [`ExecutionPlan`] that handles different file formats like JSON, CSV, AVRO, ARROW, PARQUET
83+
///
84+
/// `DataSourceExec` implements common functionality such as applying projections,
85+
/// and caching plan properties.
86+
///
87+
/// The [`DataSource`] trait describes where to find the data for this data
88+
/// source (for example what files or what in memory partitions). Format
89+
/// specifics are implemented with the [`FileSource`] trait.
90+
///
91+
/// [`FileSource`]: crate::file::FileSource
7592
#[derive(Clone, Debug)]
7693
pub struct DataSourceExec {
94+
/// The source of the data -- for example, `FileScanConfig` or `MemorySourceConfig`
7795
data_source: Arc<dyn DataSource>,
96+
/// Cached plan properties such as sort order
7897
cache: PlanProperties,
7998
}
8099

0 commit comments

Comments
 (0)