fix: default values for native_datafusion scan #1756
@@ -28,8 +28,10 @@ use datafusion::datasource::source::DataSourceExec;
use datafusion::execution::object_store::ObjectStoreUrl;
use datafusion::physical_expr::expressions::BinaryExpr;
use datafusion::physical_expr::PhysicalExpr;
use datafusion::scalar::ScalarValue;
use datafusion_comet_spark_expr::EvalMode;
use itertools::Itertools;
use std::collections::HashMap;
use std::sync::Arc;

/// Initializes a DataSourceExec plan with a ParquetSource. This may be used by either the
@@ -61,12 +63,14 @@ pub(crate) fn init_datasource_exec(
    file_groups: Vec<Vec<PartitionedFile>>,
    projection_vector: Option<Vec<usize>>,
    data_filters: Option<Vec<Arc<dyn PhysicalExpr>>>,
    default_values: Option<HashMap<usize, ScalarValue>>,
    session_timezone: &str,
) -> Result<Arc<DataSourceExec>, ExecutionError> {
    let (table_parquet_options, spark_parquet_options) = get_options(session_timezone);
    let mut parquet_source = ParquetSource::new(table_parquet_options).with_schema_adapter_factory(
        Arc::new(SparkSchemaAdapterFactory::new(spark_parquet_options)),
    );
    let mut parquet_source =
        ParquetSource::new(table_parquet_options).with_schema_adapter_factory(Arc::new(
            SparkSchemaAdapterFactory::new(spark_parquet_options, default_values),
        ));
    // Create a conjunctive form of the vector because ParquetExecBuilder takes
    // a single expression
    if let Some(data_filters) = data_filters {

Review comment: We can discuss if it makes more sense to stick …

Reply: I don't think it makes sense to do that even though it might make the code a little bit simpler.
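As a hedged illustration of the new parameter, the sketch below builds the kind of `default_values` map a caller of `init_datasource_exec` would pass; the column indexes and literal values are made up for illustration and are not taken from the PR.

use datafusion::scalar::ScalarValue;
use std::collections::HashMap;

// Hypothetical sketch: build the map passed as `default_values`, keyed by the
// field's index in the required schema. Indexes and values are illustrative only.
fn example_default_values() -> Option<HashMap<usize, ScalarValue>> {
    let mut defaults = HashMap::new();
    // e.g. a column added with `ALTER TABLE ... ADD COLUMN ... DEFAULT 42`
    defaults.insert(2, ScalarValue::Int32(Some(42)));
    // e.g. a column with a string default
    defaults.insert(3, ScalarValue::Utf8(Some("n/a".to_string())));
    Some(defaults)
}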
@@ -60,9 +60,6 @@ pub struct SparkParquetOptions {
    pub allow_incompat: bool,
    /// Support casting unsigned ints to signed ints (used by Parquet SchemaAdapter)
    pub allow_cast_unsigned_ints: bool,
    /// We also use the cast logic for adapting Parquet schemas, so this flag is used
    /// for that use case
    pub is_adapting_schema: bool,
    /// Whether to always represent decimals using 128 bits. If false, the native reader may represent decimals using 32 or 64 bits, depending on the precision.
    pub use_decimal_128: bool,
    /// Whether to read dates/timestamps that were written in the legacy hybrid Julian + Gregorian calendar as it is. If false, throw exceptions instead. If the spark type is TimestampNTZ, this should be true.

Review comment: This is dead code from when we used the cast logic (and …)
@@ -78,7 +75,6 @@ impl SparkParquetOptions {
            timezone: timezone.to_string(),
            allow_incompat,
            allow_cast_unsigned_ints: false,
            is_adapting_schema: false,
            use_decimal_128: false,
            use_legacy_date_timestamp_or_ntz: false,
            case_sensitive: false,
@@ -91,7 +87,6 @@ impl SparkParquetOptions {
            timezone: "".to_string(),
            allow_incompat,
            allow_cast_unsigned_ints: false,
            is_adapting_schema: false,
            use_decimal_128: false,
            use_legacy_date_timestamp_or_ntz: false,
            case_sensitive: false,
@@ -18,11 +18,13 @@
//! Custom schema adapter that uses Spark-compatible conversions

use crate::parquet::parquet_support::{spark_parquet_convert, SparkParquetOptions};
use arrow::array::{new_null_array, RecordBatch, RecordBatchOptions};
use arrow::array::{RecordBatch, RecordBatchOptions};
use arrow::datatypes::{Schema, SchemaRef};
use datafusion::common::ColumnStatistics;
use datafusion::datasource::schema_adapter::{SchemaAdapter, SchemaAdapterFactory, SchemaMapper};
use datafusion::physical_plan::ColumnarValue;
use datafusion::scalar::ScalarValue;
use std::collections::HashMap;
use std::sync::Arc;

/// An implementation of DataFusion's `SchemaAdapterFactory` that uses a Spark-compatible
@@ -31,12 +33,17 @@ use std::sync::Arc;
pub struct SparkSchemaAdapterFactory {
    /// Spark cast options
    parquet_options: SparkParquetOptions,
    default_values: Option<HashMap<usize, ScalarValue>>,
}

impl SparkSchemaAdapterFactory {
    pub fn new(options: SparkParquetOptions) -> Self {
    pub fn new(
        options: SparkParquetOptions,
        default_values: Option<HashMap<usize, ScalarValue>>,
    ) -> Self {
        Self {
            parquet_options: options,
            default_values,
        }
    }
}
@@ -56,6 +63,7 @@ impl SchemaAdapterFactory for SparkSchemaAdapterFactory {
        Box::new(SparkSchemaAdapter {
            required_schema,
            parquet_options: self.parquet_options.clone(),
            default_values: self.default_values.clone(),
        })
    }
}
@@ -69,6 +77,7 @@ pub struct SparkSchemaAdapter {
    required_schema: SchemaRef,
    /// Spark cast options
    parquet_options: SparkParquetOptions,
    default_values: Option<HashMap<usize, ScalarValue>>,
}

impl SchemaAdapter for SparkSchemaAdapter {
@@ -134,6 +143,7 @@ impl SchemaAdapter for SparkSchemaAdapter {
            required_schema: Arc::<Schema>::clone(&self.required_schema),
            field_mappings,
            parquet_options: self.parquet_options.clone(),
            default_values: self.default_values.clone(),
        }),
        projection,
    ))
@@ -158,16 +168,7 @@ impl SchemaAdapter for SparkSchemaAdapter {
/// out of the execution of this query. Thus `map_batch` uses
/// `projected_table_schema` as it can only operate on the projected fields.
///
/// [`map_partial_batch`] is used to create a RecordBatch with a schema that
/// can be used for Parquet predicate pushdown, meaning that it may contain
/// fields which are not in the projected schema (as the fields that parquet
/// pushdown filters operate can be completely distinct from the fields that are
/// projected (output) out of the ParquetExec). `map_partial_batch` thus uses
/// `table_schema` to create the resulting RecordBatch (as it could be operating
/// on any fields in the schema).
///
/// [`map_batch`]: Self::map_batch
/// [`map_partial_batch`]: Self::map_partial_batch
#[derive(Debug)]
pub struct SchemaMapping {
    /// The schema of the table. This is the expected schema after conversion

@@ -181,6 +182,7 @@ pub struct SchemaMapping {
    field_mappings: Vec<Option<usize>>,
    /// Spark cast options
    parquet_options: SparkParquetOptions,
    default_values: Option<HashMap<usize, ScalarValue>>,
}

impl SchemaMapper for SchemaMapping {
@@ -197,15 +199,43 @@ impl SchemaMapper for SchemaMapping {
            // go through each field in the projected schema
            .fields()
            .iter()
            .enumerate()
            // and zip it with the index that maps fields from the projected table schema to the
            // projected file schema in `batch`
            .zip(&self.field_mappings)
            // and for each one...
            .map(|(field, file_idx)| {
            .map(|((field_idx, field), file_idx)| {
                file_idx.map_or_else(
                    // If this field only exists in the table, and not in the file, then we know
                    // that it's null, so just return that.
                    || Ok(new_null_array(field.data_type(), batch_rows)),
                    // If this field only exists in the table, and not in the file, then we need to
                    // populate a default value for it.
                    || {
                        if self.default_values.is_some() {
                            // We have a map of default values, see if this field is in there.
                            if let Some(value) =
                                self.default_values.as_ref().unwrap().get(&field_idx)
                            // Default value exists, construct a column from it.
                            {
                                let cv = if field.data_type() == &value.data_type() {
                                    ColumnarValue::Scalar(value.clone())
                                } else {
                                    // Data types don't match. This can happen when default values
                                    // are stored by Spark in a format different than the column's
                                    // type (e.g., INT32 when the column is DATE32)
                                    spark_parquet_convert(
                                        ColumnarValue::Scalar(value.clone()),
                                        field.data_type(),
                                        &self.parquet_options,
                                    )?
                                };
                                return cv.into_array(batch_rows);
                            }
                        }
                        // Construct an entire column of nulls. We use the Scalar representation
                        // for better performance.
                        let cv =
                            ColumnarValue::Scalar(ScalarValue::try_new_null(field.data_type())?);
                        cv.into_array(batch_rows)
                    },
                    // However, if it does exist in both, then try to cast it to the correct output
                    // type
                    |batch_idx| {

Review comment: Got rid of instantiating an entire null array in favor of a single null value for the column.
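A minimal sketch of the missing-column fallback implemented above, written as a free function rather than the method on `SchemaMapping`, and leaving out the `spark_parquet_convert` step that handles mismatched types; the function name and shape are assumptions for illustration, not the actual Comet code.

use arrow::array::ArrayRef;
use arrow::datatypes::Field;
use datafusion::common::Result;
use datafusion::physical_plan::ColumnarValue;
use datafusion::scalar::ScalarValue;
use std::collections::HashMap;

// For a column that exists in the required schema but not in the Parquet file:
// use the column's default value if one was provided, otherwise materialize a
// column of nulls from a single null scalar.
fn missing_column(
    field: &Field,
    field_idx: usize,
    num_rows: usize,
    defaults: &Option<HashMap<usize, ScalarValue>>,
) -> Result<ArrayRef> {
    if let Some(value) = defaults.as_ref().and_then(|m| m.get(&field_idx)) {
        // A default exists. The real code additionally converts the scalar when
        // its type differs from the field's type (e.g. INT32 stored for a DATE32
        // column); that conversion is omitted here.
        return ColumnarValue::Scalar(value.clone()).into_array(num_rows);
    }
    // No default: expand a single null scalar to `num_rows` rows.
    let null = ScalarValue::try_new_null(field.data_type())?;
    ColumnarValue::Scalar(null).into_array(num_rows)
}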
@@ -316,7 +346,7 @@ mod test {

        let parquet_source = Arc::new(
            ParquetSource::new(TableParquetOptions::new()).with_schema_adapter_factory(Arc::new(
                SparkSchemaAdapterFactory::new(spark_parquet_options),
                SparkSchemaAdapterFactory::new(spark_parquet_options, None),
            )),
        );
@@ -32,6 +32,7 @@ import org.apache.spark.sql.catalyst.optimizer.{BuildLeft, BuildRight, Normalize
import org.apache.spark.sql.catalyst.plans._
import org.apache.spark.sql.catalyst.plans.physical._
import org.apache.spark.sql.catalyst.util.CharVarcharCodegenUtils
import org.apache.spark.sql.catalyst.util.ResolveDefaultColumns.getExistenceDefaultValues
import org.apache.spark.sql.comet._
import org.apache.spark.sql.comet.execution.shuffle.CometShuffleExchangeExec
import org.apache.spark.sql.execution
@@ -2302,6 +2303,24 @@ object QueryPlanSerde extends Logging with CometExprShim {
      nativeScanBuilder.addAllDataFilters(dataFilters.asJava)
    }

    val possibleDefaultValues = getExistenceDefaultValues(scan.requiredSchema)
    if (possibleDefaultValues.exists(_ != null)) {
      // Our schema has default values. Serialize two lists, one with the default values
      // and another with the indexes in the schema so the native side can map missing
      // columns to these default values.
      val (defaultValues, indexes) = possibleDefaultValues.zipWithIndex
        .filter { case (expr, _) => expr != null }
        .map { case (expr, index) =>
          // ResolveDefaultColumnsUtil.getExistenceDefaultValues has evaluated these
          // expressions and they should now just be literals.
          (Literal(expr), index.toLong.asInstanceOf[java.lang.Long])
        }
        .unzip
      nativeScanBuilder.addAllDefaultValues(
        defaultValues.flatMap(exprToProto(_, scan.output)).toIterable.asJava)
      nativeScanBuilder.addAllDefaultValuesIndexes(indexes.toIterable.asJava)
    }

    // TODO: modify CometNativeScan to generate the file partitions without instantiating RDD.
    scan.inputRDD match {
      case rdd: DataSourceRDD =>
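The hunk above serializes two parallel lists: the default-value literals and the schema indexes they belong to. On the native side these can be zipped back into the `default_values` map that `init_datasource_exec` accepts. The sketch below assumes the literals have already been deserialized into `ScalarValue`s; the function and parameter names are hypothetical, not the actual planner code.

use datafusion::scalar::ScalarValue;
use std::collections::HashMap;

// Rebuild the index-to-default map from the two parallel lists sent by the JVM
// side. `literals` stands in for the deserialized default-value expressions and
// `indexes` for the serialized schema indexes; both names are assumptions.
fn rebuild_default_values(
    literals: Vec<ScalarValue>,
    indexes: Vec<i64>,
) -> Option<HashMap<usize, ScalarValue>> {
    if literals.is_empty() {
        return None;
    }
    Some(
        indexes
            .into_iter()
            .map(|i| i as usize)
            .zip(literals)
            .collect(),
    )
}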
@@ -2326,18 +2345,18 @@ object QueryPlanSerde extends Logging with CometExprShim {
    val requiredSchema = schema2Proto(scan.requiredSchema.fields)
    val dataSchema = schema2Proto(scan.relation.dataSchema.fields)

    val data_schema_idxs = scan.requiredSchema.fields.map(field => {
    val dataSchemaIndexes = scan.requiredSchema.fields.map(field => {
      scan.relation.dataSchema.fieldIndex(field.name)
    })
    val partition_schema_idxs = Array
    val partitionSchemaIndexes = Array
      .range(
        scan.relation.dataSchema.fields.length,
        scan.relation.dataSchema.length + scan.relation.partitionSchema.fields.length)

    val projection_vector = (data_schema_idxs ++ partition_schema_idxs).map(idx =>
    val projectionVector = (dataSchemaIndexes ++ partitionSchemaIndexes).map(idx =>
      idx.toLong.asInstanceOf[java.lang.Long])

    nativeScanBuilder.addAllProjectionVector(projection_vector.toIterable.asJava)
    nativeScanBuilder.addAllProjectionVector(projectionVector.toIterable.asJava)

    // In `CometScanRule`, we ensure partitionSchema is supported.
    assert(partitionSchema.length == scan.relation.partitionSchema.fields.length)

Review comment: Just fixing incorrectly formatted variable names as I find them.
Review comment: As far as I can tell, missing columns for native_iceberg_compat are handled elsewhere and the DataSourceExec will never know about them.

Reply: It's handled in the ConstantColumnReader, which is shared between native_comet and native_iceberg_compat. Also see ResolveDefaultColumns.getExistenceDefaultValues. Not quite sure what the difference between ExistenceDefaultValues and simply default values is.

Reply: From Spark javadoc - … I'll assume that the default values you get are the 'existence' defaults.