Skip to content

Commit 84ac4f9

Browse files
authored
Provide field and schema metadata missing on distinct aggregations. (#12691)
* test(12687): reproducer of missing metadata bug * fix(12687): minimum change needed to fix the missing metadata
1 parent 0242767 commit 84ac4f9

File tree

3 files changed

+55
-10
lines changed

3 files changed

+55
-10
lines changed

datafusion/physical-plan/src/aggregates/mod.rs

Lines changed: 16 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ use crate::aggregates::{
2626
topk_stream::GroupedTopKAggregateStream,
2727
};
2828
use crate::metrics::{ExecutionPlanMetricsSet, MetricsSet};
29+
use crate::projection::get_field_metadata;
2930
use crate::windows::get_ordered_partition_by_indices;
3031
use crate::{
3132
DisplayFormatType, Distribution, ExecutionPlan, InputOrderMode,
@@ -795,14 +796,17 @@ fn create_schema(
795796
) -> Result<Schema> {
796797
let mut fields = Vec::with_capacity(group_expr.len() + aggr_expr.len());
797798
for (index, (expr, name)) in group_expr.iter().enumerate() {
798-
fields.push(Field::new(
799-
name,
800-
expr.data_type(input_schema)?,
801-
// In cases where we have multiple grouping sets, we will use NULL expressions in
802-
// order to align the grouping sets. So the field must be nullable even if the underlying
803-
// schema field is not.
804-
group_expr_nullable[index] || expr.nullable(input_schema)?,
805-
))
799+
fields.push(
800+
Field::new(
801+
name,
802+
expr.data_type(input_schema)?,
803+
// In cases where we have multiple grouping sets, we will use NULL expressions in
804+
// order to align the grouping sets. So the field must be nullable even if the underlying
805+
// schema field is not.
806+
group_expr_nullable[index] || expr.nullable(input_schema)?,
807+
)
808+
.with_metadata(get_field_metadata(expr, input_schema).unwrap_or_default()),
809+
)
806810
}
807811

808812
match mode {
@@ -823,7 +827,10 @@ fn create_schema(
823827
}
824828
}
825829

826-
Ok(Schema::new(fields))
830+
Ok(Schema::new_with_metadata(
831+
fields,
832+
input_schema.metadata().clone(),
833+
))
827834
}
828835

829836
fn group_schema(schema: &Schema, group_count: usize) -> SchemaRef {

datafusion/physical-plan/src/projection.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -237,7 +237,7 @@ impl ExecutionPlan for ProjectionExec {
237237

238238
/// If e is a direct column reference, returns the field level
239239
/// metadata for that field, if any. Otherwise returns None
240-
fn get_field_metadata(
240+
pub(crate) fn get_field_metadata(
241241
e: &Arc<dyn PhysicalExpr>,
242242
input_schema: &Schema,
243243
) -> Option<HashMap<String, String>> {

datafusion/sqllogictest/test_files/metadata.slt

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -58,5 +58,43 @@ WHERE "data"."id" = "samples"."id";
5858
1
5959
3
6060

61+
62+
63+
# Regression test: prevent field metadata loss per https://github.com/apache/datafusion/issues/12687
64+
query I
65+
select count(distinct name) from table_with_metadata;
66+
----
67+
2
68+
69+
# Regression test: prevent field metadata loss per https://github.com/apache/datafusion/issues/12687
70+
query I
71+
select approx_median(distinct id) from table_with_metadata;
72+
----
73+
2
74+
75+
# Regression test: prevent field metadata loss per https://github.com/apache/datafusion/issues/12687
76+
statement ok
77+
select array_agg(distinct id) from table_with_metadata;
78+
79+
query I
80+
select distinct id from table_with_metadata order by id;
81+
----
82+
1
83+
3
84+
NULL
85+
86+
query I
87+
select count(id) from table_with_metadata;
88+
----
89+
2
90+
91+
query I
92+
select count(id) cnt from table_with_metadata group by name order by cnt;
93+
----
94+
0
95+
1
96+
1
97+
98+
6199
statement ok
62100
drop table table_with_metadata;

0 commit comments

Comments
 (0)