Skip to content

Commit 08e19f4

Browse files
authored
Remove redundant upper case aliases for median, first_value and last_value (#10696)
* Change UDAF names to lowercase
* Enhance the test for duplicate function names
* Fix test
* Fix tests
* Remove the redundant `aliases` field
1 parent 156a525 commit 08e19f4

File tree

10 files changed

+124
-130
lines changed

10 files changed

+124
-130
lines changed

datafusion/functions-aggregate/src/first_last.rs

Lines changed: 4 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,6 @@ make_udaf_expr_and_func!(
4949

5050
pub struct FirstValue {
5151
signature: Signature,
52-
aliases: Vec<String>,
5352
requirement_satisfied: bool,
5453
}
5554

@@ -72,7 +71,6 @@ impl Default for FirstValue {
7271
impl FirstValue {
7372
pub fn new() -> Self {
7473
Self {
75-
aliases: vec![String::from("first_value")],
7674
signature: Signature::one_of(
7775
vec![
7876
// TODO: we can introduce more strict signature that only numeric of array types are allowed
@@ -97,7 +95,7 @@ impl AggregateUDFImpl for FirstValue {
9795
}
9896

9997
fn name(&self) -> &str {
100-
"FIRST_VALUE"
98+
"first_value"
10199
}
102100

103101
fn signature(&self) -> &Signature {
@@ -144,7 +142,7 @@ impl AggregateUDFImpl for FirstValue {
144142
}
145143

146144
fn aliases(&self) -> &[String] {
147-
&self.aliases
145+
&[]
148146
}
149147

150148
fn with_beneficial_ordering(
@@ -349,7 +347,6 @@ make_udaf_expr_and_func!(
349347

350348
pub struct LastValue {
351349
signature: Signature,
352-
aliases: Vec<String>,
353350
requirement_satisfied: bool,
354351
}
355352

@@ -372,7 +369,6 @@ impl Default for LastValue {
372369
impl LastValue {
373370
pub fn new() -> Self {
374371
Self {
375-
aliases: vec![String::from("last_value")],
376372
signature: Signature::one_of(
377373
vec![
378374
// TODO: we can introduce more strict signature that only numeric of array types are allowed
@@ -397,7 +393,7 @@ impl AggregateUDFImpl for LastValue {
397393
}
398394

399395
fn name(&self) -> &str {
400-
"LAST_VALUE"
396+
"last_value"
401397
}
402398

403399
fn signature(&self) -> &Signature {
@@ -449,7 +445,7 @@ impl AggregateUDFImpl for LastValue {
449445
}
450446

451447
fn aliases(&self) -> &[String] {
452-
&self.aliases
448+
&[]
453449
}
454450

455451
fn with_beneficial_ordering(

datafusion/functions-aggregate/src/lib.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -109,13 +109,13 @@ mod tests {
109109
let mut names = HashSet::new();
110110
for func in all_default_aggregate_functions() {
111111
assert!(
112-
names.insert(func.name().to_string()),
112+
names.insert(func.name().to_string().to_lowercase()),
113113
"duplicate function name: {}",
114114
func.name()
115115
);
116116
for alias in func.aliases() {
117117
assert!(
118-
names.insert(alias.to_string()),
118+
names.insert(alias.to_string().to_lowercase()),
119119
"duplicate function name: {}",
120120
alias
121121
);

datafusion/functions-aggregate/src/median.rs

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -58,7 +58,6 @@ make_udaf_expr_and_func!(
5858
/// result, but if cardinality is low then memory usage will also be lower.
5959
pub struct Median {
6060
signature: Signature,
61-
aliases: Vec<String>,
6261
}
6362

6463
impl Debug for Median {
@@ -79,7 +78,6 @@ impl Default for Median {
7978
impl Median {
8079
pub fn new() -> Self {
8180
Self {
82-
aliases: vec!["median".to_string()],
8381
signature: Signature::numeric(1, Volatility::Immutable),
8482
}
8583
}
@@ -91,7 +89,7 @@ impl AggregateUDFImpl for Median {
9189
}
9290

9391
fn name(&self) -> &str {
94-
"MEDIAN"
92+
"median"
9593
}
9694

9795
fn signature(&self) -> &Signature {
@@ -152,7 +150,7 @@ impl AggregateUDFImpl for Median {
152150
}
153151

154152
fn aliases(&self) -> &[String] {
155-
&self.aliases
153+
&[]
156154
}
157155
}
158156

datafusion/functions-array/src/lib.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -168,13 +168,13 @@ mod tests {
168168
let mut names = HashSet::new();
169169
for func in all_default_array_functions() {
170170
assert!(
171-
names.insert(func.name().to_string()),
171+
names.insert(func.name().to_string().to_lowercase()),
172172
"duplicate function name: {}",
173173
func.name()
174174
);
175175
for alias in func.aliases() {
176176
assert!(
177-
names.insert(alias.to_string()),
177+
names.insert(alias.to_string().to_lowercase()),
178178
"duplicate function name: {}",
179179
alias
180180
);

datafusion/functions/src/lib.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -191,13 +191,13 @@ mod tests {
191191
let mut names = HashSet::new();
192192
for func in all_default_functions() {
193193
assert!(
194-
names.insert(func.name().to_string()),
194+
names.insert(func.name().to_string().to_lowercase()),
195195
"duplicate function name: {}",
196196
func.name()
197197
);
198198
for alias in func.aliases() {
199199
assert!(
200-
names.insert(alias.to_string()),
200+
names.insert(alias.to_string().to_lowercase()),
201201
"duplicate function name: {}",
202202
alias
203203
);

datafusion/optimizer/src/replace_distinct_aggregate.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -201,9 +201,9 @@ mod tests {
201201
)?
202202
.build()?;
203203

204-
let expected = "Projection: FIRST_VALUE(test.b) ORDER BY [test.a DESC NULLS FIRST, test.c ASC NULLS LAST] AS b\
204+
let expected = "Projection: first_value(test.b) ORDER BY [test.a DESC NULLS FIRST, test.c ASC NULLS LAST] AS b\
205205
\n Sort: test.a DESC NULLS FIRST\
206-
\n Aggregate: groupBy=[[test.a]], aggr=[[FIRST_VALUE(test.b) ORDER BY [test.a DESC NULLS FIRST, test.c ASC NULLS LAST]]]\
206+
\n Aggregate: groupBy=[[test.a]], aggr=[[first_value(test.b) ORDER BY [test.a DESC NULLS FIRST, test.c ASC NULLS LAST]]]\
207207
\n TableScan: test";
208208

209209
assert_optimized_plan_eq(

datafusion/sqllogictest/test_files/aggregate.slt

Lines changed: 11 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -879,15 +879,15 @@ query TT
879879
explain select median(distinct c) from t;
880880
----
881881
logical_plan
882-
01)Projection: MEDIAN(alias1) AS MEDIAN(DISTINCT t.c)
883-
02)--Aggregate: groupBy=[[]], aggr=[[MEDIAN(alias1)]]
882+
01)Projection: median(alias1) AS median(DISTINCT t.c)
883+
02)--Aggregate: groupBy=[[]], aggr=[[median(alias1)]]
884884
03)----Aggregate: groupBy=[[t.c AS alias1]], aggr=[[]]
885885
04)------TableScan: t projection=[c]
886886
physical_plan
887-
01)ProjectionExec: expr=[MEDIAN(alias1)@0 as MEDIAN(DISTINCT t.c)]
888-
02)--AggregateExec: mode=Final, gby=[], aggr=[MEDIAN(alias1)]
887+
01)ProjectionExec: expr=[median(alias1)@0 as median(DISTINCT t.c)]
888+
02)--AggregateExec: mode=Final, gby=[], aggr=[median(alias1)]
889889
03)----CoalescePartitionsExec
890-
04)------AggregateExec: mode=Partial, gby=[], aggr=[MEDIAN(alias1)]
890+
04)------AggregateExec: mode=Partial, gby=[], aggr=[median(alias1)]
891891
05)--------AggregateExec: mode=FinalPartitioned, gby=[alias1@0 as alias1], aggr=[]
892892
06)----------CoalesceBatchesExec: target_batch_size=8192
893893
07)------------RepartitionExec: partitioning=Hash([alias1@0], 4), input_partitions=4
@@ -5024,12 +5024,12 @@ query TT
50245024
explain select first_value(c1 order by c3 desc) from convert_first_last_table;
50255025
----
50265026
logical_plan
5027-
01)Aggregate: groupBy=[[]], aggr=[[FIRST_VALUE(convert_first_last_table.c1) ORDER BY [convert_first_last_table.c3 DESC NULLS FIRST]]]
5027+
01)Aggregate: groupBy=[[]], aggr=[[first_value(convert_first_last_table.c1) ORDER BY [convert_first_last_table.c3 DESC NULLS FIRST]]]
50285028
02)--TableScan: convert_first_last_table projection=[c1, c3]
50295029
physical_plan
5030-
01)AggregateExec: mode=Final, gby=[], aggr=[FIRST_VALUE(convert_first_last_table.c1) ORDER BY [convert_first_last_table.c3 DESC NULLS FIRST]]
5030+
01)AggregateExec: mode=Final, gby=[], aggr=[first_value(convert_first_last_table.c1) ORDER BY [convert_first_last_table.c3 DESC NULLS FIRST]]
50315031
02)--CoalescePartitionsExec
5032-
03)----AggregateExec: mode=Partial, gby=[], aggr=[LAST_VALUE(convert_first_last_table.c1) ORDER BY [convert_first_last_table.c3 ASC NULLS LAST]]
5032+
03)----AggregateExec: mode=Partial, gby=[], aggr=[last_value(convert_first_last_table.c1) ORDER BY [convert_first_last_table.c3 ASC NULLS LAST]]
50335033
04)------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
50345034
05)--------CsvExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/convert_first_last.csv]]}, projection=[c1, c3], output_orderings=[[c1@0 ASC NULLS LAST], [c3@1 ASC NULLS LAST]], has_header=true
50355035

@@ -5038,11 +5038,11 @@ query TT
50385038
explain select last_value(c1 order by c2 asc) from convert_first_last_table;
50395039
----
50405040
logical_plan
5041-
01)Aggregate: groupBy=[[]], aggr=[[LAST_VALUE(convert_first_last_table.c1) ORDER BY [convert_first_last_table.c2 ASC NULLS LAST]]]
5041+
01)Aggregate: groupBy=[[]], aggr=[[last_value(convert_first_last_table.c1) ORDER BY [convert_first_last_table.c2 ASC NULLS LAST]]]
50425042
02)--TableScan: convert_first_last_table projection=[c1, c2]
50435043
physical_plan
5044-
01)AggregateExec: mode=Final, gby=[], aggr=[LAST_VALUE(convert_first_last_table.c1) ORDER BY [convert_first_last_table.c2 ASC NULLS LAST]]
5044+
01)AggregateExec: mode=Final, gby=[], aggr=[last_value(convert_first_last_table.c1) ORDER BY [convert_first_last_table.c2 ASC NULLS LAST]]
50455045
02)--CoalescePartitionsExec
5046-
03)----AggregateExec: mode=Partial, gby=[], aggr=[FIRST_VALUE(convert_first_last_table.c1) ORDER BY [convert_first_last_table.c2 DESC NULLS FIRST]]
5046+
03)----AggregateExec: mode=Partial, gby=[], aggr=[first_value(convert_first_last_table.c1) ORDER BY [convert_first_last_table.c2 DESC NULLS FIRST]]
50475047
04)------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
50485048
05)--------CsvExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/convert_first_last.csv]]}, projection=[c1, c2], output_orderings=[[c1@0 ASC NULLS LAST], [c2@1 DESC]], has_header=true

datafusion/sqllogictest/test_files/distinct_on.slt

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -89,18 +89,18 @@ query TT
8989
EXPLAIN SELECT DISTINCT ON (c1) c3, c2 FROM aggregate_test_100 ORDER BY c1, c3;
9090
----
9191
logical_plan
92-
01)Projection: FIRST_VALUE(aggregate_test_100.c3) ORDER BY [aggregate_test_100.c1 ASC NULLS LAST, aggregate_test_100.c3 ASC NULLS LAST] AS c3, FIRST_VALUE(aggregate_test_100.c2) ORDER BY [aggregate_test_100.c1 ASC NULLS LAST, aggregate_test_100.c3 ASC NULLS LAST] AS c2
92+
01)Projection: first_value(aggregate_test_100.c3) ORDER BY [aggregate_test_100.c1 ASC NULLS LAST, aggregate_test_100.c3 ASC NULLS LAST] AS c3, first_value(aggregate_test_100.c2) ORDER BY [aggregate_test_100.c1 ASC NULLS LAST, aggregate_test_100.c3 ASC NULLS LAST] AS c2
9393
02)--Sort: aggregate_test_100.c1 ASC NULLS LAST
94-
03)----Aggregate: groupBy=[[aggregate_test_100.c1]], aggr=[[FIRST_VALUE(aggregate_test_100.c3) ORDER BY [aggregate_test_100.c1 ASC NULLS LAST, aggregate_test_100.c3 ASC NULLS LAST], FIRST_VALUE(aggregate_test_100.c2) ORDER BY [aggregate_test_100.c1 ASC NULLS LAST, aggregate_test_100.c3 ASC NULLS LAST]]]
94+
03)----Aggregate: groupBy=[[aggregate_test_100.c1]], aggr=[[first_value(aggregate_test_100.c3) ORDER BY [aggregate_test_100.c1 ASC NULLS LAST, aggregate_test_100.c3 ASC NULLS LAST], first_value(aggregate_test_100.c2) ORDER BY [aggregate_test_100.c1 ASC NULLS LAST, aggregate_test_100.c3 ASC NULLS LAST]]]
9595
04)------TableScan: aggregate_test_100 projection=[c1, c2, c3]
9696
physical_plan
97-
01)ProjectionExec: expr=[FIRST_VALUE(aggregate_test_100.c3) ORDER BY [aggregate_test_100.c1 ASC NULLS LAST, aggregate_test_100.c3 ASC NULLS LAST]@1 as c3, FIRST_VALUE(aggregate_test_100.c2) ORDER BY [aggregate_test_100.c1 ASC NULLS LAST, aggregate_test_100.c3 ASC NULLS LAST]@2 as c2]
97+
01)ProjectionExec: expr=[first_value(aggregate_test_100.c3) ORDER BY [aggregate_test_100.c1 ASC NULLS LAST, aggregate_test_100.c3 ASC NULLS LAST]@1 as c3, first_value(aggregate_test_100.c2) ORDER BY [aggregate_test_100.c1 ASC NULLS LAST, aggregate_test_100.c3 ASC NULLS LAST]@2 as c2]
9898
02)--SortPreservingMergeExec: [c1@0 ASC NULLS LAST]
9999
03)----SortExec: expr=[c1@0 ASC NULLS LAST], preserve_partitioning=[true]
100-
04)------AggregateExec: mode=FinalPartitioned, gby=[c1@0 as c1], aggr=[FIRST_VALUE(aggregate_test_100.c3) ORDER BY [aggregate_test_100.c1 ASC NULLS LAST, aggregate_test_100.c3 ASC NULLS LAST], FIRST_VALUE(aggregate_test_100.c2) ORDER BY [aggregate_test_100.c1 ASC NULLS LAST, aggregate_test_100.c3 ASC NULLS LAST]]
100+
04)------AggregateExec: mode=FinalPartitioned, gby=[c1@0 as c1], aggr=[first_value(aggregate_test_100.c3) ORDER BY [aggregate_test_100.c1 ASC NULLS LAST, aggregate_test_100.c3 ASC NULLS LAST], first_value(aggregate_test_100.c2) ORDER BY [aggregate_test_100.c1 ASC NULLS LAST, aggregate_test_100.c3 ASC NULLS LAST]]
101101
05)--------CoalesceBatchesExec: target_batch_size=8192
102102
06)----------RepartitionExec: partitioning=Hash([c1@0], 4), input_partitions=4
103-
07)------------AggregateExec: mode=Partial, gby=[c1@0 as c1], aggr=[FIRST_VALUE(aggregate_test_100.c3) ORDER BY [aggregate_test_100.c1 ASC NULLS LAST, aggregate_test_100.c3 ASC NULLS LAST], FIRST_VALUE(aggregate_test_100.c2) ORDER BY [aggregate_test_100.c1 ASC NULLS LAST, aggregate_test_100.c3 ASC NULLS LAST]]
103+
07)------------AggregateExec: mode=Partial, gby=[c1@0 as c1], aggr=[first_value(aggregate_test_100.c3) ORDER BY [aggregate_test_100.c1 ASC NULLS LAST, aggregate_test_100.c3 ASC NULLS LAST], first_value(aggregate_test_100.c2) ORDER BY [aggregate_test_100.c1 ASC NULLS LAST, aggregate_test_100.c3 ASC NULLS LAST]]
104104
08)--------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
105105
09)----------------CsvExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100.csv]]}, projection=[c1, c2, c3], has_header=true
106106

0 commit comments

Comments
 (0)