Skip to content

Commit fdb221f

Browse files
[refactor]: Convert Vec<PhysicalExpr> to HashSet<PhysicalExpr> (#13612)
* Initial commit * Change implementation to take iterator * Minor changes * Update datafusion/physical-expr/src/equivalence/class.rs Co-authored-by: Alex Huang <[email protected]> * Remove leftover comment --------- Co-authored-by: Alex Huang <[email protected]>
1 parent 86740bf commit fdb221f

File tree

3 files changed

+38
-82
lines changed

3 files changed

+38
-82
lines changed

datafusion/physical-expr-common/src/physical_expr.rs

Lines changed: 19 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -217,11 +217,24 @@ pub fn with_new_children_if_necessary(
217217
/// Returns [`Display`] able a list of [`PhysicalExpr`]
218218
///
219219
/// Example output: `[a + 1, b]`
220-
pub fn format_physical_expr_list(exprs: &[Arc<dyn PhysicalExpr>]) -> impl Display + '_ {
221-
struct DisplayWrapper<'a>(&'a [Arc<dyn PhysicalExpr>]);
222-
impl Display for DisplayWrapper<'_> {
220+
pub fn format_physical_expr_list<T>(exprs: T) -> impl Display
221+
where
222+
T: IntoIterator,
223+
T::Item: Display,
224+
T::IntoIter: Clone,
225+
{
226+
struct DisplayWrapper<I>(I)
227+
where
228+
I: Iterator + Clone,
229+
I::Item: Display;
230+
231+
impl<I> Display for DisplayWrapper<I>
232+
where
233+
I: Iterator + Clone,
234+
I::Item: Display,
235+
{
223236
fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
224-
let mut iter = self.0.iter();
237+
let mut iter = self.0.clone();
225238
write!(f, "[")?;
226239
if let Some(expr) = iter.next() {
227240
write!(f, "{}", expr)?;
@@ -233,5 +246,6 @@ pub fn format_physical_expr_list(exprs: &[Arc<dyn PhysicalExpr>]) -> impl Displa
233246
Ok(())
234247
}
235248
}
236-
DisplayWrapper(exprs)
249+
250+
DisplayWrapper(exprs.into_iter())
237251
}

datafusion/physical-expr/src/equivalence/class.rs

Lines changed: 17 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -15,15 +15,14 @@
1515
// specific language governing permissions and limitations
1616
// under the License.
1717

18-
use std::fmt::Display;
19-
use std::sync::Arc;
20-
2118
use super::{add_offset_to_expr, collapse_lex_req, ProjectionMapping};
2219
use crate::{
23-
expressions::Column, physical_expr::deduplicate_physical_exprs,
24-
physical_exprs_bag_equal, physical_exprs_contains, LexOrdering, LexRequirement,
20+
expressions::Column, physical_exprs_contains, LexOrdering, LexRequirement,
2521
PhysicalExpr, PhysicalExprRef, PhysicalSortExpr, PhysicalSortRequirement,
2622
};
23+
use indexmap::IndexSet;
24+
use std::fmt::Display;
25+
use std::sync::Arc;
2726

2827
use datafusion_common::tree_node::{Transformed, TransformedResult, TreeNode};
2928
use datafusion_common::JoinType;
@@ -190,47 +189,47 @@ pub struct EquivalenceClass {
190189
/// The expressions in this equivalence class. The order doesn't
191190
/// matter for equivalence purposes
192191
///
193-
/// TODO: use a HashSet for this instead of a Vec
194-
exprs: Vec<Arc<dyn PhysicalExpr>>,
192+
exprs: IndexSet<Arc<dyn PhysicalExpr>>,
195193
}
196194

197195
impl PartialEq for EquivalenceClass {
198196
/// Returns true if other is equal in the sense
199197
/// of bags (multi-sets), disregarding their orderings.
200198
fn eq(&self, other: &Self) -> bool {
201-
physical_exprs_bag_equal(&self.exprs, &other.exprs)
199+
self.exprs.eq(&other.exprs)
202200
}
203201
}
204202

205203
impl EquivalenceClass {
206204
/// Create a new empty equivalence class
207205
pub fn new_empty() -> Self {
208-
Self { exprs: vec![] }
206+
Self {
207+
exprs: IndexSet::new(),
208+
}
209209
}
210210

211211
// Create a new equivalence class from a pre-existing `Vec`
212-
pub fn new(mut exprs: Vec<Arc<dyn PhysicalExpr>>) -> Self {
213-
deduplicate_physical_exprs(&mut exprs);
214-
Self { exprs }
212+
pub fn new(exprs: Vec<Arc<dyn PhysicalExpr>>) -> Self {
213+
Self {
214+
exprs: exprs.into_iter().collect(),
215+
}
215216
}
216217

217218
/// Return the inner vector of expressions
218219
pub fn into_vec(self) -> Vec<Arc<dyn PhysicalExpr>> {
219-
self.exprs
220+
self.exprs.into_iter().collect()
220221
}
221222

222223
/// Return the "canonical" expression for this class (the first element)
223224
/// if any
224225
fn canonical_expr(&self) -> Option<Arc<dyn PhysicalExpr>> {
225-
self.exprs.first().cloned()
226+
self.exprs.iter().next().cloned()
226227
}
227228

228229
/// Insert the expression into this class, meaning it is known to be equal to
229230
/// all other expressions in this class
230231
pub fn push(&mut self, expr: Arc<dyn PhysicalExpr>) {
231-
if !self.contains(&expr) {
232-
self.exprs.push(expr);
233-
}
232+
self.exprs.insert(expr);
234233
}
235234

236235
/// Inserts all the expressions from other into this class
@@ -243,7 +242,7 @@ impl EquivalenceClass {
243242

244243
/// Returns true if this equivalence class contains t expression
245244
pub fn contains(&self, expr: &Arc<dyn PhysicalExpr>) -> bool {
246-
physical_exprs_contains(&self.exprs, expr)
245+
self.exprs.contains(expr)
247246
}
248247

249248
/// Returns true if this equivalence class has any entries in common with `other`

datafusion/physical-expr/src/physical_expr.rs

Lines changed: 2 additions & 59 deletions
Original file line numberDiff line numberDiff line change
@@ -65,34 +65,14 @@ pub fn physical_exprs_bag_equal(
6565
}
6666
}
6767

68-
/// This utility function removes duplicates from the given `exprs` vector.
69-
/// Note that this function does not necessarily preserve its input ordering.
70-
pub fn deduplicate_physical_exprs(exprs: &mut Vec<Arc<dyn PhysicalExpr>>) {
71-
// TODO: Once we can use `HashSet`s with `Arc<dyn PhysicalExpr>`, this
72-
// function should use a `HashSet` to reduce computational complexity.
73-
// See issue: https://github.com/apache/datafusion/issues/8027
74-
let mut idx = 0;
75-
while idx < exprs.len() {
76-
let mut rest_idx = idx + 1;
77-
while rest_idx < exprs.len() {
78-
if exprs[idx].eq(&exprs[rest_idx]) {
79-
exprs.swap_remove(rest_idx);
80-
} else {
81-
rest_idx += 1;
82-
}
83-
}
84-
idx += 1;
85-
}
86-
}
87-
8868
#[cfg(test)]
8969
mod tests {
9070
use std::sync::Arc;
9171

9272
use crate::expressions::{Column, Literal};
9373
use crate::physical_expr::{
94-
deduplicate_physical_exprs, physical_exprs_bag_equal, physical_exprs_contains,
95-
physical_exprs_equal, PhysicalExpr,
74+
physical_exprs_bag_equal, physical_exprs_contains, physical_exprs_equal,
75+
PhysicalExpr,
9676
};
9777

9878
use datafusion_common::ScalarValue;
@@ -208,41 +188,4 @@ mod tests {
208188
assert!(physical_exprs_bag_equal(list3.as_slice(), list3.as_slice()));
209189
assert!(physical_exprs_bag_equal(list4.as_slice(), list4.as_slice()));
210190
}
211-
212-
#[test]
213-
fn test_deduplicate_physical_exprs() {
214-
let lit_true = &(Arc::new(Literal::new(ScalarValue::Boolean(Some(true))))
215-
as Arc<dyn PhysicalExpr>);
216-
let lit_false = &(Arc::new(Literal::new(ScalarValue::Boolean(Some(false))))
217-
as Arc<dyn PhysicalExpr>);
218-
let lit4 = &(Arc::new(Literal::new(ScalarValue::Int32(Some(4))))
219-
as Arc<dyn PhysicalExpr>);
220-
let lit2 = &(Arc::new(Literal::new(ScalarValue::Int32(Some(2))))
221-
as Arc<dyn PhysicalExpr>);
222-
let col_a_expr = &(Arc::new(Column::new("a", 0)) as Arc<dyn PhysicalExpr>);
223-
let col_b_expr = &(Arc::new(Column::new("b", 1)) as Arc<dyn PhysicalExpr>);
224-
225-
// First vector in the tuple is arguments, second one is the expected value.
226-
let test_cases = vec![
227-
// ---------- TEST CASE 1----------//
228-
(
229-
vec![
230-
lit_true, lit_false, lit4, lit2, col_a_expr, col_a_expr, col_b_expr,
231-
lit_true, lit2,
232-
],
233-
vec![lit_true, lit_false, lit4, lit2, col_a_expr, col_b_expr],
234-
),
235-
// ---------- TEST CASE 2----------//
236-
(
237-
vec![lit_true, lit_true, lit_false, lit4],
238-
vec![lit_true, lit4, lit_false],
239-
),
240-
];
241-
for (exprs, expected) in test_cases {
242-
let mut exprs = exprs.into_iter().cloned().collect::<Vec<_>>();
243-
let expected = expected.into_iter().cloned().collect::<Vec<_>>();
244-
deduplicate_physical_exprs(&mut exprs);
245-
assert!(physical_exprs_equal(&exprs, &expected));
246-
}
247-
}
248191
}

0 commit comments

Comments
 (0)