Skip to content

Commit e8d5c17

Browse files
vincevalamb
authored and committed
Improve median performance. (apache#6837)
* Improve median performance. * Fix formatting. * Review feedback * Renamed arrays size.
1 parent e044b5c commit e8d5c17

File tree

1 file changed

+38
-10
lines changed
  • datafusion/physical-expr/src/aggregate

1 file changed

+38
-10
lines changed

datafusion/physical-expr/src/aggregate/median.rs

Lines changed: 38 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -66,6 +66,7 @@ impl AggregateExpr for Median {
6666
fn create_accumulator(&self) -> Result<Box<dyn Accumulator>> {
6767
Ok(Box::new(MedianAccumulator {
6868
data_type: self.data_type.clone(),
69+
arrays: vec![],
6970
all_values: vec![],
7071
}))
7172
}
@@ -108,29 +109,31 @@ impl PartialEq<dyn Any> for Median {
108109
/// The median accumulator accumulates the raw input values
109110
/// as `ScalarValue`s
110111
///
111-
/// The intermediate state is represented as a List of those scalars
112+
/// The intermediate state is represented as a List of scalar values updated by
113+
/// `merge_batch` and a `Vec` of `ArrayRef` that are converted to scalar values
114+
/// in the final evaluation step so that we avoid expensive conversions and
115+
/// allocations during `update_batch`.
112116
struct MedianAccumulator {
113117
data_type: DataType,
118+
arrays: Vec<ArrayRef>,
114119
all_values: Vec<ScalarValue>,
115120
}
116121

117122
impl Accumulator for MedianAccumulator {
118123
fn state(&self) -> Result<Vec<ScalarValue>> {
119-
let state =
120-
ScalarValue::new_list(Some(self.all_values.clone()), self.data_type.clone());
124+
let all_values = to_scalar_values(&self.arrays)?;
125+
let state = ScalarValue::new_list(Some(all_values), self.data_type.clone());
126+
121127
Ok(vec![state])
122128
}
123129

124130
fn update_batch(&mut self, values: &[ArrayRef]) -> Result<()> {
125131
assert_eq!(values.len(), 1);
126132
let array = &values[0];
127133

134+
// Defer conversions to scalar values to final evaluation.
128135
assert_eq!(array.data_type(), &self.data_type);
129-
self.all_values.reserve(array.len());
130-
for index in 0..array.len() {
131-
self.all_values
132-
.push(ScalarValue::try_from_array(array, index)?);
133-
}
136+
self.arrays.push(array.clone());
134137

135138
Ok(())
136139
}
@@ -157,7 +160,14 @@ impl Accumulator for MedianAccumulator {
157160
}
158161

159162
fn evaluate(&self) -> Result<ScalarValue> {
160-
if !self.all_values.iter().any(|v| !v.is_null()) {
163+
let batch_values = to_scalar_values(&self.arrays)?;
164+
165+
if !self
166+
.all_values
167+
.iter()
168+
.chain(batch_values.iter())
169+
.any(|v| !v.is_null())
170+
{
161171
return ScalarValue::try_from(&self.data_type);
162172
}
163173

@@ -166,6 +176,7 @@ impl Accumulator for MedianAccumulator {
166176
let array = ScalarValue::iter_to_array(
167177
self.all_values
168178
.iter()
179+
.chain(batch_values.iter())
169180
// ignore null values
170181
.filter(|v| !v.is_null())
171182
.cloned(),
@@ -214,13 +225,30 @@ impl Accumulator for MedianAccumulator {
214225
}
215226

216227
fn size(&self) -> usize {
217-
std::mem::size_of_val(self) + ScalarValue::size_of_vec(&self.all_values)
228+
let arrays_size: usize = self.arrays.iter().map(|a| a.len()).sum();
229+
230+
std::mem::size_of_val(self)
231+
+ ScalarValue::size_of_vec(&self.all_values)
232+
+ arrays_size
218233
- std::mem::size_of_val(&self.all_values)
219234
+ self.data_type.size()
220235
- std::mem::size_of_val(&self.data_type)
221236
}
222237
}
223238

239+
fn to_scalar_values(arrays: &[ArrayRef]) -> Result<Vec<ScalarValue>> {
240+
let num_values: usize = arrays.iter().map(|a| a.len()).sum();
241+
let mut all_values = Vec::with_capacity(num_values);
242+
243+
for array in arrays {
244+
for index in 0..array.len() {
245+
all_values.push(ScalarValue::try_from_array(&array, index)?);
246+
}
247+
}
248+
249+
Ok(all_values)
250+
}
251+
224252
/// Returns `array[indicies[indicie_index]]` as a `ScalarValue`
225253
fn scalar_at_index(
226254
array: &dyn Array,

0 commit comments

Comments
 (0)