Plumbing for GroupsAccumulator

alamb · alamb · commit 363c3ee31e9a · 2023-06-29T15:42:25.000-04:00
diff --git a/datafusion/core/src/physical_plan/aggregates/row_hash2.rs b/datafusion/core/src/physical_plan/aggregates/row_hash2.rs
@@ -19,6 +19,7 @@
 //!
 //! POC demonstration of GroupByHashApproach
 
+use datafusion_physical_expr::GroupsAccumulator;
 use log::info;
 use std::sync::Arc;
 use std::task::{Context, Poll};
@@ -208,9 +209,10 @@ fn create_accumulators(
     aggregate_exprs: Vec<Arc<dyn AggregateExpr>>,
 ) -> Result<Vec<Box<dyn GroupsAccumulator>>> {
     info!("Creating accumulator for {aggregate_exprs:#?}");
-    // This code needs to instantiate the GroupsAccumulator correctly
-
-    todo!()
+    aggregate_exprs
+        .into_iter()
+        .map(|agg_expr| agg_expr.create_groups_accumulator())
+        .collect()
 }
 
 impl Stream for GroupedHashAggregateStream2 {
@@ -408,81 +410,6 @@ impl GroupedHashAggregateStream2 {
     }
 }
 
-/// An implementation of GroupAccumulator is for a single aggregate
-/// (e.g. AVG) and stores the state for *all* groups internally
-///
-/// The logical model is that each group is given a `group_index`
-/// assigned and maintained by the hash table.
-///
-/// group_indexes are contiguous (there aren't gaps), and thus it is
-/// expected that each GroupAccumulator will use something like `Vec<..>`
-/// to store the group states.
-pub trait GroupsAccumulator: Send {
-    /// updates the accumulator's state from a vector of arrays:
-    ///
-    /// * `values`: the input arguments to the accumulator
-    /// * `group_indices`:  To which groups do the rows in `values` belong, group id)
-    /// * `opt_filter`: if present, only update aggregate state using values[i] if opt_filter[i] is true
-    fn update_batch(
-        &mut self,
-        values: &[ArrayRef],
-        group_indicies: &[usize],
-        opt_filter: Option<&BooleanArray>,
-    ) -> Result<usize>;
-
-    /// Returns the final aggregate value for each group as a single
-    /// `RecordBatch`
-    ///
-    /// OPEN QUESTION: Should this method take a "batch_size: usize"
-    /// and produce a Vec<RecordBatch> as output to avoid 1) requiring
-    /// one giant intermediate buffer?
-    ///
-    /// For example, the `SUM` accumulator maintains a running sum,
-    /// and `evaluate` will produce that running sum as its output for
-    /// all groups, in group_index order
-    ///
-    /// This call should be treated as consuming (takes `self`, but it
-    /// can not be due to keeping it object save) the accumulator is
-    /// free to release / reset it is internal state after this call
-    /// and error on any subsequent call.
-    fn evaluate(&mut self) -> Result<ArrayRef>;
-
-    /// Returns any intermediate aggregate state used for multi-phase grouping
-    ///
-    /// For example, AVG returns two arrays:  `SUM` and `COUNT`.
-    ///
-    /// This call should be treated as consuming (takes `self`, but it
-    /// can not be due to keeping it object save) the accumulator is
-    /// free to release / reset it is internal state after this call
-    /// and error on any subsequent call.
-    ///
-    /// TODO: consider returning a single Array (which could be a
-    /// StructArray) instead
-    fn state(&mut self) -> Result<Vec<ArrayRef>>;
-
-    /// merges intermediate state (from `state()`) into this accumulators values
-    ///
-    /// For some aggregates (such as `SUM`), merge_batch is the same
-    /// as `update_batch`, but for some aggregrates (such as `COUNT`)
-    /// the operations differ. See [`Self::state`] for more details on how
-    /// state is used and merged.
-    ///
-    /// * `values`: arrays produced from calling `state` previously to the accumulator
-    /// * `group_indices`:  To which groups do the rows in `values` belong, group id)
-    /// * `opt_filter`: if present, only update aggregate state using values[i] if opt_filter[i] is true
-    fn merge_batch(
-        &mut self,
-        values: &[ArrayRef],
-        group_indicies: &[usize],
-        opt_filter: Option<&BooleanArray>,
-    ) -> Result<()>;
-
-    /// Amount of memory used to store the state of this
-    /// accumulator. This function is called once per batch, so it
-    /// should be O(n) to compute
-    fn size(&self) -> usize;
-}
-
 impl GroupedHashAggregateStream2 {
     /// Create an output RecordBatch with all group keys and accumulator states/values
     fn create_batch_from_map(&mut self) -> Result<RecordBatch> {
diff --git a/datafusion/physical-expr/src/aggregate/groups_accumulator.rs b/datafusion/physical-expr/src/aggregate/groups_accumulator.rs
@@ -0,0 +1,96 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Vectorized [`GroupsAccumulator`]
+
+use arrow_array::{ArrayRef, BooleanArray};
+use datafusion_common::Result;
+
+/// An implementation of `GroupsAccumulator` handles a single aggregate
+/// (e.g. AVG) and stores the state for *all* groups internally
+///
+/// The logical model is that each group is given a `group_index`
+/// assigned and maintained by the hash table.
+///
+/// group_indexes are contiguous (there aren't gaps), and thus it is
+/// expected that each `GroupsAccumulator` will use something like `Vec<..>`
+/// to store the group states.
+pub trait GroupsAccumulator: Send {
+    /// updates the accumulator's state from a vector of arrays:
+    ///
+    /// * `values`: the input arguments to the accumulator
+    /// * `group_indices`: the group index (group id) to which each row in `values` belongs
+    /// * `opt_filter`: if present, only update aggregate state using values[i] if opt_filter[i] is true
+    fn update_batch(
+        &mut self,
+        values: &[ArrayRef],
+        group_indicies: &[usize],
+        opt_filter: Option<&BooleanArray>,
+    ) -> Result<usize>;
+
+    /// Returns the final aggregate value for each group as a single
+    /// array ([`ArrayRef`])
+    ///
+    /// OPEN QUESTION: Should this method take a "batch_size: usize"
+    /// and produce a Vec<RecordBatch> as output to avoid 1) requiring
+    /// one giant intermediate buffer?
+    ///
+    /// For example, the `SUM` accumulator maintains a running sum,
+    /// and `evaluate` will produce that running sum as its output for
+    /// all groups, in group_index order
+    ///
+    /// This call should be treated as consuming (it would take `self`,
+    /// but cannot because the trait must stay object safe): the accumulator
+    /// is free to release / reset its internal state after this call
+    /// and error on any subsequent call.
+    fn evaluate(&mut self) -> Result<ArrayRef>;
+
+    /// Returns any intermediate aggregate state used for multi-phase grouping
+    ///
+    /// For example, AVG returns two arrays:  `SUM` and `COUNT`.
+    ///
+    /// This call should be treated as consuming (it would take `self`,
+    /// but cannot because the trait must stay object safe): the accumulator
+    /// is free to release / reset its internal state after this call
+    /// and error on any subsequent call.
+    ///
+    /// TODO: consider returning a single Array (which could be a
+    /// StructArray) instead
+    fn state(&mut self) -> Result<Vec<ArrayRef>>;
+
+    /// Merges intermediate state (from `state()`) into this accumulator's values
+    ///
+    /// For some aggregates (such as `SUM`), merge_batch is the same
+    /// as `update_batch`, but for some aggregates (such as `COUNT`)
+    /// the operations differ. See [`Self::state`] for more details on how
+    /// state is used and merged.
+    ///
+    /// * `values`: arrays produced from calling `state` previously to the accumulator
+    /// * `group_indices`: the group index (group id) to which each row in `values` belongs
+    /// * `opt_filter`: if present, only update aggregate state using values[i] if opt_filter[i] is true
+    fn merge_batch(
+        &mut self,
+        values: &[ArrayRef],
+        group_indicies: &[usize],
+        opt_filter: Option<&BooleanArray>,
+    ) -> Result<()>;
+
+    /// Amount of memory used to store the state of this
+    /// accumulator. This function is called once per batch, so it
+    /// should be O(n) to compute
+    fn size(&self) -> usize;
+}
diff --git a/datafusion/physical-expr/src/aggregate/mod.rs b/datafusion/physical-expr/src/aggregate/mod.rs
@@ -25,6 +25,8 @@ use std::any::Any;
 use std::fmt::Debug;
 use std::sync::Arc;
 
+use self::groups_accumulator::GroupsAccumulator;
+
 pub(crate) mod approx_distinct;
 pub(crate) mod approx_median;
 pub(crate) mod approx_percentile_cont;
@@ -45,6 +47,7 @@ pub(crate) mod median;
 #[macro_use]
 pub(crate) mod min_max;
 pub mod build_in;
+pub(crate) mod groups_accumulator;
 mod hyperloglog;
 pub mod moving_min_max;
 pub mod row_accumulator;
@@ -118,6 +121,18 @@ pub trait AggregateExpr: Send + Sync + Debug + PartialEq<dyn Any> {
         )))
     }
 
+    /// Return a specialized [`GroupsAccumulator`] that manages state for all groups
+    ///
+    /// For maximum performance, [`GroupsAccumulator`] should be
+    /// implemented rather than [`Accumulator`].
+    fn create_groups_accumulator(&self) -> Result<Box<dyn GroupsAccumulator>> {
+        // TODO: The default should implement a wrapper over
+        // self.create_accumulator
+        Err(DataFusionError::NotImplemented(format!(
+            "GroupsAccumulator hasn't been implemented for {self:?} yet"
+        )))
+    }
+
     /// Construct an expression that calculates the aggregate in reverse.
     /// Typically the "reverse" expression is itself (e.g. SUM, COUNT).
     /// For aggregates that do not support calculation in reverse,
diff --git a/datafusion/physical-expr/src/lib.rs b/datafusion/physical-expr/src/lib.rs
@@ -45,7 +45,9 @@ pub mod var_provider;
 pub mod window;
 
 // reexport this to maintain compatibility with anything that used from_slice previously
+pub use aggregate::groups_accumulator::GroupsAccumulator;
 pub use aggregate::AggregateExpr;
+
 pub use equivalence::{
     project_equivalence_properties, project_ordering_equivalence_properties,
     EquivalenceProperties, EquivalentClass, OrderingEquivalenceProperties,