Skip to content

Commit aed543e

Browse files
committed
Merge remote-tracking branch 'apache/main' into alamb/unified_rg_and_page
2 parents 3bd9b04 + e4f7b98 commit aed543e

File tree

69 files changed

+2921
-1594
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

69 files changed

+2921
-1594
lines changed

Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -109,7 +109,7 @@ parking_lot = "0.12"
109109
parquet = { version = "51.0.0", default-features = false, features = ["arrow", "async", "object_store"] }
110110
rand = "0.8"
111111
regex = "1.8"
112-
rstest = "0.20.0"
112+
rstest = "0.21.0"
113113
serde_json = "1"
114114
sqlparser = { version = "0.45.0", features = ["visitor"] }
115115
tempfile = "3"

datafusion-cli/Cargo.lock

Lines changed: 1 addition & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

datafusion-examples/examples/advanced_udaf.rs

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -105,7 +105,10 @@ impl AggregateUDFImpl for GeoMeanUdaf {
105105
true
106106
}
107107

108-
fn create_groups_accumulator(&self) -> Result<Box<dyn GroupsAccumulator>> {
108+
fn create_groups_accumulator(
109+
&self,
110+
_args: AccumulatorArgs,
111+
) -> Result<Box<dyn GroupsAccumulator>> {
109112
Ok(Box::new(GeometricMeanGroupsAccumulator::new()))
110113
}
111114
}

datafusion-examples/examples/simplify_udaf_expression.rs

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -78,9 +78,13 @@ impl AggregateUDFImpl for BetterAvgUdaf {
7878
true
7979
}
8080

81-
fn create_groups_accumulator(&self) -> Result<Box<dyn GroupsAccumulator>> {
81+
fn create_groups_accumulator(
82+
&self,
83+
_args: AccumulatorArgs,
84+
) -> Result<Box<dyn GroupsAccumulator>> {
8285
unimplemented!("should not get here");
8386
}
87+
8488
// we override method, to return new expression which would substitute
8589
// user defined function call
8690
fn simplify(&self) -> Option<AggregateFunctionSimplification> {

datafusion/common/src/utils/memory.rs

Lines changed: 134 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,134 @@
1+
// Licensed to the Apache Software Foundation (ASF) under one
2+
// or more contributor license agreements. See the NOTICE file
3+
// distributed with this work for additional information
4+
// regarding copyright ownership. The ASF licenses this file
5+
// to you under the Apache License, Version 2.0 (the
6+
// "License"); you may not use this file except in compliance
7+
// with the License. You may obtain a copy of the License at
8+
//
9+
// http://www.apache.org/licenses/LICENSE-2.0
10+
//
11+
// Unless required by applicable law or agreed to in writing,
12+
// software distributed under the License is distributed on an
13+
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
// KIND, either express or implied. See the License for the
15+
// specific language governing permissions and limitations
16+
// under the License.
17+
18+
//! This module provides a function to estimate the memory size of a HashTable prior to alloaction
19+
20+
use crate::{DataFusionError, Result};
21+
22+
/// Estimates the memory size required for a hash table prior to allocation.
23+
///
24+
/// # Parameters
25+
/// - `num_elements`: The number of elements expected in the hash table.
26+
/// - `fixed_size`: A fixed overhead size associated with the collection
27+
/// (e.g., HashSet or HashTable).
28+
/// - `T`: The type of elements stored in the hash table.
29+
///
30+
/// # Details
31+
/// This function calculates the estimated memory size by considering:
32+
/// - An overestimation of buckets to keep approximately 1/8 of them empty.
33+
/// - The total memory size is computed as:
34+
/// - The size of each entry (`T`) multiplied by the estimated number of
35+
/// buckets.
36+
/// - One byte overhead for each bucket.
37+
/// - The fixed size overhead of the collection.
38+
/// - If the estimation overflows, we return a [`DataFusionError`]
39+
///
40+
/// # Examples
41+
/// ---
42+
///
43+
/// ## From within a struct
44+
///
45+
/// ```rust
46+
/// # use datafusion_common::utils::memory::estimate_memory_size;
47+
/// # use datafusion_common::Result;
48+
///
49+
/// struct MyStruct<T> {
50+
/// values: Vec<T>,
51+
/// other_data: usize,
52+
/// }
53+
///
54+
/// impl<T> MyStruct<T> {
55+
/// fn size(&self) -> Result<usize> {
56+
/// let num_elements = self.values.len();
57+
/// let fixed_size = std::mem::size_of_val(self) +
58+
/// std::mem::size_of_val(&self.values);
59+
///
60+
/// estimate_memory_size::<T>(num_elements, fixed_size)
61+
/// }
62+
/// }
63+
/// ```
64+
/// ---
65+
/// ## With a simple collection
66+
///
67+
/// ```rust
68+
/// # use datafusion_common::utils::memory::estimate_memory_size;
69+
/// # use std::collections::HashMap;
70+
///
71+
/// let num_rows = 100;
72+
/// let fixed_size = std::mem::size_of::<HashMap<u64, u64>>();
73+
/// let estimated_hashtable_size =
74+
/// estimate_memory_size::<(u64, u64)>(num_rows,fixed_size)
75+
/// .expect("Size estimation failed");
76+
/// ```
77+
pub fn estimate_memory_size<T>(num_elements: usize, fixed_size: usize) -> Result<usize> {
78+
// For the majority of cases hashbrown overestimates the bucket quantity
79+
// to keep ~1/8 of them empty. We take this factor into account by
80+
// multiplying the number of elements with a fixed ratio of 8/7 (~1.14).
81+
// This formula leads to overallocation for small tables (< 8 elements)
82+
// but should be fine overall.
83+
num_elements
84+
.checked_mul(8)
85+
.and_then(|overestimate| {
86+
let estimated_buckets = (overestimate / 7).next_power_of_two();
87+
// + size of entry * number of buckets
88+
// + 1 byte for each bucket
89+
// + fixed size of collection (HashSet/HashTable)
90+
std::mem::size_of::<T>()
91+
.checked_mul(estimated_buckets)?
92+
.checked_add(estimated_buckets)?
93+
.checked_add(fixed_size)
94+
})
95+
.ok_or_else(|| {
96+
DataFusionError::Execution(
97+
"usize overflow while estimating the number of buckets".to_string(),
98+
)
99+
})
100+
}
101+
102+
#[cfg(test)]
103+
mod tests {
104+
use std::collections::HashSet;
105+
106+
use super::estimate_memory_size;
107+
108+
#[test]
109+
fn test_estimate_memory() {
110+
// size (bytes): 48
111+
let fixed_size = std::mem::size_of::<HashSet<u32>>();
112+
113+
// estimated buckets: 16 = (8 * 8 / 7).next_power_of_two()
114+
let num_elements = 8;
115+
// size (bytes): 128 = 16 * 4 + 16 + 48
116+
let estimated = estimate_memory_size::<u32>(num_elements, fixed_size).unwrap();
117+
assert_eq!(estimated, 128);
118+
119+
// estimated buckets: 64 = (40 * 8 / 7).next_power_of_two()
120+
let num_elements = 40;
121+
// size (bytes): 368 = 64 * 4 + 64 + 48
122+
let estimated = estimate_memory_size::<u32>(num_elements, fixed_size).unwrap();
123+
assert_eq!(estimated, 368);
124+
}
125+
126+
#[test]
127+
fn test_estimate_memory_overflow() {
128+
let num_elements = usize::MAX;
129+
let fixed_size = std::mem::size_of::<HashSet<u32>>();
130+
let estimated = estimate_memory_size::<u32>(num_elements, fixed_size);
131+
132+
assert!(estimated.is_err());
133+
}
134+
}

datafusion/common/src/utils/mod.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717

1818
//! This module provides the bisect function, which implements binary search.
1919
20+
pub mod memory;
2021
pub mod proxy;
2122

2223
use crate::error::{_internal_datafusion_err, _internal_err};

datafusion/core/src/dataframe/mod.rs

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -53,8 +53,9 @@ use datafusion_expr::{
5353
avg, count, max, min, stddev, utils::COUNT_STAR_EXPANSION,
5454
TableProviderFilterPushDown, UNNAMED_TABLE,
5555
};
56-
use datafusion_expr::{case, is_null, sum};
56+
use datafusion_expr::{case, is_null};
5757
use datafusion_functions_aggregate::expr_fn::median;
58+
use datafusion_functions_aggregate::expr_fn::sum;
5859

5960
use async_trait::async_trait;
6061

@@ -1593,9 +1594,8 @@ mod tests {
15931594
use datafusion_common::{Constraint, Constraints};
15941595
use datafusion_common_runtime::SpawnedTask;
15951596
use datafusion_expr::{
1596-
array_agg, cast, count_distinct, create_udf, expr, lit, sum,
1597-
BuiltInWindowFunction, ScalarFunctionImplementation, Volatility, WindowFrame,
1598-
WindowFunctionDefinition,
1597+
array_agg, cast, count_distinct, create_udf, expr, lit, BuiltInWindowFunction,
1598+
ScalarFunctionImplementation, Volatility, WindowFrame, WindowFunctionDefinition,
15991599
};
16001600
use datafusion_physical_expr::expressions::Column;
16011601
use datafusion_physical_plan::{get_plan_string, ExecutionPlanProperties};

0 commit comments

Comments
 (0)