Skip to content

Commit 0e52274

Browse files
authored
feat: Add support for --mem-pool-type and --memory-limit options to multiple benchmarks (#14642)
* Add support --mem-pool-type and --memory-limit options for all benchmarks * Add --sort-spill-reservation-bytes option
1 parent 469f18b commit 0e52274

File tree

7 files changed

+137
-64
lines changed

7 files changed

+137
-64
lines changed

benchmarks/src/bin/external_aggr.rs

Lines changed: 17 additions & 55 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,8 @@
1717

1818
//! external_aggr binary entrypoint
1919
20+
use datafusion::execution::memory_pool::GreedyMemoryPool;
21+
use datafusion::execution::memory_pool::MemoryPool;
2022
use std::collections::HashMap;
2123
use std::path::PathBuf;
2224
use std::sync::Arc;
@@ -41,7 +43,7 @@ use datafusion::prelude::*;
4143
use datafusion_benchmarks::util::{BenchmarkRun, CommonOpt};
4244
use datafusion_common::instant::Instant;
4345
use datafusion_common::utils::get_available_parallelism;
44-
use datafusion_common::{exec_datafusion_err, exec_err, DEFAULT_PARQUET_EXTENSION};
46+
use datafusion_common::{exec_err, DEFAULT_PARQUET_EXTENSION};
4547

4648
#[derive(Debug, StructOpt)]
4749
#[structopt(
@@ -58,10 +60,6 @@ struct ExternalAggrConfig {
5860
#[structopt(short, long)]
5961
query: Option<usize>,
6062

61-
/// Memory limit (e.g. '100M', '1.5G'). If not specified, run all pre-defined memory limits for given query.
62-
#[structopt(long)]
63-
memory_limit: Option<String>,
64-
6563
/// Common options
6664
#[structopt(flatten)]
6765
common: CommonOpt,
@@ -129,10 +127,8 @@ impl ExternalAggrConfig {
129127
pub async fn run(&self) -> Result<()> {
130128
let mut benchmark_run = BenchmarkRun::new();
131129

132-
let memory_limit = match &self.memory_limit {
133-
Some(limit) => Some(Self::parse_memory_limit(limit)?),
134-
None => None,
135-
};
130+
let memory_limit = self.common.memory_limit.map(|limit| limit as u64);
131+
let mem_pool_type = self.common.mem_pool_type.as_str();
136132

137133
let query_range = match self.query {
138134
Some(query_id) => query_id..=query_id,
@@ -171,7 +167,9 @@ impl ExternalAggrConfig {
171167
human_readable_size(mem_limit as usize)
172168
));
173169

174-
let query_results = self.benchmark_query(query_id, mem_limit).await?;
170+
let query_results = self
171+
.benchmark_query(query_id, mem_limit, mem_pool_type)
172+
.await?;
175173
for iter in query_results {
176174
benchmark_run.write_iter(iter.elapsed, iter.row_count);
177175
}
@@ -187,12 +185,20 @@ impl ExternalAggrConfig {
187185
&self,
188186
query_id: usize,
189187
mem_limit: u64,
188+
mem_pool_type: &str,
190189
) -> Result<Vec<QueryResult>> {
191190
let query_name =
192191
format!("Q{query_id}({})", human_readable_size(mem_limit as usize));
193192
let config = self.common.config();
193+
let memory_pool: Arc<dyn MemoryPool> = match mem_pool_type {
194+
"fair" => Arc::new(FairSpillPool::new(mem_limit as usize)),
195+
"greedy" => Arc::new(GreedyMemoryPool::new(mem_limit as usize)),
196+
_ => {
197+
return exec_err!("Invalid memory pool type: {}", mem_pool_type);
198+
}
199+
};
194200
let runtime_env = RuntimeEnvBuilder::new()
195-
.with_memory_pool(Arc::new(FairSpillPool::new(mem_limit as usize)))
201+
.with_memory_pool(memory_pool)
196202
.build_arc()?;
197203
let state = SessionStateBuilder::new()
198204
.with_config(config)
@@ -331,22 +337,6 @@ impl ExternalAggrConfig {
331337
.partitions
332338
.unwrap_or(get_available_parallelism())
333339
}
334-
335-
/// Parse memory limit from string to number of bytes
336-
/// e.g. '1.5G', '100M' -> 1572864
337-
fn parse_memory_limit(limit: &str) -> Result<u64> {
338-
let (number, unit) = limit.split_at(limit.len() - 1);
339-
let number: f64 = number.parse().map_err(|_| {
340-
exec_datafusion_err!("Failed to parse number from memory limit '{}'", limit)
341-
})?;
342-
343-
match unit {
344-
"K" => Ok((number * 1024.0) as u64),
345-
"M" => Ok((number * 1024.0 * 1024.0) as u64),
346-
"G" => Ok((number * 1024.0 * 1024.0 * 1024.0) as u64),
347-
_ => exec_err!("Unsupported unit '{}' in memory limit '{}'", unit, limit),
348-
}
349-
}
350340
}
351341

352342
#[tokio::main]
@@ -359,31 +349,3 @@ pub async fn main() -> Result<()> {
359349

360350
Ok(())
361351
}
362-
363-
#[cfg(test)]
364-
mod tests {
365-
use super::*;
366-
367-
#[test]
368-
fn test_parse_memory_limit_all() {
369-
// Test valid inputs
370-
assert_eq!(
371-
ExternalAggrConfig::parse_memory_limit("100K").unwrap(),
372-
102400
373-
);
374-
assert_eq!(
375-
ExternalAggrConfig::parse_memory_limit("1.5M").unwrap(),
376-
1572864
377-
);
378-
assert_eq!(
379-
ExternalAggrConfig::parse_memory_limit("2G").unwrap(),
380-
2147483648
381-
);
382-
383-
// Test invalid unit
384-
assert!(ExternalAggrConfig::parse_memory_limit("500X").is_err());
385-
386-
// Test invalid number
387-
assert!(ExternalAggrConfig::parse_memory_limit("abcM").is_err());
388-
}
389-
}

benchmarks/src/clickbench.rs

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -124,7 +124,8 @@ impl RunOpt {
124124
parquet_options.binary_as_string = true;
125125
}
126126

127-
let ctx = SessionContext::new_with_config(config);
127+
let rt_builder = self.common.runtime_env_builder()?;
128+
let ctx = SessionContext::new_with_config_rt(config, rt_builder.build_arc()?);
128129
self.register_hits(&ctx).await?;
129130

130131
let iterations = self.common.iterations;

benchmarks/src/h2o.rs

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -68,7 +68,8 @@ impl RunOpt {
6868
};
6969

7070
let config = self.common.config();
71-
let ctx = SessionContext::new_with_config(config);
71+
let rt_builder = self.common.runtime_env_builder()?;
72+
let ctx = SessionContext::new_with_config_rt(config, rt_builder.build_arc()?);
7273

7374
// Register data
7475
self.register_data(&ctx).await?;

benchmarks/src/imdb/run.rs

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -306,8 +306,8 @@ impl RunOpt {
306306
.config()
307307
.with_collect_statistics(!self.disable_statistics);
308308
config.options_mut().optimizer.prefer_hash_join = self.prefer_hash_join;
309-
310-
let ctx = SessionContext::new_with_config(config);
309+
let rt_builder = self.common.runtime_env_builder()?;
310+
let ctx = SessionContext::new_with_config_rt(config, rt_builder.build_arc()?);
311311

312312
// register tables
313313
self.register_tables(&ctx).await?;
@@ -515,6 +515,9 @@ mod tests {
515515
iterations: 1,
516516
partitions: Some(2),
517517
batch_size: 8192,
518+
mem_pool_type: "fair".to_string(),
519+
memory_limit: None,
520+
sort_spill_reservation_bytes: None,
518521
debug: false,
519522
};
520523
let opt = RunOpt {
@@ -548,6 +551,9 @@ mod tests {
548551
iterations: 1,
549552
partitions: Some(2),
550553
batch_size: 8192,
554+
mem_pool_type: "fair".to_string(),
555+
memory_limit: None,
556+
sort_spill_reservation_bytes: None,
551557
debug: false,
552558
};
553559
let opt = RunOpt {

benchmarks/src/sort_tpch.rs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -188,8 +188,10 @@ impl RunOpt {
188188
/// Benchmark query `query_id` in `SORT_QUERIES`
189189
async fn benchmark_query(&self, query_id: usize) -> Result<Vec<QueryResult>> {
190190
let config = self.common.config();
191+
let rt_builder = self.common.runtime_env_builder()?;
191192
let state = SessionStateBuilder::new()
192193
.with_config(config)
194+
.with_runtime_env(rt_builder.build_arc()?)
193195
.with_default_features()
194196
.build();
195197
let ctx = SessionContext::from(state);

benchmarks/src/tpch/run.rs

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -121,7 +121,8 @@ impl RunOpt {
121121
.config()
122122
.with_collect_statistics(!self.disable_statistics);
123123
config.options_mut().optimizer.prefer_hash_join = self.prefer_hash_join;
124-
let ctx = SessionContext::new_with_config(config);
124+
let rt_builder = self.common.runtime_env_builder()?;
125+
let ctx = SessionContext::new_with_config_rt(config, rt_builder.build_arc()?);
125126

126127
// register tables
127128
self.register_tables(&ctx).await?;
@@ -342,6 +343,9 @@ mod tests {
342343
iterations: 1,
343344
partitions: Some(2),
344345
batch_size: 8192,
346+
mem_pool_type: "fair".to_string(),
347+
memory_limit: None,
348+
sort_spill_reservation_bytes: None,
345349
debug: false,
346350
};
347351
let opt = RunOpt {
@@ -375,6 +379,9 @@ mod tests {
375379
iterations: 1,
376380
partitions: Some(2),
377381
batch_size: 8192,
382+
mem_pool_type: "fair".to_string(),
383+
memory_limit: None,
384+
sort_spill_reservation_bytes: None,
378385
debug: false,
379386
};
380387
let opt = RunOpt {

benchmarks/src/util/options.rs

Lines changed: 98 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -15,8 +15,17 @@
1515
// specific language governing permissions and limitations
1616
// under the License.
1717

18-
use datafusion::prelude::SessionConfig;
19-
use datafusion_common::utils::get_available_parallelism;
18+
use std::{num::NonZeroUsize, sync::Arc};
19+
20+
use datafusion::{
21+
execution::{
22+
disk_manager::DiskManagerConfig,
23+
memory_pool::{FairSpillPool, GreedyMemoryPool, MemoryPool, TrackConsumersPool},
24+
runtime_env::RuntimeEnvBuilder,
25+
},
26+
prelude::SessionConfig,
27+
};
28+
use datafusion_common::{utils::get_available_parallelism, DataFusionError, Result};
2029
use structopt::StructOpt;
2130

2231
// Common benchmark options (don't use doc comments otherwise this doc
@@ -35,6 +44,20 @@ pub struct CommonOpt {
3544
#[structopt(short = "s", long = "batch-size", default_value = "8192")]
3645
pub batch_size: usize,
3746

47+
/// The memory pool type to use, should be one of "fair" or "greedy"
48+
#[structopt(long = "mem-pool-type", default_value = "fair")]
49+
pub mem_pool_type: String,
50+
51+
/// Memory limit (e.g. '100M', '1.5G'). If not specified, run all pre-defined memory limits for given query
52+
/// if there's any, otherwise run with no memory limit.
53+
#[structopt(long = "memory-limit", parse(try_from_str = parse_memory_limit))]
54+
pub memory_limit: Option<usize>,
55+
56+
/// The amount of memory to reserve for sort spill operations. DataFusion's default value will be used
57+
/// if not specified.
58+
#[structopt(long = "sort-spill-reservation-bytes", parse(try_from_str = parse_memory_limit))]
59+
pub sort_spill_reservation_bytes: Option<usize>,
60+
3861
/// Activate debug mode to see more details
3962
#[structopt(short, long)]
4063
pub debug: bool,
@@ -48,10 +71,81 @@ impl CommonOpt {
4871

4972
/// Modify the existing config appropriately
5073
pub fn update_config(&self, config: SessionConfig) -> SessionConfig {
51-
config
74+
let mut config = config
5275
.with_target_partitions(
5376
self.partitions.unwrap_or(get_available_parallelism()),
5477
)
55-
.with_batch_size(self.batch_size)
78+
.with_batch_size(self.batch_size);
79+
if let Some(sort_spill_reservation_bytes) = self.sort_spill_reservation_bytes {
80+
config =
81+
config.with_sort_spill_reservation_bytes(sort_spill_reservation_bytes);
82+
}
83+
config
84+
}
85+
86+
/// Return an appropriately configured `RuntimeEnvBuilder`
87+
pub fn runtime_env_builder(&self) -> Result<RuntimeEnvBuilder> {
88+
let mut rt_builder = RuntimeEnvBuilder::new();
89+
const NUM_TRACKED_CONSUMERS: usize = 5;
90+
if let Some(memory_limit) = self.memory_limit {
91+
let pool: Arc<dyn MemoryPool> = match self.mem_pool_type.as_str() {
92+
"fair" => Arc::new(TrackConsumersPool::new(
93+
FairSpillPool::new(memory_limit),
94+
NonZeroUsize::new(NUM_TRACKED_CONSUMERS).unwrap(),
95+
)),
96+
"greedy" => Arc::new(TrackConsumersPool::new(
97+
GreedyMemoryPool::new(memory_limit),
98+
NonZeroUsize::new(NUM_TRACKED_CONSUMERS).unwrap(),
99+
)),
100+
_ => {
101+
return Err(DataFusionError::Configuration(format!(
102+
"Invalid memory pool type: {}",
103+
self.mem_pool_type
104+
)))
105+
}
106+
};
107+
rt_builder = rt_builder
108+
.with_memory_pool(pool)
109+
.with_disk_manager(DiskManagerConfig::NewOs);
110+
}
111+
Ok(rt_builder)
112+
}
113+
}
114+
115+
/// Parse a human-readable memory limit into a number of bytes.
///
/// The input is a decimal amount immediately followed by a one-character
/// unit: `K` (x 1024), `M` (x 1024^2) or `G` (x 1024^3). Fractional amounts
/// are accepted and the resulting byte count is truncated towards zero.
///
/// Examples: `"100K"` -> 102400, `"1.5M"` -> 1572864, `"2G"` -> 2147483648.
///
/// # Errors
///
/// Returns a descriptive message when the input is empty, the amount is not
/// a number, the amount is negative or not finite, or the unit is not one of
/// `K`, `M`, `G`.
fn parse_memory_limit(limit: &str) -> Result<usize, String> {
    // Locate the last *character* rather than the last byte: `split_at` panics
    // when the index is not on a char boundary, and `len() - 1` underflows on
    // an empty string. Both are reachable from the command line.
    let (unit_start, _) = limit
        .char_indices()
        .next_back()
        .ok_or_else(|| "Memory limit must not be empty".to_string())?;
    let (number, unit) = limit.split_at(unit_start);

    let number: f64 = number
        .parse()
        .map_err(|_| format!("Failed to parse number from memory limit '{}'", limit))?;

    // `f64::from_str` accepts "-1", "NaN" and "inf"; the `as usize` cast below
    // would silently turn those into 0 or `usize::MAX`.
    if !number.is_finite() || number < 0.0 {
        return Err(format!(
            "Memory limit '{}' must be a non-negative finite number",
            limit
        ));
    }

    let multiplier = match unit {
        "K" => 1024.0,
        "M" => 1024.0 * 1024.0,
        "G" => 1024.0 * 1024.0 * 1024.0,
        _ => {
            return Err(format!(
                "Unsupported unit '{}' in memory limit '{}'",
                unit, limit
            ))
        }
    };

    // Float-to-int `as` truncates towards zero and saturates at `usize::MAX`.
    Ok((number * multiplier) as usize)
}
133+
134+
#[cfg(test)]
mod tests {
    use super::*;

    /// Checks `parse_memory_limit` against every supported unit and against
    /// inputs with an unknown unit or a non-numeric amount.
    #[test]
    fn test_parse_memory_limit_all() {
        // Well-formed limits paired with the byte count each must yield.
        let accepted = [("100K", 102400), ("1.5M", 1572864), ("2G", 2147483648)];
        for (input, bytes) in accepted {
            assert_eq!(parse_memory_limit(input).unwrap(), bytes);
        }

        // "500X" carries an unsupported unit; "abcM" carries an invalid number.
        for input in ["500X", "abcM"] {
            assert!(parse_memory_limit(input).is_err());
        }
    }
}

0 commit comments

Comments
 (0)