Skip to content

Commit 31cbc43

Browse files
Authored by mertak-synnada
Simplify streaming_merge function parameters (#12719)
* Simplify `streaming_merge` function parameters
* Revert test change
* Change `StreamingMergeConfig` into builder pattern
1 parent 642a812 commit 31cbc43

File tree

7 files changed

+177
-110
lines changed

7 files changed

+177
-110
lines changed

datafusion/core/tests/fuzz_cases/sort_preserving_repartition_fuzz.rs

Lines changed: 9 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@ mod sp_repartition_fuzz_tests {
2929
metrics::{BaselineMetrics, ExecutionPlanMetricsSet},
3030
repartition::RepartitionExec,
3131
sorts::sort_preserving_merge::SortPreservingMergeExec,
32-
sorts::streaming_merge::streaming_merge,
32+
sorts::streaming_merge::StreamingMergeBuilder,
3333
stream::RecordBatchStreamAdapter,
3434
ExecutionPlan, Partitioning,
3535
};
@@ -246,15 +246,14 @@ mod sp_repartition_fuzz_tests {
246246
MemoryConsumer::new("test".to_string()).register(context.memory_pool());
247247

248248
// Internally SortPreservingMergeExec uses this function for merging.
249-
let res = streaming_merge(
250-
streams,
251-
schema,
252-
&exprs,
253-
BaselineMetrics::new(&ExecutionPlanMetricsSet::new(), 0),
254-
1,
255-
None,
256-
mem_reservation,
257-
)?;
249+
let res = StreamingMergeBuilder::new()
250+
.with_streams(streams)
251+
.with_schema(schema)
252+
.with_expressions(&exprs)
253+
.with_metrics(BaselineMetrics::new(&ExecutionPlanMetricsSet::new(), 0))
254+
.with_batch_size(1)
255+
.with_reservation(mem_reservation)
256+
.build()?;
258257
let res = collect(res).await?;
259258
// Contains the merged result.
260259
let res = concat_batches(&res[0].schema(), &res)?;

datafusion/physical-plan/src/aggregates/row_hash.rs

Lines changed: 9 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@ use crate::aggregates::{
2929
};
3030
use crate::metrics::{BaselineMetrics, MetricBuilder, RecordOutput};
3131
use crate::sorts::sort::sort_batch;
32-
use crate::sorts::streaming_merge;
32+
use crate::sorts::streaming_merge::StreamingMergeBuilder;
3333
use crate::spill::{read_spill_as_stream, spill_record_batch_by_size};
3434
use crate::stream::RecordBatchStreamAdapter;
3535
use crate::{aggregates, metrics, ExecutionPlan, PhysicalExpr};
@@ -1001,15 +1001,14 @@ impl GroupedHashAggregateStream {
10011001
streams.push(stream);
10021002
}
10031003
self.spill_state.is_stream_merging = true;
1004-
self.input = streaming_merge(
1005-
streams,
1006-
schema,
1007-
&self.spill_state.spill_expr,
1008-
self.baseline_metrics.clone(),
1009-
self.batch_size,
1010-
None,
1011-
self.reservation.new_empty(),
1012-
)?;
1004+
self.input = StreamingMergeBuilder::new()
1005+
.with_streams(streams)
1006+
.with_schema(schema)
1007+
.with_expressions(&self.spill_state.spill_expr)
1008+
.with_metrics(self.baseline_metrics.clone())
1009+
.with_batch_size(self.batch_size)
1010+
.with_reservation(self.reservation.new_empty())
1011+
.build()?;
10131012
self.input_done = false;
10141013
self.group_ordering = GroupOrdering::Full(GroupOrderingFull::new());
10151014
Ok(())

datafusion/physical-plan/src/repartition/mod.rs

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,7 @@ use crate::metrics::BaselineMetrics;
3434
use crate::repartition::distributor_channels::{
3535
channels, partition_aware_channels, DistributionReceiver, DistributionSender,
3636
};
37-
use crate::sorts::streaming_merge;
37+
use crate::sorts::streaming_merge::StreamingMergeBuilder;
3838
use crate::stream::RecordBatchStreamAdapter;
3939
use crate::{DisplayFormatType, ExecutionPlan, Partitioning, PlanProperties, Statistics};
4040

@@ -637,15 +637,15 @@ impl ExecutionPlan for RepartitionExec {
637637
let merge_reservation =
638638
MemoryConsumer::new(format!("{}[Merge {partition}]", name))
639639
.register(context.memory_pool());
640-
streaming_merge(
641-
input_streams,
642-
schema_captured,
643-
&sort_exprs,
644-
BaselineMetrics::new(&metrics, partition),
645-
context.session_config().batch_size(),
646-
fetch,
647-
merge_reservation,
648-
)
640+
StreamingMergeBuilder::new()
641+
.with_streams(input_streams)
642+
.with_schema(schema_captured)
643+
.with_expressions(&sort_exprs)
644+
.with_metrics(BaselineMetrics::new(&metrics, partition))
645+
.with_batch_size(context.session_config().batch_size())
646+
.with_fetch(fetch)
647+
.with_reservation(merge_reservation)
648+
.build()
649649
} else {
650650
Ok(Box::pin(RepartitionStream {
651651
num_input_partitions,

datafusion/physical-plan/src/sorts/mod.rs

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,4 +28,3 @@ mod stream;
2828
pub mod streaming_merge;
2929

3030
pub use index::RowIndex;
31-
pub(crate) use streaming_merge::streaming_merge;

datafusion/physical-plan/src/sorts/sort.rs

Lines changed: 19 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@ use crate::limit::LimitStream;
3030
use crate::metrics::{
3131
BaselineMetrics, Count, ExecutionPlanMetricsSet, MetricBuilder, MetricsSet,
3232
};
33-
use crate::sorts::streaming_merge::streaming_merge;
33+
use crate::sorts::streaming_merge::StreamingMergeBuilder;
3434
use crate::spill::{read_spill_as_stream, spill_record_batches};
3535
use crate::stream::RecordBatchStreamAdapter;
3636
use crate::topk::TopK;
@@ -342,15 +342,15 @@ impl ExternalSorter {
342342
streams.push(stream);
343343
}
344344

345-
streaming_merge(
346-
streams,
347-
Arc::clone(&self.schema),
348-
&self.expr,
349-
self.metrics.baseline.clone(),
350-
self.batch_size,
351-
self.fetch,
352-
self.reservation.new_empty(),
353-
)
345+
StreamingMergeBuilder::new()
346+
.with_streams(streams)
347+
.with_schema(Arc::clone(&self.schema))
348+
.with_expressions(&self.expr)
349+
.with_metrics(self.metrics.baseline.clone())
350+
.with_batch_size(self.batch_size)
351+
.with_fetch(self.fetch)
352+
.with_reservation(self.reservation.new_empty())
353+
.build()
354354
} else {
355355
self.in_mem_sort_stream(self.metrics.baseline.clone())
356356
}
@@ -534,15 +534,15 @@ impl ExternalSorter {
534534
})
535535
.collect::<Result<_>>()?;
536536

537-
streaming_merge(
538-
streams,
539-
Arc::clone(&self.schema),
540-
&self.expr,
541-
metrics,
542-
self.batch_size,
543-
self.fetch,
544-
self.merge_reservation.new_empty(),
545-
)
537+
StreamingMergeBuilder::new()
538+
.with_streams(streams)
539+
.with_schema(Arc::clone(&self.schema))
540+
.with_expressions(&self.expr)
541+
.with_metrics(metrics)
542+
.with_batch_size(self.batch_size)
543+
.with_fetch(self.fetch)
544+
.with_reservation(self.merge_reservation.new_empty())
545+
.build()
546546
}
547547

548548
/// Sorts a single `RecordBatch` into a single stream.

datafusion/physical-plan/src/sorts/sort_preserving_merge.rs

Lines changed: 19 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ use crate::common::spawn_buffered;
2424
use crate::expressions::PhysicalSortExpr;
2525
use crate::limit::LimitStream;
2626
use crate::metrics::{BaselineMetrics, ExecutionPlanMetricsSet, MetricsSet};
27-
use crate::sorts::streaming_merge;
27+
use crate::sorts::streaming_merge::StreamingMergeBuilder;
2828
use crate::{
2929
DisplayAs, DisplayFormatType, Distribution, ExecutionPlan, ExecutionPlanProperties,
3030
Partitioning, PlanProperties, SendableRecordBatchStream, Statistics,
@@ -273,15 +273,15 @@ impl ExecutionPlan for SortPreservingMergeExec {
273273

274274
debug!("Done setting up sender-receiver for SortPreservingMergeExec::execute");
275275

276-
let result = streaming_merge(
277-
receivers,
278-
schema,
279-
&self.expr,
280-
BaselineMetrics::new(&self.metrics, partition),
281-
context.session_config().batch_size(),
282-
self.fetch,
283-
reservation,
284-
)?;
276+
let result = StreamingMergeBuilder::new()
277+
.with_streams(receivers)
278+
.with_schema(schema)
279+
.with_expressions(&self.expr)
280+
.with_metrics(BaselineMetrics::new(&self.metrics, partition))
281+
.with_batch_size(context.session_config().batch_size())
282+
.with_fetch(self.fetch)
283+
.with_reservation(reservation)
284+
.build()?;
285285

286286
debug!("Got stream result from SortPreservingMergeStream::new_from_receivers");
287287

@@ -960,16 +960,15 @@ mod tests {
960960
MemoryConsumer::new("test").register(&task_ctx.runtime_env().memory_pool);
961961

962962
let fetch = None;
963-
let merge_stream = streaming_merge(
964-
streams,
965-
batches.schema(),
966-
sort.as_slice(),
967-
BaselineMetrics::new(&metrics, 0),
968-
task_ctx.session_config().batch_size(),
969-
fetch,
970-
reservation,
971-
)
972-
.unwrap();
963+
let merge_stream = StreamingMergeBuilder::new()
964+
.with_streams(streams)
965+
.with_schema(batches.schema())
966+
.with_expressions(sort.as_slice())
967+
.with_metrics(BaselineMetrics::new(&metrics, 0))
968+
.with_batch_size(task_ctx.session_config().batch_size())
969+
.with_fetch(fetch)
970+
.with_reservation(reservation)
971+
.build()?;
973972

974973
let mut merged = common::collect(merge_stream).await.unwrap();
975974

datafusion/physical-plan/src/sorts/streaming_merge.rs

Lines changed: 111 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -49,49 +49,120 @@ macro_rules! merge_helper {
4949
}};
5050
}
5151

52-
/// Perform a streaming merge of [`SendableRecordBatchStream`] based on provided sort expressions
53-
/// while preserving order.
54-
pub fn streaming_merge(
52+
#[derive(Default)]
53+
pub struct StreamingMergeBuilder<'a> {
5554
streams: Vec<SendableRecordBatchStream>,
56-
schema: SchemaRef,
57-
expressions: &[PhysicalSortExpr],
58-
metrics: BaselineMetrics,
59-
batch_size: usize,
55+
schema: Option<SchemaRef>,
56+
expressions: &'a [PhysicalSortExpr],
57+
metrics: Option<BaselineMetrics>,
58+
batch_size: Option<usize>,
6059
fetch: Option<usize>,
61-
reservation: MemoryReservation,
62-
) -> Result<SendableRecordBatchStream> {
63-
// If there are no sort expressions, preserving the order
64-
// doesn't mean anything (and result in infinite loops)
65-
if expressions.is_empty() {
66-
return internal_err!("Sort expressions cannot be empty for streaming merge");
60+
reservation: Option<MemoryReservation>,
61+
}
62+
63+
impl<'a> StreamingMergeBuilder<'a> {
64+
pub fn new() -> Self {
65+
Self::default()
6766
}
68-
// Special case single column comparisons with optimized cursor implementations
69-
if expressions.len() == 1 {
70-
let sort = expressions[0].clone();
71-
let data_type = sort.expr.data_type(schema.as_ref())?;
72-
downcast_primitive! {
73-
data_type => (primitive_merge_helper, sort, streams, schema, metrics, batch_size, fetch, reservation),
74-
DataType::Utf8 => merge_helper!(StringArray, sort, streams, schema, metrics, batch_size, fetch, reservation)
75-
DataType::LargeUtf8 => merge_helper!(LargeStringArray, sort, streams, schema, metrics, batch_size, fetch, reservation)
76-
DataType::Binary => merge_helper!(BinaryArray, sort, streams, schema, metrics, batch_size, fetch, reservation)
77-
DataType::LargeBinary => merge_helper!(LargeBinaryArray, sort, streams, schema, metrics, batch_size, fetch, reservation)
78-
_ => {}
79-
}
67+
68+
pub fn with_streams(mut self, streams: Vec<SendableRecordBatchStream>) -> Self {
69+
self.streams = streams;
70+
self
8071
}
8172

82-
let streams = RowCursorStream::try_new(
83-
schema.as_ref(),
84-
expressions,
85-
streams,
86-
reservation.new_empty(),
87-
)?;
88-
89-
Ok(Box::pin(SortPreservingMergeStream::new(
90-
Box::new(streams),
91-
schema,
92-
metrics,
93-
batch_size,
94-
fetch,
95-
reservation,
96-
)))
73+
pub fn with_schema(mut self, schema: SchemaRef) -> Self {
74+
self.schema = Some(schema);
75+
self
76+
}
77+
78+
pub fn with_expressions(mut self, expressions: &'a [PhysicalSortExpr]) -> Self {
79+
self.expressions = expressions;
80+
self
81+
}
82+
83+
pub fn with_metrics(mut self, metrics: BaselineMetrics) -> Self {
84+
self.metrics = Some(metrics);
85+
self
86+
}
87+
88+
pub fn with_batch_size(mut self, batch_size: usize) -> Self {
89+
self.batch_size = Some(batch_size);
90+
self
91+
}
92+
93+
pub fn with_fetch(mut self, fetch: Option<usize>) -> Self {
94+
self.fetch = fetch;
95+
self
96+
}
97+
98+
pub fn with_reservation(mut self, reservation: MemoryReservation) -> Self {
99+
self.reservation = Some(reservation);
100+
self
101+
}
102+
103+
pub fn build(self) -> Result<SendableRecordBatchStream> {
104+
let Self {
105+
streams,
106+
schema,
107+
metrics,
108+
batch_size,
109+
reservation,
110+
fetch,
111+
expressions,
112+
} = self;
113+
114+
// Early return if streams or expressions are empty
115+
let checks = [
116+
(
117+
streams.is_empty(),
118+
"Streams cannot be empty for streaming merge",
119+
),
120+
(
121+
expressions.is_empty(),
122+
"Sort expressions cannot be empty for streaming merge",
123+
),
124+
];
125+
126+
if let Some((_, error_message)) = checks.iter().find(|(condition, _)| *condition)
127+
{
128+
return internal_err!("{}", error_message);
129+
}
130+
131+
// Unwrapping mandatory fields
132+
let schema = schema.expect("Schema cannot be empty for streaming merge");
133+
let metrics = metrics.expect("Metrics cannot be empty for streaming merge");
134+
let batch_size =
135+
batch_size.expect("Batch size cannot be empty for streaming merge");
136+
let reservation =
137+
reservation.expect("Reservation cannot be empty for streaming merge");
138+
139+
// Special case single column comparisons with optimized cursor implementations
140+
if expressions.len() == 1 {
141+
let sort = expressions[0].clone();
142+
let data_type = sort.expr.data_type(schema.as_ref())?;
143+
downcast_primitive! {
144+
data_type => (primitive_merge_helper, sort, streams, schema, metrics, batch_size, fetch, reservation),
145+
DataType::Utf8 => merge_helper!(StringArray, sort, streams, schema, metrics, batch_size, fetch, reservation)
146+
DataType::LargeUtf8 => merge_helper!(LargeStringArray, sort, streams, schema, metrics, batch_size, fetch, reservation)
147+
DataType::Binary => merge_helper!(BinaryArray, sort, streams, schema, metrics, batch_size, fetch, reservation)
148+
DataType::LargeBinary => merge_helper!(LargeBinaryArray, sort, streams, schema, metrics, batch_size, fetch, reservation)
149+
_ => {}
150+
}
151+
}
152+
153+
let streams = RowCursorStream::try_new(
154+
schema.as_ref(),
155+
expressions,
156+
streams,
157+
reservation.new_empty(),
158+
)?;
159+
Ok(Box::pin(SortPreservingMergeStream::new(
160+
Box::new(streams),
161+
schema,
162+
metrics,
163+
batch_size,
164+
fetch,
165+
reservation,
166+
)))
167+
}
97168
}

0 commit comments

Comments (0)