15
15
// specific language governing permissions and limitations
16
16
// under the License.
17
17
18
+ use std:: mem;
19
+ use std:: sync:: Arc ;
20
+
18
21
use super :: listing:: PartitionedFile ;
19
22
use crate :: arrow:: datatypes:: { Schema , SchemaRef } ;
20
23
use crate :: error:: Result ;
@@ -26,16 +29,14 @@ use datafusion_common::stats::Precision;
26
29
use datafusion_common:: ScalarValue ;
27
30
28
31
use futures:: { Stream , StreamExt } ;
29
- use itertools:: izip;
30
- use itertools:: multiunzip;
31
32
32
33
/// Get all files as well as the file level summary statistics (no statistic for partition columns).
33
34
/// If the optional `limit` is provided, includes only sufficient files. Needed to read up to
34
35
/// `limit` number of rows. `collect_stats` is passed down from the configuration parameter on
35
36
/// `ListingTable`. If it is false we only construct bare statistics and skip a potentially expensive
36
37
/// per-column aggregation pass for constructing file level summary statistics.
37
38
pub async fn get_statistics_with_limit (
38
- all_files : impl Stream < Item = Result < ( PartitionedFile , Statistics ) > > ,
39
+ all_files : impl Stream < Item = Result < ( PartitionedFile , Arc < Statistics > ) > > ,
39
40
file_schema : SchemaRef ,
40
41
limit : Option < usize > ,
41
42
collect_stats : bool ,
@@ -48,26 +49,27 @@ pub async fn get_statistics_with_limit(
48
49
// - zero for summations, and
49
50
// - neutral element for extreme points.
50
51
let size = file_schema. fields ( ) . len ( ) ;
51
- let mut null_counts: Vec < Precision < usize > > = vec ! [ Precision :: Absent ; size] ;
52
- let mut max_values: Vec < Precision < ScalarValue > > = vec ! [ Precision :: Absent ; size] ;
53
- let mut min_values: Vec < Precision < ScalarValue > > = vec ! [ Precision :: Absent ; size] ;
52
+ let mut col_stats_set = vec ! [ ColumnStatistics :: default ( ) ; size] ;
54
53
let mut num_rows = Precision :: < usize > :: Absent ;
55
54
let mut total_byte_size = Precision :: < usize > :: Absent ;
56
55
57
56
// Fusing the stream allows us to call next safely even once it is finished.
58
57
let mut all_files = Box :: pin ( all_files. fuse ( ) ) ;
59
58
60
59
if let Some ( first_file) = all_files. next ( ) . await {
61
- let ( file, file_stats) = first_file?;
60
+ let ( mut file, file_stats) = first_file?;
61
+ file. statistics = Some ( file_stats. as_ref ( ) . clone ( ) ) ;
62
62
result_files. push ( file) ;
63
63
64
64
// First file, we set them directly from the file statistics.
65
- num_rows = file_stats. num_rows ;
66
- total_byte_size = file_stats. total_byte_size ;
67
- for ( index, file_column) in file_stats. column_statistics . into_iter ( ) . enumerate ( ) {
68
- null_counts[ index] = file_column. null_count ;
69
- max_values[ index] = file_column. max_value ;
70
- min_values[ index] = file_column. min_value ;
65
+ num_rows = file_stats. num_rows . clone ( ) ;
66
+ total_byte_size = file_stats. total_byte_size . clone ( ) ;
67
+ for ( index, file_column) in
68
+ file_stats. column_statistics . clone ( ) . into_iter ( ) . enumerate ( )
69
+ {
70
+ col_stats_set[ index] . null_count = file_column. null_count ;
71
+ col_stats_set[ index] . max_value = file_column. max_value ;
72
+ col_stats_set[ index] . min_value = file_column. min_value ;
71
73
}
72
74
73
75
// If the number of rows exceeds the limit, we can stop processing
@@ -80,7 +82,8 @@ pub async fn get_statistics_with_limit(
80
82
} ;
81
83
if conservative_num_rows <= limit. unwrap_or ( usize:: MAX ) {
82
84
while let Some ( current) = all_files. next ( ) . await {
83
- let ( file, file_stats) = current?;
85
+ let ( mut file, file_stats) = current?;
86
+ file. statistics = Some ( file_stats. as_ref ( ) . clone ( ) ) ;
84
87
result_files. push ( file) ;
85
88
if !collect_stats {
86
89
continue ;
@@ -90,38 +93,28 @@ pub async fn get_statistics_with_limit(
90
93
// counts across all the files in question. If any file does not
91
94
// provide any information or provides an inexact value, we demote
92
95
// the statistic precision to inexact.
93
- num_rows = add_row_stats ( file_stats. num_rows , num_rows) ;
96
+ num_rows = add_row_stats ( file_stats. num_rows . clone ( ) , num_rows) ;
94
97
95
98
total_byte_size =
96
- add_row_stats ( file_stats. total_byte_size , total_byte_size) ;
99
+ add_row_stats ( file_stats. total_byte_size . clone ( ) , total_byte_size) ;
97
100
98
- ( null_counts, max_values, min_values) = multiunzip (
99
- izip ! (
100
- file_stats. column_statistics. into_iter( ) ,
101
- null_counts. into_iter( ) ,
102
- max_values. into_iter( ) ,
103
- min_values. into_iter( )
104
- )
105
- . map (
106
- |(
107
- ColumnStatistics {
108
- null_count : file_nc,
109
- max_value : file_max,
110
- min_value : file_min,
111
- distinct_count : _,
112
- } ,
113
- null_count,
114
- max_value,
115
- min_value,
116
- ) | {
117
- (
118
- add_row_stats ( file_nc, null_count) ,
119
- set_max_if_greater ( file_max, max_value) ,
120
- set_min_if_lesser ( file_min, min_value) ,
121
- )
122
- } ,
123
- ) ,
124
- ) ;
101
+ for ( file_col_stats, col_stats) in file_stats
102
+ . column_statistics
103
+ . iter ( )
104
+ . zip ( col_stats_set. iter_mut ( ) )
105
+ {
106
+ let ColumnStatistics {
107
+ null_count : file_nc,
108
+ max_value : file_max,
109
+ min_value : file_min,
110
+ distinct_count : _,
111
+ } = file_col_stats;
112
+
113
+ col_stats. null_count =
114
+ add_row_stats ( file_nc. clone ( ) , col_stats. null_count . clone ( ) ) ;
115
+ set_max_if_greater ( file_max, & mut col_stats. max_value ) ;
116
+ set_min_if_lesser ( file_min, & mut col_stats. min_value )
117
+ }
125
118
126
119
// If the number of rows exceeds the limit, we can stop processing
127
120
// files. This only applies when we know the number of rows. It also
@@ -139,7 +132,7 @@ pub async fn get_statistics_with_limit(
139
132
let mut statistics = Statistics {
140
133
num_rows,
141
134
total_byte_size,
142
- column_statistics : get_col_stats_vec ( null_counts , max_values , min_values ) ,
135
+ column_statistics : col_stats_set ,
143
136
} ;
144
137
if all_files. next ( ) . await . is_some ( ) {
145
138
// If we still have files in the stream, it means that the limit kicked
@@ -182,21 +175,6 @@ fn add_row_stats(
182
175
}
183
176
}
184
177
185
- pub ( crate ) fn get_col_stats_vec (
186
- null_counts : Vec < Precision < usize > > ,
187
- max_values : Vec < Precision < ScalarValue > > ,
188
- min_values : Vec < Precision < ScalarValue > > ,
189
- ) -> Vec < ColumnStatistics > {
190
- izip ! ( null_counts, max_values, min_values)
191
- . map ( |( null_count, max_value, min_value) | ColumnStatistics {
192
- null_count,
193
- max_value,
194
- min_value,
195
- distinct_count : Precision :: Absent ,
196
- } )
197
- . collect ( )
198
- }
199
-
200
178
pub ( crate ) fn get_col_stats (
201
179
schema : & Schema ,
202
180
null_counts : Vec < Precision < usize > > ,
@@ -238,45 +216,61 @@ fn min_max_aggregate_data_type(input_type: &DataType) -> &DataType {
238
216
/// If the given value is numerically greater than the original maximum value,
239
217
/// return the new maximum value with appropriate exactness information.
240
218
fn set_max_if_greater (
241
- max_nominee : Precision < ScalarValue > ,
242
- max_values : Precision < ScalarValue > ,
243
- ) -> Precision < ScalarValue > {
244
- match ( & max_values, & max_nominee) {
245
- ( Precision :: Exact ( val1) , Precision :: Exact ( val2) ) if val1 < val2 => max_nominee,
219
+ max_nominee : & Precision < ScalarValue > ,
220
+ max_value : & mut Precision < ScalarValue > ,
221
+ ) {
222
+ match ( & max_value, max_nominee) {
223
+ ( Precision :: Exact ( val1) , Precision :: Exact ( val2) ) if val1 < val2 => {
224
+ * max_value = max_nominee. clone ( ) ;
225
+ }
246
226
( Precision :: Exact ( val1) , Precision :: Inexact ( val2) )
247
227
| ( Precision :: Inexact ( val1) , Precision :: Inexact ( val2) )
248
228
| ( Precision :: Inexact ( val1) , Precision :: Exact ( val2) )
249
229
if val1 < val2 =>
250
230
{
251
- max_nominee. to_inexact ( )
231
+ * max_value = max_nominee. clone ( ) . to_inexact ( ) ;
232
+ }
233
+ ( Precision :: Exact ( _) , Precision :: Absent ) => {
234
+ let exact_max = mem:: take ( max_value) ;
235
+ * max_value = exact_max. to_inexact ( ) ;
236
+ }
237
+ ( Precision :: Absent , Precision :: Exact ( _) ) => {
238
+ * max_value = max_nominee. clone ( ) . to_inexact ( ) ;
239
+ }
240
+ ( Precision :: Absent , Precision :: Inexact ( _) ) => {
241
+ * max_value = max_nominee. clone ( ) ;
252
242
}
253
- ( Precision :: Exact ( _) , Precision :: Absent ) => max_values. to_inexact ( ) ,
254
- ( Precision :: Absent , Precision :: Exact ( _) ) => max_nominee. to_inexact ( ) ,
255
- ( Precision :: Absent , Precision :: Inexact ( _) ) => max_nominee,
256
- ( Precision :: Absent , Precision :: Absent ) => Precision :: Absent ,
257
- _ => max_values,
243
+ _ => { }
258
244
}
259
245
}
260
246
261
247
/// If the given value is numerically lesser than the original minimum value,
262
248
/// return the new minimum value with appropriate exactness information.
263
249
fn set_min_if_lesser (
264
- min_nominee : Precision < ScalarValue > ,
265
- min_values : Precision < ScalarValue > ,
266
- ) -> Precision < ScalarValue > {
267
- match ( & min_values, & min_nominee) {
268
- ( Precision :: Exact ( val1) , Precision :: Exact ( val2) ) if val1 > val2 => min_nominee,
250
+ min_nominee : & Precision < ScalarValue > ,
251
+ min_value : & mut Precision < ScalarValue > ,
252
+ ) {
253
+ match ( & min_value, min_nominee) {
254
+ ( Precision :: Exact ( val1) , Precision :: Exact ( val2) ) if val1 > val2 => {
255
+ * min_value = min_nominee. clone ( ) ;
256
+ }
269
257
( Precision :: Exact ( val1) , Precision :: Inexact ( val2) )
270
258
| ( Precision :: Inexact ( val1) , Precision :: Inexact ( val2) )
271
259
| ( Precision :: Inexact ( val1) , Precision :: Exact ( val2) )
272
260
if val1 > val2 =>
273
261
{
274
- min_nominee. to_inexact ( )
262
+ * min_value = min_nominee. clone ( ) . to_inexact ( ) ;
263
+ }
264
+ ( Precision :: Exact ( _) , Precision :: Absent ) => {
265
+ let exact_min = mem:: take ( min_value) ;
266
+ * min_value = exact_min. to_inexact ( ) ;
267
+ }
268
+ ( Precision :: Absent , Precision :: Exact ( _) ) => {
269
+ * min_value = min_nominee. clone ( ) . to_inexact ( ) ;
270
+ }
271
+ ( Precision :: Absent , Precision :: Inexact ( _) ) => {
272
+ * min_value = min_nominee. clone ( ) ;
275
273
}
276
- ( Precision :: Exact ( _) , Precision :: Absent ) => min_values. to_inexact ( ) ,
277
- ( Precision :: Absent , Precision :: Exact ( _) ) => min_nominee. to_inexact ( ) ,
278
- ( Precision :: Absent , Precision :: Inexact ( _) ) => min_nominee,
279
- ( Precision :: Absent , Precision :: Absent ) => Precision :: Absent ,
280
- _ => min_values,
274
+ _ => { }
281
275
}
282
276
}
0 commit comments