Skip to content

Commit 64a9280

Browse files
authored
fix approx_percentile_cont() bug (#11934)
1 parent ee6910b commit 64a9280

File tree

2 files changed

+52
-2
lines changed

2 files changed

+52
-2
lines changed

datafusion/functions-aggregate-common/src/tdigest.rs

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -233,7 +233,7 @@ impl TDigest {
233233
}
234234

235235
fn clamp(v: f64, lo: f64, hi: f64) -> f64 {
236-
if lo.is_nan() && hi.is_nan() {
236+
if lo.is_nan() || hi.is_nan() {
237237
return v;
238238
}
239239
v.clamp(lo, hi)
@@ -539,6 +539,18 @@ impl TDigest {
539539
let value = self.centroids[pos].mean()
540540
+ ((rank - t) / self.centroids[pos].weight() - 0.5) * delta;
541541

542+
// In `merge_digests()`: `min` is initialized to Inf, `max` is initialized to -Inf
543+
// and gets updated according to different `TDigest`s
544+
// However, `min`/`max` won't get updated if there is only one `NaN` within `TDigest`
545+
// The following two checks is for such edge case
546+
if !min.is_finite() && min.is_sign_positive() {
547+
min = f64::NEG_INFINITY;
548+
}
549+
550+
if !max.is_finite() && max.is_sign_negative() {
551+
max = f64::INFINITY;
552+
}
553+
542554
Self::clamp(value, min, max)
543555
}
544556

datafusion/sqllogictest/test_files/aggregate.slt

Lines changed: 39 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1249,6 +1249,44 @@ SELECT APPROX_PERCENTILE_CONT(v, 0.5) FROM (VALUES (CAST(NULL as INT))) as t (v)
12491249
----
12501250
NULL
12511251

1252+
#
1253+
# percentile_cont edge cases
1254+
#
1255+
1256+
statement ok
1257+
CREATE TABLE tmp_percentile_cont(v1 INT, v2 DOUBLE);
1258+
1259+
statement ok
1260+
INSERT INTO tmp_percentile_cont VALUES (1, 'NaN'::Double), (2, 'NaN'::Double), (3, 'NaN'::Double);
1261+
1262+
# ISSUE: https://github.com/apache/datafusion/issues/11871
1263+
# Note `approx_median()` is using the same implementation as `approx_percentile_cont()`
1264+
query R
1265+
select APPROX_MEDIAN(v2) from tmp_percentile_cont WHERE v1 = 1;
1266+
----
1267+
NaN
1268+
1269+
# ISSUE: https://github.com/apache/datafusion/issues/11870
1270+
query R
1271+
select APPROX_PERCENTILE_CONT(v2, 0.8) from tmp_percentile_cont;
1272+
----
1273+
NaN
1274+
1275+
# ISSUE: https://github.com/apache/datafusion/issues/11869
1276+
# Note: `approx_percentile_cont_with_weight()` uses the same implementation as `approx_percentile_cont()`
1277+
query R
1278+
SELECT APPROX_PERCENTILE_CONT_WITH_WEIGHT(
1279+
v2,
1280+
'+Inf'::Double,
1281+
0.9
1282+
)
1283+
FROM tmp_percentile_cont;
1284+
----
1285+
NaN
1286+
1287+
statement ok
1288+
DROP TABLE tmp_percentile_cont;
1289+
12521290
# csv_query_cube_avg
12531291
query TIR
12541292
SELECT c1, c2, AVG(c3) FROM aggregate_test_100 GROUP BY CUBE (c1, c2) ORDER BY c1, c2
@@ -5553,4 +5591,4 @@ drop table employee_csv;
55535591
query I??III?T
55545592
select count(null), min(null), max(null), bit_and(NULL), bit_or(NULL), bit_xor(NULL), nth_value(NULL, 1), string_agg(NULL, ',');
55555593
----
5556-
0 NULL NULL NULL NULL NULL NULL NULL
5594+
0 NULL NULL NULL NULL NULL NULL NULL

0 commit comments

Comments
 (0)