Skip to content

Commit 7ca25b7

Browse files
committed
Fix neqjoinsel's behavior for semi/anti join cases.
Previously, this function estimated the selectivity as 1 minus eqjoinsel() for the negator equality operator, regardless of join type (I think there was an expectation that eqjoinsel would handle the join type). But actually this is completely wrong for semijoin cases: the fraction of the LHS that has a non-matching row is not one minus the fraction of the LHS that has a matching row. In reality a semijoin with <> will nearly always succeed: it can only fail when the RHS is empty, or it contains a single distinct value that is equal to the particular LHS value, or the LHS value is null. The only one of those things we should have much confidence in estimating is the fraction of LHS values that are null, so let's just take the selectivity as 1 minus outer nullfrac.

Per coding convention, antijoin should be estimated the same as semijoin.

Arguably this is a bug fix, but in view of the lack of field complaints and the risk of destabilizing plans, no back-patch.

Thomas Munro, reviewed by Ashutosh Bapat

Discussion: https://postgr.es/m/CAEepm=270ze2hVxWkJw-5eKzc3AB4C9KpH3L2kih75R5pdSogg@mail.gmail.com
1 parent 1145acc commit 7ca25b7

File tree

3 files changed

+85
-16
lines changed

3 files changed

+85
-16
lines changed

src/backend/utils/adt/selfuncs.c

+54-16
Original file line numberDiff line numberDiff line change
@@ -2767,29 +2767,67 @@ neqjoinsel(PG_FUNCTION_ARGS)
27672767
List *args = (List *) PG_GETARG_POINTER(2);
27682768
JoinType jointype = (JoinType) PG_GETARG_INT16(3);
27692769
SpecialJoinInfo *sjinfo = (SpecialJoinInfo *) PG_GETARG_POINTER(4);
2770-
Oid eqop;
27712770
float8 result;
27722771

2773-
/*
2774-
* We want 1 - eqjoinsel() where the equality operator is the one
2775-
* associated with this != operator, that is, its negator.
2776-
*/
2777-
eqop = get_negator(operator);
2778-
if (eqop)
2772+
if (jointype == JOIN_SEMI || jointype == JOIN_ANTI)
27792773
{
2780-
result = DatumGetFloat8(DirectFunctionCall5(eqjoinsel,
2781-
PointerGetDatum(root),
2782-
ObjectIdGetDatum(eqop),
2783-
PointerGetDatum(args),
2784-
Int16GetDatum(jointype),
2785-
PointerGetDatum(sjinfo)));
2774+
/*
2775+
* For semi-joins, if there is more than one distinct value in the RHS
2776+
* relation then every non-null LHS row must find a row to join since
2777+
* it can only be equal to one of them. We'll assume that there is
2778+
* always more than one distinct RHS value for the sake of stability,
2779+
* though in theory we could have special cases for empty RHS
2780+
* (selectivity = 0) and single-distinct-value RHS (selectivity =
2781+
* fraction of LHS that has the same value as the single RHS value).
2782+
*
2783+
* For anti-joins, if we use the same assumption that there is more
2784+
* than one distinct key in the RHS relation, then every non-null LHS
2785+
* row must be suppressed by the anti-join.
2786+
*
2787+
* So either way, the selectivity estimate should be 1 - nullfrac.
2788+
*/
2789+
VariableStatData leftvar;
2790+
VariableStatData rightvar;
2791+
bool reversed;
2792+
HeapTuple statsTuple;
2793+
double nullfrac;
2794+
2795+
get_join_variables(root, args, sjinfo, &leftvar, &rightvar, &reversed);
2796+
statsTuple = reversed ? rightvar.statsTuple : leftvar.statsTuple;
2797+
if (HeapTupleIsValid(statsTuple))
2798+
nullfrac = ((Form_pg_statistic) GETSTRUCT(statsTuple))->stanullfrac;
2799+
else
2800+
nullfrac = 0.0;
2801+
ReleaseVariableStats(leftvar);
2802+
ReleaseVariableStats(rightvar);
2803+
2804+
result = 1.0 - nullfrac;
27862805
}
27872806
else
27882807
{
2789-
/* Use default selectivity (should we raise an error instead?) */
2790-
result = DEFAULT_EQ_SEL;
2808+
/*
2809+
* We want 1 - eqjoinsel() where the equality operator is the one
2810+
* associated with this != operator, that is, its negator.
2811+
*/
2812+
Oid eqop = get_negator(operator);
2813+
2814+
if (eqop)
2815+
{
2816+
result = DatumGetFloat8(DirectFunctionCall5(eqjoinsel,
2817+
PointerGetDatum(root),
2818+
ObjectIdGetDatum(eqop),
2819+
PointerGetDatum(args),
2820+
Int16GetDatum(jointype),
2821+
PointerGetDatum(sjinfo)));
2822+
}
2823+
else
2824+
{
2825+
/* Use default selectivity (should we raise an error instead?) */
2826+
result = DEFAULT_EQ_SEL;
2827+
}
2828+
result = 1.0 - result;
27912829
}
2792-
result = 1.0 - result;
2830+
27932831
PG_RETURN_FLOAT8(result);
27942832
}
27952833

src/test/regress/expected/join.out

+22
Original file line numberDiff line numberDiff line change
@@ -1845,6 +1845,28 @@ SELECT '' AS "xxx", *
18451845
| 1 | 4 | one | -1
18461846
(1 row)
18471847

1848+
--
1849+
-- semijoin selectivity for <>
1850+
--
1851+
explain (costs off)
1852+
select * from int4_tbl i4, tenk1 a
1853+
where exists(select * from tenk1 b
1854+
where a.twothousand = b.twothousand and a.fivethous <> b.fivethous)
1855+
and i4.f1 = a.tenthous;
1856+
QUERY PLAN
1857+
----------------------------------------------
1858+
Hash Semi Join
1859+
Hash Cond: (a.twothousand = b.twothousand)
1860+
Join Filter: (a.fivethous <> b.fivethous)
1861+
-> Hash Join
1862+
Hash Cond: (a.tenthous = i4.f1)
1863+
-> Seq Scan on tenk1 a
1864+
-> Hash
1865+
-> Seq Scan on int4_tbl i4
1866+
-> Hash
1867+
-> Seq Scan on tenk1 b
1868+
(10 rows)
1869+
18481870
--
18491871
-- More complicated constructs
18501872
--

src/test/regress/sql/join.sql

+9
Original file line numberDiff line numberDiff line change
@@ -193,6 +193,15 @@ SELECT '' AS "xxx", *
193193
SELECT '' AS "xxx", *
194194
FROM J1_TBL LEFT JOIN J2_TBL USING (i) WHERE (i = 1);
195195

196+
--
197+
-- semijoin selectivity for <>
198+
--
199+
explain (costs off)
200+
select * from int4_tbl i4, tenk1 a
201+
where exists(select * from tenk1 b
202+
where a.twothousand = b.twothousand and a.fivethous <> b.fivethous)
203+
and i4.f1 = a.tenthous;
204+
196205

197206
--
198207
-- More complicated constructs

0 commit comments

Comments
 (0)