Skip to content

Commit 56e7f4a

Browse files
committed
feat: make Table.fill_null() have consistent semantics with .select(), etc
This is breaking. Before, doing `t.fill_null("a")` filled with the string literal "a". It also was impossible to use column references such as `t.fill_null(t.other_col)` or `t.fill_null(ibis._.other_col)`. Now, it is consistent with how .select(), .mutate(), etc all work. This also simplifies the internal representation so that the ops.FillNull always stores a `Mapping[str, <bound ops.Value>]` This also changes the behavior of the polars backend. Before, Table.fill_null() would fill in NaN. Now it does not, consistent with other backends, and the rest of ibis's Column.fill_null() semantics, including how polars treats Column.fill_null()
1 parent d7083c2 commit 56e7f4a

File tree

11 files changed

+196
-145
lines changed

11 files changed

+196
-145
lines changed

ibis/backends/polars/compiler.py

Lines changed: 2 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,6 @@
44
import datetime
55
import math
66
import operator
7-
from collections.abc import Mapping
87
from functools import partial, reduce, singledispatch
98
from math import isnan
109

@@ -375,36 +374,11 @@ def fill_null(op, **kw):
375374
table = translate(op.parent, **kw)
376375

377376
columns = []
378-
379-
repls = op.replacements
380-
381-
if isinstance(repls, Mapping):
382-
383-
def get_replacement(name):
384-
repl = repls.get(name)
385-
if repl is not None:
386-
return repl.value
387-
else:
388-
return None
389-
390-
else:
391-
value = repls.value
392-
393-
def get_replacement(_):
394-
return value
395-
396377
for name, dtype in op.parent.schema.items():
397378
column = pl.col(name)
398-
if isinstance(op.replacements, Mapping):
399-
value = op.replacements.get(name)
400-
else:
401-
value = _literal_value(op.replacements)
402-
403-
if value is not None:
404-
if dtype.is_floating():
405-
column = column.fill_nan(value)
379+
if (repl := op.replacements.get(name)) is not None:
380+
value = translate(repl, **kw)
406381
column = column.fill_null(value)
407-
408382
# requires special treatment if the fill value has different datatype
409383
if dtype.is_timestamp():
410384
column = column.cast(pl.Datetime)

ibis/backends/sql/rewrites.py

Lines changed: 3 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,6 @@
44

55
import operator
66
import sys
7-
from collections.abc import Mapping
87
from functools import reduce
98
from typing import TYPE_CHECKING, Any
109

@@ -24,7 +23,7 @@
2423
from ibis.expr.schema import Schema
2524

2625
if TYPE_CHECKING:
27-
from collections.abc import Sequence
26+
from collections.abc import Mapping, Sequence
2827

2928
x = var("x")
3029
y = var("y")
@@ -150,22 +149,13 @@ def drop_columns_to_select(_, **kwargs):
150149
@replace(p.FillNull)
151150
def fill_null_to_select(_, **kwargs):
152151
"""Rewrite FillNull to a Select node."""
153-
if isinstance(_.replacements, Mapping):
154-
mapping = _.replacements
155-
else:
156-
mapping = {
157-
name: _.replacements
158-
for name, type in _.parent.schema.items()
159-
if type.nullable
160-
}
161-
162-
if not mapping:
152+
if not _.replacements:
163153
return _.parent
164154

165155
selections = {}
166156
for name in _.parent.schema.names:
167157
col = ops.Field(_.parent, name)
168-
if (value := mapping.get(name)) is not None:
158+
if (value := _.replacements.get(name)) is not None:
169159
col = ops.Coalesce((col, value))
170160
selections[name] = col
171161

ibis/backends/tests/test_generic.py

Lines changed: 86 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -450,37 +450,70 @@ def test_table_fill_null_invalid(alltypes):
450450
com.IbisTypeError, match=r"Column 'invalid_col' is not found in table"
451451
):
452452
alltypes.fill_null({"invalid_col": 0.0})
453-
454453
with pytest.raises(
455454
com.IbisTypeError, match=r"Cannot fill_null on column 'string_col' of type.*"
456455
):
457-
alltypes[["int_col", "string_col"]].fill_null(0)
458-
456+
alltypes.select("int_col", "string_col").fill_null(0)
457+
with pytest.raises(AttributeError, match=r"'Table' object has no attribute 'oops'"):
458+
alltypes.fill_null(_.oops)
459+
with pytest.raises(AttributeError, match=r"'Table' object has no attribute 'oops'"):
460+
alltypes.fill_null({"int_col": _.oops})
461+
with pytest.raises(com.IbisTypeError, match=r"Column 'oops' is not found in table"):
462+
alltypes.fill_null("oops")
463+
with pytest.raises(com.IbisTypeError, match=r"Column 'oops' is not found in table"):
464+
alltypes.fill_null({"int_col": "oops"})
459465
with pytest.raises(
460466
com.IbisTypeError, match=r"Cannot fill_null on column 'int_col' of type.*"
461467
):
462-
alltypes.fill_null({"int_col": "oops"})
468+
alltypes.fill_null({"int_col": ibis.literal("oops")})
463469

464470

465471
@pytest.mark.parametrize(
466-
"replacements",
472+
("ibis_replacements", "pd_replacements"),
467473
[
468-
param({"int_col": 20}, id="int"),
469-
param({"double_col": -1, "string_col": "missing"}, id="double-int-str"),
470-
param({"double_col": -1.5, "string_col": "missing"}, id="double-str"),
471-
param({}, id="empty"),
474+
param(
475+
lambda _t: {"int_col": 20},
476+
lambda _t: {"int_col": 20},
477+
id="int",
478+
),
479+
param(
480+
lambda _t: {"double_col": -1, "string_col": ibis.literal("missing")},
481+
lambda _t: {"double_col": -1, "string_col": "missing"},
482+
id="double-int-str",
483+
),
484+
param(
485+
lambda _t: {"double_col": -1.5, "string_col": ibis.literal("missing")},
486+
lambda _t: {"double_col": -1.5, "string_col": "missing"},
487+
id="double-str",
488+
),
489+
param(
490+
lambda _t: {"double_col": "int_col"},
491+
lambda t: {"double_col": t["int_col"]},
492+
id="column-name",
493+
),
494+
param(
495+
lambda _t: {"double_col": ibis._.int_col},
496+
lambda t: {"double_col": t["int_col"]},
497+
id="column",
498+
),
499+
param(
500+
lambda t: {"double_col": t.int_col},
501+
lambda t: {"double_col": t["int_col"]},
502+
id="deferred",
503+
),
504+
param(lambda _t: {}, lambda _t: {}, id="empty"),
472505
],
473506
)
474-
def test_table_fill_null_mapping(backend, alltypes, replacements):
507+
def test_table_fill_null_mapping(backend, alltypes, ibis_replacements, pd_replacements):
475508
table = alltypes.mutate(
476509
int_col=alltypes.int_col.nullif(1),
477510
double_col=alltypes.double_col.nullif(3.0),
478511
string_col=alltypes.string_col.nullif("2"),
479512
).select("id", "int_col", "double_col", "string_col")
480513
pd_table = table.execute()
481514

482-
result = table.fill_null(replacements).execute().reset_index(drop=True)
483-
expected = pd_table.fillna(replacements).reset_index(drop=True)
515+
result = table.fill_null(ibis_replacements(table)).execute().reset_index(drop=True)
516+
expected = pd_table.fillna(pd_replacements(pd_table)).reset_index(drop=True)
484517

485518
backend.assert_frame_equal(result, expected, check_dtype=False)
486519

@@ -493,14 +526,53 @@ def test_table_fill_null_scalar(backend, alltypes):
493526
).select("id", "int_col", "double_col", "string_col")
494527
pd_table = table.execute()
495528

496-
res = table[["int_col", "double_col"]].fill_null(0).execute().reset_index(drop=True)
529+
res = (
530+
table.select("int_col", "double_col")
531+
.fill_null(0)
532+
.execute()
533+
.reset_index(drop=True)
534+
)
497535
sol = pd_table[["int_col", "double_col"]].fillna(0).reset_index(drop=True)
498536
backend.assert_frame_equal(res, sol, check_dtype=False)
499537

500-
res = table[["string_col"]].fill_null("missing").execute().reset_index(drop=True)
538+
res = (
539+
table.select("string_col")
540+
.fill_null(ibis.literal("missing"))
541+
.execute()
542+
.reset_index(drop=True)
543+
)
501544
sol = pd_table[["string_col"]].fillna("missing").reset_index(drop=True)
502545
backend.assert_frame_equal(res, sol, check_dtype=False)
503546

547+
t = table.select("int_col", "double_col")
548+
sol = (
549+
pd_table[["int_col", "double_col"]]
550+
.fillna(pd_table.int_col)
551+
.reset_index(drop=True)
552+
)
553+
res = t.fill_null(t.int_col).execute().reset_index(drop=True)
554+
backend.assert_frame_equal(res, sol, check_dtype=False)
555+
res = t.fill_null(ibis._.int_col).execute().reset_index(drop=True)
556+
backend.assert_frame_equal(res, sol, check_dtype=False)
557+
res = t.fill_null("int_col").execute().reset_index(drop=True)
558+
backend.assert_frame_equal(res, sol, check_dtype=False)
559+
560+
561+
def test_table_fill_null_nans_are_untouched(con):
562+
# Test that NaNs are not filled when using fill_null
563+
t = ibis.memtable({"f": ["1.0", "nan", None]}).cast({"f": "float64"})
564+
e = t.fill_null(0.0)
565+
orig = set(con.to_pyarrow(t.f).to_pylist())
566+
assert len(orig) == 3
567+
assert 1.0 in orig
568+
assert None in orig
569+
assert any(isinstance(x, float) and np.isnan(x) for x in orig)
570+
after = set(con.to_pyarrow(e.f).to_pylist())
571+
assert len(after) == 3
572+
assert 1.0 in after
573+
assert 0.0 in after
574+
assert any(isinstance(x, float) and np.isnan(x) for x in after)
575+
504576

505577
def test_mutate_rename(alltypes):
506578
table = alltypes.select(["bool_col", "string_col"])

ibis/expr/operations/relations.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -471,7 +471,7 @@ def schema(self):
471471
class FillNull(Simple):
472472
"""Fill null values in the table."""
473473

474-
replacements: typing.Union[Value[dt.Numeric | dt.String], FrozenDict[str, Any]]
474+
replacements: FrozenDict[str, Value]
475475

476476

477477
@public
Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
r0 := UnboundTable: t
2+
i int64
3+
f float64
4+
5+
FillNull[r0]
6+
replacements:
7+
i: r0.i
8+
f: r0.i
Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
r0 := UnboundTable: t
2-
a int64
3-
b string
2+
i int64
3+
f float64
44

55
FillNull[r0]
66
replacements:
7-
a: 3
7+
i: 3
Lines changed: 5 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,8 @@
11
r0 := UnboundTable: t
2-
a int64
3-
b string
2+
i int64
3+
f float64
44

5-
r1 := Project[r0]
6-
a: r0.a
7-
8-
FillNull[r1]
5+
FillNull[r0]
96
replacements:
10-
3
7+
i: 3
8+
f: 3

ibis/expr/tests/snapshots/test_format/test_fill_null/fill_null_str_repr.txt

Lines changed: 0 additions & 10 deletions
This file was deleted.

ibis/expr/tests/test_format.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -294,16 +294,16 @@ def test_window_group_by(snapshot):
294294

295295

296296
def test_fill_null(snapshot):
297-
t = ibis.table(dict(a="int64", b="string"), name="t")
297+
t = ibis.table(dict(i="int64", f="float64"), name="t")
298298

299-
expr = t.fill_null({"a": 3})
299+
expr = t.fill_null({"i": 3})
300300
snapshot.assert_match(repr(expr), "fill_null_dict_repr.txt")
301301

302-
expr = t[["a"]].fill_null(3)
302+
expr = t.fill_null(3)
303303
snapshot.assert_match(repr(expr), "fill_null_int_repr.txt")
304304

305-
expr = t[["b"]].fill_null("foo")
306-
snapshot.assert_match(repr(expr), "fill_null_str_repr.txt")
305+
expr = t.fill_null(t.i)
306+
snapshot.assert_match(repr(expr), "fill_null_col_repr.txt")
307307

308308

309309
def test_asof_join(snapshot):

ibis/expr/types/generic.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -450,6 +450,7 @@ def fill_null(self, fill_value: Scalar, /) -> Self:
450450
[`Value.isnull()`](./expression-generic.qmd#ibis.expr.types.generic.Value.isnull)
451451
[`FloatingValue.isnan()`](./expression-numeric.qmd#ibis.expr.types.numeric.FloatingValue.isnan)
452452
[`FloatingValue.isinf()`](./expression-numeric.qmd#ibis.expr.types.numeric.FloatingValue.isinf)
453+
[`Table.fill_null()`](./expression-tables.qmd#ibis.expr.types.relations.Table.fill_null)
453454
454455
Examples
455456
--------

0 commit comments

Comments
 (0)