From d5b6c2f33b077f0c2a748ed924a9e6b7e8db798b Mon Sep 17 00:00:00 2001 From: Nick Crews Date: Fri, 17 Oct 2025 00:47:35 -0600 Subject: [PATCH] feat: make Table.fill_null() have consistent semantics with .select(), etc This is breaking. Before, doing `t.fill_null("a")` filled with the string literal "a". It also was impossible to use column references such as `t.fill_null(t.other_col)` or `t.fill_null(ibis._.other_col)`. Now, it is consistent with how .select(), .mutate(), etc all work. This also simplifies the internal representation so that the ops.FillNull always stores a `Mapping[str, ]` This also changes the behavior of the polars backend. Before, Table.fill_null() would fill in NaN. Now it does not, consistent with other backends, and the rest of ibis's Column.fill_null() semantics, including how polars treats Column.fill_null() --- ibis/backends/polars/compiler.py | 30 +--- ibis/backends/sql/rewrites.py | 16 +- ibis/backends/tests/conftest.py | 8 + ibis/backends/tests/test_generic.py | 103 ++++++++++-- ibis/expr/operations/relations.py | 2 +- .../test_fill_null/fill_null_col_repr.txt | 8 + .../test_fill_null/fill_null_dict_repr.txt | 6 +- .../test_fill_null/fill_null_int_repr.txt | 12 +- .../test_fill_null/fill_null_str_repr.txt | 10 -- ibis/expr/tests/test_format.py | 10 +- ibis/expr/types/generic.py | 1 + ibis/expr/types/relations.py | 146 ++++++++++-------- ibis/tests/expr/test_table.py | 4 +- 13 files changed, 209 insertions(+), 147 deletions(-) create mode 100644 ibis/expr/tests/snapshots/test_format/test_fill_null/fill_null_col_repr.txt delete mode 100644 ibis/expr/tests/snapshots/test_format/test_fill_null/fill_null_str_repr.txt diff --git a/ibis/backends/polars/compiler.py b/ibis/backends/polars/compiler.py index 8734c86674e4..bd4812401ce8 100644 --- a/ibis/backends/polars/compiler.py +++ b/ibis/backends/polars/compiler.py @@ -4,7 +4,6 @@ import datetime import math import operator -from collections.abc import Mapping from functools import partial, reduce, singledispatch from math import isnan @@ -375,36 +374,11 @@ def fill_null(op, **kw): table = translate(op.parent, **kw) columns = [] - - repls = op.replacements - - if isinstance(repls, Mapping): - - def get_replacement(name): - repl = repls.get(name) - if repl is not None: - return repl.value - else: - return None - - else: - value = repls.value - - def get_replacement(_): - return value - for name, dtype in op.parent.schema.items(): column = pl.col(name) - if isinstance(op.replacements, Mapping): - value = op.replacements.get(name) - else: - value = _literal_value(op.replacements) - - if value is not None: - if dtype.is_floating(): - column = column.fill_nan(value) + if (repl := op.replacements.get(name)) is not None: + value = translate(repl, **kw) column = column.fill_null(value) - # requires special treatment if the fill value has different datatype if dtype.is_timestamp(): column = column.cast(pl.Datetime) diff --git a/ibis/backends/sql/rewrites.py b/ibis/backends/sql/rewrites.py index addc87ef7fea..f3b66e711197 100644 --- a/ibis/backends/sql/rewrites.py +++ b/ibis/backends/sql/rewrites.py @@ -4,7 +4,6 @@ import operator import sys -from collections.abc import Mapping from functools import reduce from typing import TYPE_CHECKING, Any @@ -24,7 +23,7 @@ from ibis.expr.schema import Schema if TYPE_CHECKING: - from collections.abc import Sequence + from collections.abc import Mapping, Sequence x = var("x") y = var("y") @@ -150,22 +149,13 @@ def drop_columns_to_select(_, **kwargs): @replace(p.FillNull) def fill_null_to_select(_, **kwargs): """Rewrite FillNull to a Select node.""" - if isinstance(_.replacements, Mapping): - mapping = _.replacements - else: - mapping = { - name: _.replacements - for name, type in _.parent.schema.items() - if type.nullable - } - - if not mapping: + if not _.replacements: return _.parent selections = {} for name in _.parent.schema.names: col = ops.Field(_.parent, name) - if (value := mapping.get(name)) is not None: + if (value := _.replacements.get(name)) is not None: col = ops.Coalesce((col, value)) selections[name] = col diff --git a/ibis/backends/tests/conftest.py b/ibis/backends/tests/conftest.py index a9c69b64b57b..be2c4481ce69 100644 --- a/ibis/backends/tests/conftest.py +++ b/ibis/backends/tests/conftest.py @@ -78,3 +78,11 @@ def decorator(func): pytest.mark.notimpl(["datafusion", "exasol", "mssql", "druid", "oracle"]), ] NO_JSON_SUPPORT = combine_marks(NO_JSON_SUPPORT_MARKS) + + +NAN_TREATED_AS_NULL_MARKS = [ + pytest.mark.never( + ["sqlite", "mssql", "mysql"], reason="Treats NaN as NULL", raises=Exception + ), +] +NAN_TREATED_AS_NULL = combine_marks(NAN_TREATED_AS_NULL_MARKS) diff --git a/ibis/backends/tests/test_generic.py b/ibis/backends/tests/test_generic.py index 16ad4686295a..b26ae0693638 100644 --- a/ibis/backends/tests/test_generic.py +++ b/ibis/backends/tests/test_generic.py @@ -16,6 +16,7 @@ import ibis.expr.datatypes as dt import ibis.selectors as s from ibis import _ +from ibis.backends.tests.conftest import NAN_TREATED_AS_NULL from ibis.backends.tests.errors import ( ClickHouseDatabaseError, ExaQueryError, @@ -450,28 +451,61 @@ def test_table_fill_null_invalid(alltypes): com.IbisTypeError, match=r"Column 'invalid_col' is not found in table" ): alltypes.fill_null({"invalid_col": 0.0}) - with pytest.raises( com.IbisTypeError, match=r"Cannot fill_null on column 'string_col' of type.*" ): - alltypes[["int_col", "string_col"]].fill_null(0) - + alltypes.select("int_col", "string_col").fill_null(0) + with pytest.raises(AttributeError, match=r"'Table' object has no attribute 'oops'"): + alltypes.fill_null(_.oops) + with pytest.raises(AttributeError, match=r"'Table' object has no attribute 'oops'"): + alltypes.fill_null({"int_col": _.oops}) + with pytest.raises(com.IbisTypeError, match=r"Column 'oops' is not found in table"): + alltypes.fill_null("oops") + with pytest.raises(com.IbisTypeError, match=r"Column 'oops' is not found in table"): + alltypes.fill_null({"int_col": "oops"}) with pytest.raises( com.IbisTypeError, match=r"Cannot fill_null on column 'int_col' of type.*" ): - alltypes.fill_null({"int_col": "oops"}) + alltypes.fill_null({"int_col": ibis.literal("oops")}) @pytest.mark.parametrize( - "replacements", + ("ibis_replacements", "pd_replacements"), [ - param({"int_col": 20}, id="int"), - param({"double_col": -1, "string_col": "missing"}, id="double-int-str"), - param({"double_col": -1.5, "string_col": "missing"}, id="double-str"), - param({}, id="empty"), + param( + lambda _t: {"int_col": 20}, + lambda _t: {"int_col": 20}, + id="int", + ), + param( + lambda _t: {"double_col": -1, "string_col": ibis.literal("missing")}, + lambda _t: {"double_col": -1, "string_col": "missing"}, + id="double-int-str", + ), + param( + lambda _t: {"double_col": -1.5, "string_col": ibis.literal("missing")}, + lambda _t: {"double_col": -1.5, "string_col": "missing"}, + id="double-str", + ), + param( + lambda _t: {"double_col": "int_col"}, + lambda t: {"double_col": t["int_col"]}, + id="column-name", + ), + param( + lambda _t: {"double_col": ibis._.int_col}, + lambda t: {"double_col": t["int_col"]}, + id="column", + ), + param( + lambda t: {"double_col": t.int_col}, + lambda t: {"double_col": t["int_col"]}, + id="deferred", + ), + param(lambda _t: {}, lambda _t: {}, id="empty"), ], ) -def test_table_fill_null_mapping(backend, alltypes, replacements): +def test_table_fill_null_mapping(backend, alltypes, ibis_replacements, pd_replacements): table = alltypes.mutate( int_col=alltypes.int_col.nullif(1), double_col=alltypes.double_col.nullif(3.0), @@ -479,8 +513,8 @@ def test_table_fill_null_mapping(backend, alltypes, replacements): ).select("id", "int_col", "double_col", "string_col") pd_table = table.execute() - result = table.fill_null(replacements).execute().reset_index(drop=True) - expected = pd_table.fillna(replacements).reset_index(drop=True) + result = table.fill_null(ibis_replacements(table)).execute().reset_index(drop=True) + expected = pd_table.fillna(pd_replacements(pd_table)).reset_index(drop=True) backend.assert_frame_equal(result, expected, check_dtype=False) @@ -493,14 +527,55 @@ def test_table_fill_null_scalar(backend, alltypes): ).select("id", "int_col", "double_col", "string_col") pd_table = table.execute() - res = table[["int_col", "double_col"]].fill_null(0).execute().reset_index(drop=True) + res = ( + table.select("int_col", "double_col") + .fill_null(0) + .execute() + .reset_index(drop=True) + ) sol = pd_table[["int_col", "double_col"]].fillna(0).reset_index(drop=True) backend.assert_frame_equal(res, sol, check_dtype=False) - res = table[["string_col"]].fill_null("missing").execute().reset_index(drop=True) + res = ( + table.select("string_col") + .fill_null(ibis.literal("missing")) + .execute() + .reset_index(drop=True) + ) sol = pd_table[["string_col"]].fillna("missing").reset_index(drop=True) backend.assert_frame_equal(res, sol, check_dtype=False) + t = table.select("int_col", "double_col") + sol = ( + pd_table[["int_col", "double_col"]] + .fillna(pd_table.int_col) + .reset_index(drop=True) + ) + res = t.fill_null(t.int_col).execute().reset_index(drop=True) + backend.assert_frame_equal(res, sol, check_dtype=False) + res = t.fill_null(ibis._.int_col).execute().reset_index(drop=True) + backend.assert_frame_equal(res, sol, check_dtype=False) + res = t.fill_null("int_col").execute().reset_index(drop=True) + backend.assert_frame_equal(res, sol, check_dtype=False) + + +@NAN_TREATED_AS_NULL +def test_table_fill_null_nans_are_untouched(con): + # Test that NaNs are not filled when using fill_null + + def make_comparable(vals): + return {"nan" if (isinstance(x, float) and np.isnan(x)) else x for x in vals} + + pa_table = pa.table({"f": pa.array([1.0, float("nan"), None])}) + + before = ibis.memtable(pa_table) + actual_before = make_comparable(con.to_pyarrow(before.f).to_pylist()) + assert actual_before == {1.0, "nan", None} + + after = before.fill_null(0.0) + actual_after = make_comparable(con.to_pyarrow(after.f).to_pylist()) + assert actual_after == {1.0, "nan", 0.0} + def test_mutate_rename(alltypes): table = alltypes.select(["bool_col", "string_col"]) diff --git a/ibis/expr/operations/relations.py b/ibis/expr/operations/relations.py index 90dc400a8ded..871eb5186bc6 100644 --- a/ibis/expr/operations/relations.py +++ b/ibis/expr/operations/relations.py @@ -471,7 +471,7 @@ def schema(self): class FillNull(Simple): """Fill null values in the table.""" - replacements: typing.Union[Value[dt.Numeric | dt.String], FrozenDict[str, Any]] + replacements: FrozenDict[str, Value] @public diff --git a/ibis/expr/tests/snapshots/test_format/test_fill_null/fill_null_col_repr.txt b/ibis/expr/tests/snapshots/test_format/test_fill_null/fill_null_col_repr.txt new file mode 100644 index 000000000000..57dbdfb66f75 --- /dev/null +++ b/ibis/expr/tests/snapshots/test_format/test_fill_null/fill_null_col_repr.txt @@ -0,0 +1,8 @@ +r0 := UnboundTable: t + i int64 + f float64 + +FillNull[r0] + replacements: + i: r0.i + f: r0.i \ No newline at end of file diff --git a/ibis/expr/tests/snapshots/test_format/test_fill_null/fill_null_dict_repr.txt b/ibis/expr/tests/snapshots/test_format/test_fill_null/fill_null_dict_repr.txt index 3ddc4fa7edaa..8af61d012903 100644 --- a/ibis/expr/tests/snapshots/test_format/test_fill_null/fill_null_dict_repr.txt +++ b/ibis/expr/tests/snapshots/test_format/test_fill_null/fill_null_dict_repr.txt @@ -1,7 +1,7 @@ r0 := UnboundTable: t - a int64 - b string + i int64 + f float64 FillNull[r0] replacements: - a: 3 \ No newline at end of file + i: 3 \ No newline at end of file diff --git a/ibis/expr/tests/snapshots/test_format/test_fill_null/fill_null_int_repr.txt b/ibis/expr/tests/snapshots/test_format/test_fill_null/fill_null_int_repr.txt index b138e3a5fe4b..d8913b5f1ef3 100644 --- a/ibis/expr/tests/snapshots/test_format/test_fill_null/fill_null_int_repr.txt +++ b/ibis/expr/tests/snapshots/test_format/test_fill_null/fill_null_int_repr.txt @@ -1,10 +1,8 @@ r0 := UnboundTable: t - a int64 - b string + i int64 + f float64 -r1 := Project[r0] - a: r0.a - -FillNull[r1] +FillNull[r0] replacements: - 3 \ No newline at end of file + i: 3 + f: 3 \ No newline at end of file diff --git a/ibis/expr/tests/snapshots/test_format/test_fill_null/fill_null_str_repr.txt b/ibis/expr/tests/snapshots/test_format/test_fill_null/fill_null_str_repr.txt deleted file mode 100644 index 3ee225e52931..000000000000 --- a/ibis/expr/tests/snapshots/test_format/test_fill_null/fill_null_str_repr.txt +++ /dev/null @@ -1,10 +0,0 @@ -r0 := UnboundTable: t - a int64 - b string - -r1 := Project[r0] - b: r0.b - -FillNull[r1] - replacements: - 'foo' \ No newline at end of file diff --git a/ibis/expr/tests/test_format.py b/ibis/expr/tests/test_format.py index 0ccc778bbc33..68a253656781 100644 --- a/ibis/expr/tests/test_format.py +++ b/ibis/expr/tests/test_format.py @@ -294,16 +294,16 @@ def test_window_group_by(snapshot): def test_fill_null(snapshot): - t = ibis.table(dict(a="int64", b="string"), name="t") + t = ibis.table(dict(i="int64", f="float64"), name="t") - expr = t.fill_null({"a": 3}) + expr = t.fill_null({"i": 3}) snapshot.assert_match(repr(expr), "fill_null_dict_repr.txt") - expr = t[["a"]].fill_null(3) + expr = t.fill_null(3) snapshot.assert_match(repr(expr), "fill_null_int_repr.txt") - expr = t[["b"]].fill_null("foo") - snapshot.assert_match(repr(expr), "fill_null_str_repr.txt") + expr = t.fill_null(t.i) + snapshot.assert_match(repr(expr), "fill_null_col_repr.txt") def test_asof_join(snapshot): diff --git a/ibis/expr/types/generic.py b/ibis/expr/types/generic.py index 63b1c264623d..4a6025a6f0dc 100644 --- a/ibis/expr/types/generic.py +++ b/ibis/expr/types/generic.py @@ -450,6 +450,7 @@ def fill_null(self, fill_value: Scalar, /) -> Self: [`Value.isnull()`](./expression-generic.qmd#ibis.expr.types.generic.Value.isnull) [`FloatingValue.isnan()`](./expression-numeric.qmd#ibis.expr.types.numeric.FloatingValue.isnan) [`FloatingValue.isinf()`](./expression-numeric.qmd#ibis.expr.types.numeric.FloatingValue.isinf) + [`Table.fill_null()`](./expression-tables.qmd#ibis.expr.types.relations.Table.fill_null) Examples -------- diff --git a/ibis/expr/types/relations.py b/ibis/expr/types/relations.py index c1164e672fee..e63aa2af05da 100644 --- a/ibis/expr/types/relations.py +++ b/ibis/expr/types/relations.py @@ -3075,9 +3075,14 @@ def drop_null( subset = tuple(self.bind(subset)) return ops.DropNull(self, how, subset).to_expr() - def fill_null(self, replacements: ir.Scalar | Mapping[str, ir.Scalar], /) -> Table: + def fill_null( + self, replacements: ir.Value | Any | Mapping[str, ir.Value | Any], / + ) -> Table: """Fill null values in a table expression. + This only replaces genuine `NULL` values, it does NOT affect + `NaN` and `inf` values for floating point types. + ::: {.callout-note} ## There is potential lack of type stability with the `fill_null` API @@ -3088,9 +3093,10 @@ def fill_null(self, replacements: ir.Scalar | Mapping[str, ir.Scalar], /) -> Tab Parameters ---------- replacements - Value with which to fill nulls. If `replacements` is a mapping, the - keys are column names that map to their replacement value. If - passed as a scalar all columns are filled with that value. + Value with which to fill nulls. + If `replacements` is a mapping, the keys are column names that map to their replacement value. + Otherwise, all columns are filled with this single value. + Values are interpreted similar to in [`Table.select()`](#ibis.expr.types.relations.Table.select). Returns ------- @@ -3101,76 +3107,88 @@ def fill_null(self, replacements: ir.Scalar | Mapping[str, ir.Scalar], /) -> Tab -------- >>> import ibis >>> ibis.options.interactive = True - >>> t = ibis.examples.penguins.fetch() - >>> t.sex - ┏━━━━━━━━┓ - ┃ sex ┃ - ┡━━━━━━━━┩ - │ string │ - ├────────┤ - │ male │ - │ female │ - │ female │ - │ NULL │ - │ female │ - │ male │ - │ female │ - │ male │ - │ NULL │ - │ NULL │ - │ … │ - └────────┘ - >>> t.fill_null({"sex": "unrecorded"}).sex - ┏━━━━━━━━━━━━┓ - ┃ sex ┃ - ┡━━━━━━━━━━━━┩ - │ string │ - ├────────────┤ - │ male │ - │ female │ - │ female │ - │ unrecorded │ - │ female │ - │ male │ - │ female │ - │ male │ - │ unrecorded │ - │ unrecorded │ - │ … │ - └────────────┘ + >>> t = ibis.examples.penguins.fetch().select("sex", "species").head() + >>> t + ┏━━━━━━━━┳━━━━━━━━━┓ + ┃ sex ┃ species ┃ + ┡━━━━━━━━╇━━━━━━━━━┩ + │ string │ string │ + ├────────┼─────────┤ + │ male │ Adelie │ + │ female │ Adelie │ + │ female │ Adelie │ + │ NULL │ Adelie │ + │ female │ Adelie │ + └────────┴─────────┘ + >>> t.fill_null(ibis.literal("unrecorded")) + ┏━━━━━━━━━━━━┳━━━━━━━━━┓ + ┃ sex ┃ species ┃ + ┡━━━━━━━━━━━━╇━━━━━━━━━┩ + │ string │ string │ + ├────────────┼─────────┤ + │ male │ Adelie │ + │ female │ Adelie │ + │ female │ Adelie │ + │ unrecorded │ Adelie │ + │ female │ Adelie │ + └────────────┴─────────┘ + >>> t.fill_null({"sex": "species"}) + ┏━━━━━━━━┳━━━━━━━━━┓ + ┃ sex ┃ species ┃ + ┡━━━━━━━━╇━━━━━━━━━┩ + │ string │ string │ + ├────────┼─────────┤ + │ male │ Adelie │ + │ female │ Adelie │ + │ female │ Adelie │ + │ Adelie │ Adelie │ + │ female │ Adelie │ + └────────┴─────────┘ + + See Also + -------- + [`Value.fill_null()`](./expression-generic.qmd#ibis.expr.types.generic.Value.fill_null) """ schema = self.schema() + errors = [] + bound_replacements = {} + + def check_castable(col: str, val_type: dt.DataType) -> None: + col_type = schema[col] + if not val_type.castable(col_type): + errors.append( + com.IbisTypeError( + f"Cannot fill_null on column {col!r} of type {col_type} with a " + f"value of type {val_type}" + ) + ) if isinstance(replacements, Mapping): for col, val in replacements.items(): + (bound_val,) = self.bind(val) if col not in schema: columns_formatted = ", ".join(map(repr, schema.names)) - raise com.IbisTypeError( - f"Column {col!r} is not found in table. " - f"Existing columns: {columns_formatted}." - ) from None - - col_type = schema[col] - val_type = val.type() if isinstance(val, Expr) else dt.infer(val) - if not val_type.castable(col_type): - raise com.IbisTypeError( - f"Cannot fill_null on column {col!r} of type {col_type} with a " - f"value of type {val_type}" + errors.append( + com.IbisTypeError( + f"Column {col!r} is not found in table. " + f"Existing columns: {columns_formatted}." + ) ) + continue + check_castable(col, bound_val.type()) + bound_replacements[col] = bound_val else: - val_type = ( - replacements.type() - if isinstance(replacements, Expr) - else dt.infer(replacements) - ) + (bound_val,) = self.bind(replacements) for col, col_type in schema.items(): - if col_type.nullable and not val_type.castable(col_type): - raise com.IbisTypeError( - f"Cannot fill_null on column {col!r} of type {col_type} with a " - f"value of type {val_type} - pass in an explicit mapping " - f"of fill values to `fill_null` instead." - ) - return ops.FillNull(self, replacements).to_expr() + if not col_type.nullable: + continue + check_castable(col, bound_val.type()) + bound_replacements[col] = bound_val + if errors: + raise com.IbisTypeError( + "Invalid fill_null replacements:\n" + "\n".join(str(e) for e in errors) + ) + return ops.FillNull(self, bound_replacements).to_expr() @deprecated(as_of="9.1", instead="use drop_null instead") def dropna( diff --git a/ibis/tests/expr/test_table.py b/ibis/tests/expr/test_table.py index 80321d98c273..7537f403275f 100644 --- a/ibis/tests/expr/test_table.py +++ b/ibis/tests/expr/test_table.py @@ -2119,9 +2119,9 @@ def test_table_dropna_depr_warn(): # TODO: remove when fillna is fully deprecated def test_table_fillna_depr_warn(): - t = ibis.table(schema={"a": "int", "b": "str"}) + t = ibis.table(schema={"a": "int"}) with pytest.warns(FutureWarning, match="v9.1"): - t.fillna({"b": "missing"}) + t.fillna({"a": 5}) def test_dummy_table_disallows_aliases():