From 3d17e56136abde2ed265063b141df50c42dd1fe8 Mon Sep 17 00:00:00 2001 From: Marco Gorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Wed, 13 Aug 2025 19:57:37 +0100 Subject: [PATCH 01/30] ENH: Introduce `pandas.col` --- pandas/__init__.py | 2 + pandas/core/col.py | 185 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 187 insertions(+) create mode 100644 pandas/core/col.py diff --git a/pandas/__init__.py b/pandas/__init__.py index 8b92ad6cdfebb..cc786d1141c48 100644 --- a/pandas/__init__.py +++ b/pandas/__init__.py @@ -105,6 +105,7 @@ Series, DataFrame, ) +from pandas.core.col import col from pandas.core.dtypes.dtypes import SparseDtype @@ -281,6 +282,7 @@ "array", "arrays", "bdate_range", + "col", "concat", "crosstab", "cut", diff --git a/pandas/core/col.py b/pandas/core/col.py new file mode 100644 index 0000000000000..a5d9f8d430c7c --- /dev/null +++ b/pandas/core/col.py @@ -0,0 +1,185 @@ +from __future__ import annotations + +from collections.abc import ( + Callable, + Hashable, +) +from typing import ( + TYPE_CHECKING, + Any, +) + +from pandas.core.series import Series + +if TYPE_CHECKING: + from pandas import DataFrame + + +def parse_args(df: DataFrame, *args) -> tuple[Series]: + return tuple([x(df) if isinstance(x, Expr) else x for x in args]) + + +def parse_kwargs(df: DataFrame, **kwargs) -> dict[Hashable, Series]: + return { + key: val(df) if isinstance(val, Expr) else val for key, val in kwargs.items() + } + + +class Expr: + def __init__(self, func: Callable[[DataFrame], Series]) -> None: + self._func = func + + def __call__(self, df: DataFrame) -> Series: + return self._func(df) + + # namespaces + @property + def dt(self) -> NamespaceExpr: + return NamespaceExpr(self, "dt") + + @property + def str(self) -> NamespaceExpr: + return NamespaceExpr(self, "str") + + @property + def cat(self) -> NamespaceExpr: + return NamespaceExpr(self, "cat") + + @property + def list(self) -> NamespaceExpr: + return NamespaceExpr(self, "list") + + @property + def sparse(self) -> NamespaceExpr: + return NamespaceExpr(self, "sparse") + + @property + def struct(self) -> NamespaceExpr: + return NamespaceExpr(self, "struct") + + # Binary ops + + def __add__(self, other) -> Expr: + if isinstance(other, Expr): + return Expr(lambda df: self(df).__add__(other(df))) + return Expr(lambda df: self(df).__add__(other)) + + def __radd__(self, other) -> Expr: + if isinstance(other, Expr): + return Expr(lambda df: self(df).__radd__(other(df))) + return Expr(lambda df: self(df).__radd__(other)) + + def __sub__(self, other) -> Expr: + if isinstance(other, Expr): + return Expr(lambda df: self(df).__sub__(other(df))) + return Expr(lambda df: self(df).__sub__(other)) + + def __rsub__(self, other) -> Expr: + if isinstance(other, Expr): + return Expr(lambda df: self(df).__rsub__(other(df))) + return Expr(lambda df: self(df).__rsub__(other)) + + def __mul__(self, other) -> Expr: + if isinstance(other, Expr): + return Expr(lambda df: self(df).__mul__(other(df))) + return Expr(lambda df: self(df).__mul__(other)) + + def __rmul__(self, other) -> Expr: + if isinstance(other, Expr): + return Expr(lambda df: self(df).__rmul__(other(df))) + return Expr(lambda df: self(df).__rmul__(other)) + + def __truediv__(self, other) -> Expr: + if isinstance(other, Expr): + return Expr(lambda df: self(df).__truediv__(other(df))) + return Expr(lambda df: self(df).__truediv__(other)) + + def __rtruediv__(self, other) -> Expr: + if isinstance(other, Expr): + return Expr(lambda df: self(df).__rtruediv__(other(df))) + return Expr(lambda df: self(df).__rtruediv__(other)) + + def __floordiv__(self, other) -> Expr: + if isinstance(other, Expr): + return Expr(lambda df: self(df).__floordiv__(other(df))) + return Expr(lambda df: self(df).__floordiv__(other)) + + def __rfloordiv__(self, other) -> Expr: + if isinstance(other, Expr): + return Expr(lambda df: self(df).__rfloordiv__(other(df))) + return Expr(lambda df: self(df).__rfloordiv__(other)) + + def __ge__(self, other) -> Expr: + if isinstance(other, Expr): + return Expr(lambda df: self(df).__ge__(other(df))) + return Expr(lambda df: self(df).__ge__(other)) + + def __gt__(self, other) -> Expr: + if isinstance(other, Expr): + return Expr(lambda df: self(df).__gt__(other(df))) + return Expr(lambda df: self(df).__gt__(other)) + + def __le__(self, other) -> Expr: + if isinstance(other, Expr): + return Expr(lambda df: self(df).__le__(other(df))) + return Expr(lambda df: self(df).__le__(other)) + + def __lt__(self, other) -> Expr: + if isinstance(other, Expr): + return Expr(lambda df: self(df).__lt__(other(df))) + return Expr(lambda df: self(df).__lt__(other)) + + def __eq__(self, other) -> Expr: + if isinstance(other, Expr): + return Expr(lambda df: self(df).__eq__(other(df))) + return Expr(lambda df: self(df).__eq__(other)) + + def __neq__(self, other) -> Expr: + if isinstance(other, Expr): + return Expr(lambda df: self(df).__neq__(other(df))) + return Expr(lambda df: self(df).__neq__(other)) + + def __mod__(self, other) -> Expr: + if isinstance(other, Expr): + return Expr(lambda df: self(df).__mod__(other(df))) + return Expr(lambda df: self(df).__mod__(other)) + + # Everything else + + def __getattr__(self, attr: str) -> Expr: + def func(df: DataFrame, *args: Any, **kwargs: Any) -> Series: + args = parse_args(df, *args) + kwargs = parse_kwargs(df, **kwargs) + return getattr(self(df), attr)(*args, **kwargs) + + return lambda *args, **kwargs: Expr(lambda df: func(df, *args, **kwargs)) + + +class NamespaceExpr: + def __init__(self, func: Callable[[DataFrame], Series], namespace: str) -> None: + self._func = func + self._namespace = namespace + + def __getattr__(self, attr: str) -> Any: + if isinstance(getattr(getattr(Series, self._namespace), attr), property): + + def func(df): + return getattr(getattr(self._func(df), self._namespace), attr) + + return Expr(func) + + def func(df, *args, **kwargs): + args = parse_args(df, *args) + kwargs = parse_kwargs(df, **kwargs) + return getattr(getattr(self._func(df), self._namespace), attr)( + *args, **kwargs + ) + + return lambda *args, **kwargs: Expr(lambda df: func(df, *args, **kwargs)) + + +def col(col_name: Hashable) -> Expr: + if not isinstance(col_name, Hashable): + msg = f"Expected Hashable, got: {type(col_name)}" + raise TypeError(msg) + return Expr(lambda df: df[col_name]) From 9fcaba33e93241145ae464d243e516b5bcd58396 Mon Sep 17 00:00:00 2001 From: Marco Gorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Wed, 13 Aug 2025 22:33:52 +0100 Subject: [PATCH 02/30] api test, typing --- pandas/core/col.py | 151 ++++++++++++++++++----------------- pandas/tests/api/test_api.py | 1 + 2 files changed, 80 insertions(+), 72 deletions(-) diff --git a/pandas/core/col.py b/pandas/core/col.py index a5d9f8d430c7c..aa9c656437df5 100644 --- a/pandas/core/col.py +++ b/pandas/core/col.py @@ -15,22 +15,30 @@ from pandas import DataFrame -def parse_args(df: DataFrame, *args) -> tuple[Series]: - return tuple([x(df) if isinstance(x, Expr) else x for x in args]) +def parse_args(df: DataFrame, *args: Any) -> tuple[Series]: + return tuple([x._func(df) if isinstance(x, Expr) else x for x in args]) -def parse_kwargs(df: DataFrame, **kwargs) -> dict[Hashable, Series]: +def parse_kwargs(df: DataFrame, **kwargs: Any) -> dict[Hashable, Series]: return { - key: val(df) if isinstance(val, Expr) else val for key, val in kwargs.items() + key: val._func(df) if isinstance(val, Expr) else val + for key, val in kwargs.items() } class Expr: - def __init__(self, func: Callable[[DataFrame], Series]) -> None: + def __init__(self, func: Callable[[DataFrame], Any]) -> None: self._func = func def __call__(self, df: DataFrame) -> Series: - return self._func(df) + result = self._func(df) + if not isinstance(result, Series): + msg = ( + "Expected function which returns Series, " + f"got function which returns: {type(result)}" + ) + raise TypeError(msg) + return result # namespaces @property @@ -59,120 +67,119 @@ def struct(self) -> NamespaceExpr: # Binary ops - def __add__(self, other) -> Expr: + def __add__(self, other: Any) -> Expr: if isinstance(other, Expr): - return Expr(lambda df: self(df).__add__(other(df))) - return Expr(lambda df: self(df).__add__(other)) + return Expr(lambda df: self._func(df).__add__(other._func(df))) + return Expr(lambda df: self._func(df).__add__(other)) - def __radd__(self, other) -> Expr: + def __radd__(self, other: Any) -> Expr: if isinstance(other, Expr): - return Expr(lambda df: self(df).__radd__(other(df))) - return Expr(lambda df: self(df).__radd__(other)) + return Expr(lambda df: self._func(df).__radd__(other._func(df))) + return Expr(lambda df: self._func(df).__radd__(other)) - def __sub__(self, other) -> Expr: + def __sub__(self, other: Any) -> Expr: if isinstance(other, Expr): - return Expr(lambda df: self(df).__sub__(other(df))) - return Expr(lambda df: self(df).__sub__(other)) + return Expr(lambda df: self._func(df).__sub__(other._func(df))) + return Expr(lambda df: self._func(df).__sub__(other)) - def __rsub__(self, other) -> Expr: + def __rsub__(self, other: Any) -> Expr: if isinstance(other, Expr): - return Expr(lambda df: self(df).__rsub__(other(df))) - return Expr(lambda df: self(df).__rsub__(other)) + return Expr(lambda df: self._func(df).__rsub__(other._func(df))) + return Expr(lambda df: self._func(df).__rsub__(other)) - def __mul__(self, other) -> Expr: + def __mul__(self, other: Any) -> Expr: if isinstance(other, Expr): - return Expr(lambda df: self(df).__mul__(other(df))) - return Expr(lambda df: self(df).__mul__(other)) + return Expr(lambda df: self._func(df).__mul__(other._func(df))) + return Expr(lambda df: self._func(df).__mul__(other)) - def __rmul__(self, other) -> Expr: + def __rmul__(self, other: Any) -> Expr: if isinstance(other, Expr): - return Expr(lambda df: self(df).__rmul__(other(df))) - return Expr(lambda df: self(df).__rmul__(other)) + return Expr(lambda df: self._func(df).__rmul__(other._func(df))) + return Expr(lambda df: self._func(df).__rmul__(other)) - def __truediv__(self, other) -> Expr: + def __truediv__(self, other: Any) -> Expr: if isinstance(other, Expr): - return Expr(lambda df: self(df).__truediv__(other(df))) - return Expr(lambda df: self(df).__truediv__(other)) + return Expr(lambda df: self._func(df).__truediv__(other._func(df))) + return Expr(lambda df: self._func(df).__truediv__(other)) - def __rtruediv__(self, other) -> Expr: + def __rtruediv__(self, other: Any) -> Expr: if isinstance(other, Expr): - return Expr(lambda df: self(df).__rtruediv__(other(df))) - return Expr(lambda df: self(df).__rtruediv__(other)) + return Expr(lambda df: self._func(df).__rtruediv__(other._func(df))) + return Expr(lambda df: self._func(df).__rtruediv__(other)) - def __floordiv__(self, other) -> Expr: + def __floordiv__(self, other: Any) -> Expr: if isinstance(other, Expr): - return Expr(lambda df: self(df).__floordiv__(other(df))) - return Expr(lambda df: self(df).__floordiv__(other)) + return Expr(lambda df: self._func(df).__floordiv__(other._func(df))) + return Expr(lambda df: self._func(df).__floordiv__(other)) - def __rfloordiv__(self, other) -> Expr: + def __rfloordiv__(self, other: Any) -> Expr: if isinstance(other, Expr): - return Expr(lambda df: self(df).__rfloordiv__(other(df))) - return Expr(lambda df: self(df).__rfloordiv__(other)) + return Expr(lambda df: self._func(df).__rfloordiv__(other._func(df))) + return Expr(lambda df: self._func(df).__rfloordiv__(other)) - def __ge__(self, other) -> Expr: + def __ge__(self, other: Any) -> Expr: if isinstance(other, Expr): - return Expr(lambda df: self(df).__ge__(other(df))) - return Expr(lambda df: self(df).__ge__(other)) + return Expr(lambda df: self._func(df).__ge__(other._func(df))) + return Expr(lambda df: self._func(df).__ge__(other)) - def __gt__(self, other) -> Expr: + def __gt__(self, other: Any) -> Expr: if isinstance(other, Expr): - return Expr(lambda df: self(df).__gt__(other(df))) - return Expr(lambda df: self(df).__gt__(other)) + return Expr(lambda df: self._func(df).__gt__(other._func(df))) + return Expr(lambda df: self._func(df).__gt__(other)) - def __le__(self, other) -> Expr: + def __le__(self, other: Any) -> Expr: if isinstance(other, Expr): - return Expr(lambda df: self(df).__le__(other(df))) - return Expr(lambda df: self(df).__le__(other)) + return Expr(lambda df: self._func(df).__le__(other._func(df))) + return Expr(lambda df: self._func(df).__le__(other)) - def __lt__(self, other) -> Expr: + def __lt__(self, other: Any) -> Expr: if isinstance(other, Expr): - return Expr(lambda df: self(df).__lt__(other(df))) - return Expr(lambda df: self(df).__lt__(other)) + return Expr(lambda df: self._func(df).__lt__(other._func(df))) + return Expr(lambda df: self._func(df).__lt__(other)) - def __eq__(self, other) -> Expr: + def __eq__(self, other: object) -> Expr: # type: ignore[override] if isinstance(other, Expr): - return Expr(lambda df: self(df).__eq__(other(df))) - return Expr(lambda df: self(df).__eq__(other)) + return Expr(lambda df: self._func(df).__eq__(other._func(df))) + return Expr(lambda df: self._func(df).__eq__(other)) - def __neq__(self, other) -> Expr: + def __ne__(self, other: object) -> Expr: # type: ignore[override] if isinstance(other, Expr): - return Expr(lambda df: self(df).__neq__(other(df))) - return Expr(lambda df: self(df).__neq__(other)) + return Expr(lambda df: self._func(df).__ne__(other._func(df))) + return Expr(lambda df: self._func(df).__ne__(other)) - def __mod__(self, other) -> Expr: + def __mod__(self, other: Any) -> Expr: if isinstance(other, Expr): - return Expr(lambda df: self(df).__mod__(other(df))) - return Expr(lambda df: self(df).__mod__(other)) + return Expr(lambda df: self._func(df).__mod__(other._func(df))) + return Expr(lambda df: self._func(df).__mod__(other)) # Everything else - def __getattr__(self, attr: str) -> Expr: - def func(df: DataFrame, *args: Any, **kwargs: Any) -> Series: - args = parse_args(df, *args) - kwargs = parse_kwargs(df, **kwargs) - return getattr(self(df), attr)(*args, **kwargs) + # Function "pandas.core.col.Expr.str" is not valid as a type + def __getattr__(self, attr: str, /) -> Any: # type: ignore[valid-type] + def func(df: DataFrame, *args: Any, **kwargs: Any) -> Any: + parsed_args = parse_args(df, *args) + parsed_kwargs = parse_kwargs(df, **kwargs) + return getattr(self(df), attr)(*parsed_args, **parsed_kwargs) return lambda *args, **kwargs: Expr(lambda df: func(df, *args, **kwargs)) class NamespaceExpr: - def __init__(self, func: Callable[[DataFrame], Series], namespace: str) -> None: + def __init__(self, func: Callable[[DataFrame], Any], namespace: str) -> None: self._func = func self._namespace = namespace def __getattr__(self, attr: str) -> Any: if isinstance(getattr(getattr(Series, self._namespace), attr), property): + return Expr( + lambda df: getattr(getattr(self._func(df), self._namespace), attr) + ) - def func(df): - return getattr(getattr(self._func(df), self._namespace), attr) - - return Expr(func) - - def func(df, *args, **kwargs): - args = parse_args(df, *args) - kwargs = parse_kwargs(df, **kwargs) + def func(df: DataFrame, *args: Any, **kwargs: Any) -> Any: + parsed_args = parse_args(df, *args) + parsed_kwargs = parse_kwargs(df, **kwargs) return getattr(getattr(self._func(df), self._namespace), attr)( - *args, **kwargs + *parsed_args, **parsed_kwargs ) return lambda *args, **kwargs: Expr(lambda df: func(df, *args, **kwargs)) diff --git a/pandas/tests/api/test_api.py b/pandas/tests/api/test_api.py index c2e77b69aadcb..f5ba784470072 100644 --- a/pandas/tests/api/test_api.py +++ b/pandas/tests/api/test_api.py @@ -107,6 +107,7 @@ class TestPDApi(Base): funcs = [ "array", "bdate_range", + "col", "concat", "crosstab", "cut", From b41b99de55a9927bf0a87c3c89fc1d81520b6a7c Mon Sep 17 00:00:00 2001 From: Marco Gorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Thu, 14 Aug 2025 10:08:26 +0100 Subject: [PATCH 03/30] typing --- pandas/core/col.py | 132 ++++++++++++++++++--------------------------- 1 file changed, 51 insertions(+), 81 deletions(-) diff --git a/pandas/core/col.py b/pandas/core/col.py index aa9c656437df5..0568b9625ec55 100644 --- a/pandas/core/col.py +++ b/pandas/core/col.py @@ -9,6 +9,8 @@ Any, ) +from pandas.core.dtypes.common import is_scalar + from pandas.core.series import Series if TYPE_CHECKING: @@ -32,130 +34,73 @@ def __init__(self, func: Callable[[DataFrame], Any]) -> None: def __call__(self, df: DataFrame) -> Series: result = self._func(df) - if not isinstance(result, Series): + if not (isinstance(result, Series) or is_scalar(result)): msg = ( - "Expected function which returns Series, " + "Expected function which returns Series or scalar, " f"got function which returns: {type(result)}" ) raise TypeError(msg) return result - # namespaces - @property - def dt(self) -> NamespaceExpr: - return NamespaceExpr(self, "dt") - - @property - def str(self) -> NamespaceExpr: - return NamespaceExpr(self, "str") - - @property - def cat(self) -> NamespaceExpr: - return NamespaceExpr(self, "cat") - - @property - def list(self) -> NamespaceExpr: - return NamespaceExpr(self, "list") - - @property - def sparse(self) -> NamespaceExpr: - return NamespaceExpr(self, "sparse") - - @property - def struct(self) -> NamespaceExpr: - return NamespaceExpr(self, "struct") + def _with_binary_op(self, op: str, other: Any) -> Expr: + if isinstance(other, Expr): + return Expr(lambda df: getattr(self._func(df), op)(other._func(df))) + return Expr(lambda df: getattr(self._func(df), op)(other)) # Binary ops - def __add__(self, other: Any) -> Expr: - if isinstance(other, Expr): - return Expr(lambda df: self._func(df).__add__(other._func(df))) - return Expr(lambda df: self._func(df).__add__(other)) + return self._with_binary_op("__add__", other) def __radd__(self, other: Any) -> Expr: - if isinstance(other, Expr): - return Expr(lambda df: self._func(df).__radd__(other._func(df))) - return Expr(lambda df: self._func(df).__radd__(other)) + return self._with_binary_op("__radd__", other) def __sub__(self, other: Any) -> Expr: - if isinstance(other, Expr): - return Expr(lambda df: self._func(df).__sub__(other._func(df))) - return Expr(lambda df: self._func(df).__sub__(other)) + return self._with_binary_op("__sub__", other) def __rsub__(self, other: Any) -> Expr: - if isinstance(other, Expr): - return Expr(lambda df: self._func(df).__rsub__(other._func(df))) - return Expr(lambda df: self._func(df).__rsub__(other)) + return self._with_binary_op("__rsub__", other) def __mul__(self, other: Any) -> Expr: - if isinstance(other, Expr): - return Expr(lambda df: self._func(df).__mul__(other._func(df))) - return Expr(lambda df: self._func(df).__mul__(other)) + return self._with_binary_op("__mul__", other) def __rmul__(self, other: Any) -> Expr: - if isinstance(other, Expr): - return Expr(lambda df: self._func(df).__rmul__(other._func(df))) - return Expr(lambda df: self._func(df).__rmul__(other)) + return self._with_binary_op("__rmul__", other) def __truediv__(self, other: Any) -> Expr: - if isinstance(other, Expr): - return Expr(lambda df: self._func(df).__truediv__(other._func(df))) - return Expr(lambda df: self._func(df).__truediv__(other)) + return self._with_binary_op("__truediv__", other) def __rtruediv__(self, other: Any) -> Expr: - if isinstance(other, Expr): - return Expr(lambda df: self._func(df).__rtruediv__(other._func(df))) - return Expr(lambda df: self._func(df).__rtruediv__(other)) + return self._with_binary_op("__rtruediv__", other) def __floordiv__(self, other: Any) -> Expr: - if isinstance(other, Expr): - return Expr(lambda df: self._func(df).__floordiv__(other._func(df))) - return Expr(lambda df: self._func(df).__floordiv__(other)) + return self._with_binary_op("__floordiv__", other) def __rfloordiv__(self, other: Any) -> Expr: - if isinstance(other, Expr): - return Expr(lambda df: self._func(df).__rfloordiv__(other._func(df))) - return Expr(lambda df: self._func(df).__rfloordiv__(other)) + return self._with_binary_op("__rfloordiv__", other) def __ge__(self, other: Any) -> Expr: - if isinstance(other, Expr): - return Expr(lambda df: self._func(df).__ge__(other._func(df))) - return Expr(lambda df: self._func(df).__ge__(other)) + return self._with_binary_op("__ge__", other) def __gt__(self, other: Any) -> Expr: - if isinstance(other, Expr): - return Expr(lambda df: self._func(df).__gt__(other._func(df))) - return Expr(lambda df: self._func(df).__gt__(other)) + return self._with_binary_op("__gt__", other) def __le__(self, other: Any) -> Expr: - if isinstance(other, Expr): - return Expr(lambda df: self._func(df).__le__(other._func(df))) - return Expr(lambda df: self._func(df).__le__(other)) + return self._with_binary_op("__le__", other) def __lt__(self, other: Any) -> Expr: - if isinstance(other, Expr): - return Expr(lambda df: self._func(df).__lt__(other._func(df))) - return Expr(lambda df: self._func(df).__lt__(other)) + return self._with_binary_op("__lt__", other) def __eq__(self, other: object) -> Expr: # type: ignore[override] - if isinstance(other, Expr): - return Expr(lambda df: self._func(df).__eq__(other._func(df))) - return Expr(lambda df: self._func(df).__eq__(other)) + return self._with_binary_op("__eq__", other) def __ne__(self, other: object) -> Expr: # type: ignore[override] - if isinstance(other, Expr): - return Expr(lambda df: self._func(df).__ne__(other._func(df))) - return Expr(lambda df: self._func(df).__ne__(other)) + return self._with_binary_op("__ne__", other) def __mod__(self, other: Any) -> Expr: - if isinstance(other, Expr): - return Expr(lambda df: self._func(df).__mod__(other._func(df))) - return Expr(lambda df: self._func(df).__mod__(other)) + return self._with_binary_op("__mod__", other) # Everything else - - # Function "pandas.core.col.Expr.str" is not valid as a type - def __getattr__(self, attr: str, /) -> Any: # type: ignore[valid-type] + def __getattr__(self, attr: str, /) -> Callable[..., Expr]: def func(df: DataFrame, *args: Any, **kwargs: Any) -> Any: parsed_args = parse_args(df, *args) parsed_kwargs = parse_kwargs(df, **kwargs) @@ -163,6 +108,31 @@ def func(df: DataFrame, *args: Any, **kwargs: Any) -> Any: return lambda *args, **kwargs: Expr(lambda df: func(df, *args, **kwargs)) + # Namespaces + @property + def dt(self) -> NamespaceExpr: + return NamespaceExpr(self, "dt") + + @property + def str(self) -> NamespaceExpr: + return NamespaceExpr(self, "str") + + @property + def cat(self) -> NamespaceExpr: + return NamespaceExpr(self, "cat") + + @property + def list(self) -> NamespaceExpr: + return NamespaceExpr(self, "list") + + @property + def sparse(self) -> NamespaceExpr: + return NamespaceExpr(self, "sparse") + + @property + def struct(self) -> NamespaceExpr: + return NamespaceExpr(self, "struct") + class NamespaceExpr: def __init__(self, func: Callable[[DataFrame], Any], namespace: str) -> None: From 60c09c2228bc568af7e55f59ba696bbb6009d45a Mon Sep 17 00:00:00 2001 From: Marco Gorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Thu, 14 Aug 2025 14:44:54 +0100 Subject: [PATCH 04/30] add pretty repr --- pandas/core/col.py | 100 +++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 92 insertions(+), 8 deletions(-) diff --git a/pandas/core/col.py b/pandas/core/col.py index 0568b9625ec55..4bb41f019a9ab 100644 --- a/pandas/core/col.py +++ b/pandas/core/col.py @@ -17,6 +17,27 @@ from pandas import DataFrame +OP_SYMBOLS = { + "__add__": "+", + "__radd__": "+", + "__sub__": "-", + "__rsub__": "-", + "__mul__": "*", + "__rmul__": "*", + "__truediv__": "/", + "__rtruediv__": "/", + "__floordiv__": "//", + "__rfloordiv__": "//", + "__ge__": ">=", + "__gt__": ">", + "__le__": "<=", + "__lt__": "<", + "__eq__": "==", + "__ne__": "!=", + "__mod__": "%", +} + + def parse_args(df: DataFrame, *args: Any) -> tuple[Series]: return tuple([x._func(df) if isinstance(x, Expr) else x for x in args]) @@ -29,8 +50,11 @@ def parse_kwargs(df: DataFrame, **kwargs: Any) -> dict[Hashable, Series]: class Expr: - def __init__(self, func: Callable[[DataFrame], Any]) -> None: + def __init__( + self, func: Callable[[DataFrame], Any], repr_str: str | None = None + ) -> None: self._func = func + self._repr_str = repr_str def __call__(self, df: DataFrame) -> Series: result = self._func(df) @@ -43,9 +67,22 @@ def __call__(self, df: DataFrame) -> Series: return result def _with_binary_op(self, op: str, other: Any) -> Expr: + op_symbol = OP_SYMBOLS.get(op, op) + if isinstance(other, Expr): - return Expr(lambda df: getattr(self._func(df), op)(other._func(df))) - return Expr(lambda df: getattr(self._func(df), op)(other)) + if op.startswith("__r"): + repr_str = f"({other._repr_str} {op_symbol} {self._repr_str})" + else: + repr_str = f"({self._repr_str} {op_symbol} {other._repr_str})" + return Expr( + lambda df: getattr(self._func(df), op)(other._func(df)), repr_str + ) + else: + if op.startswith("__r"): + repr_str = f"({other!r} {op_symbol} {self._repr_str})" + else: + repr_str = f"({self._repr_str} {op_symbol} {other!r})" + return Expr(lambda df: getattr(self._func(df), op)(other), repr_str) # Binary ops def __add__(self, other: Any) -> Expr: @@ -106,7 +143,31 @@ def func(df: DataFrame, *args: Any, **kwargs: Any) -> Any: parsed_kwargs = parse_kwargs(df, **kwargs) return getattr(self(df), attr)(*parsed_args, **parsed_kwargs) - return lambda *args, **kwargs: Expr(lambda df: func(df, *args, **kwargs)) + def wrapper(*args: Any, **kwargs: Any) -> Expr: + # Create a readable representation for method calls + args_repr = ", ".join( + repr(arg._repr_str if isinstance(arg, Expr) else arg) for arg in args + ) + kwargs_repr = ", ".join( + f"{k}={v._repr_str if isinstance(v, Expr) else v!r}" + for k, v in kwargs.items() + ) + + all_args = [] + if args_repr: + all_args.append(args_repr) + if kwargs_repr: + all_args.append(kwargs_repr) + + args_str = ", ".join(all_args) + repr_str = f"{self._repr_str}.{attr}({args_str})" + + return Expr(lambda df: func(df, *args, **kwargs), repr_str) + + return wrapper + + def __repr__(self) -> str: + return self._repr_str or "Expr(...)" # Namespaces @property @@ -135,14 +196,16 @@ def struct(self) -> NamespaceExpr: class NamespaceExpr: - def __init__(self, func: Callable[[DataFrame], Any], namespace: str) -> None: + def __init__(self, func: Expr, namespace: str) -> None: self._func = func self._namespace = namespace def __getattr__(self, attr: str) -> Any: if isinstance(getattr(getattr(Series, self._namespace), attr), property): + repr_str = f"{self._func._repr_str}.{self._namespace}.{attr}" return Expr( - lambda df: getattr(getattr(self._func(df), self._namespace), attr) + lambda df: getattr(getattr(self._func(df), self._namespace), attr), + repr_str, ) def func(df: DataFrame, *args: Any, **kwargs: Any) -> Any: @@ -152,11 +215,32 @@ def func(df: DataFrame, *args: Any, **kwargs: Any) -> Any: *parsed_args, **parsed_kwargs ) - return lambda *args, **kwargs: Expr(lambda df: func(df, *args, **kwargs)) + def wrapper(*args: Any, **kwargs: Any) -> Expr: + # Create a readable representation for namespace method calls + args_repr = ", ".join( + repr(arg._repr_str if isinstance(arg, Expr) else arg) for arg in args + ) + kwargs_repr = ", ".join( + f"{k}={v._repr_str if isinstance(v, Expr) else v!r}" + for k, v in kwargs.items() + ) + + all_args = [] + if args_repr: + all_args.append(args_repr) + if kwargs_repr: + all_args.append(kwargs_repr) + + args_str = ", ".join(all_args) + repr_str = f"{self._func._repr_str}.{self._namespace}.{attr}({args_str})" + + return Expr(lambda df: func(df, *args, **kwargs), repr_str) + + return wrapper def col(col_name: Hashable) -> Expr: if not isinstance(col_name, Hashable): msg = f"Expected Hashable, got: {type(col_name)}" raise TypeError(msg) - return Expr(lambda df: df[col_name]) + return Expr(lambda df: df[col_name], f"col({col_name!r})") From 9e4e0c5000c41c074074e91bac369e8a38e14104 Mon Sep 17 00:00:00 2001 From: Marco Gorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Sat, 16 Aug 2025 15:52:47 +0100 Subject: [PATCH 05/30] improve error message --- pandas/core/col.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/pandas/core/col.py b/pandas/core/col.py index 4bb41f019a9ab..a9e17538a9c8b 100644 --- a/pandas/core/col.py +++ b/pandas/core/col.py @@ -243,4 +243,14 @@ def col(col_name: Hashable) -> Expr: if not isinstance(col_name, Hashable): msg = f"Expected Hashable, got: {type(col_name)}" raise TypeError(msg) - return Expr(lambda df: df[col_name], f"col({col_name!r})") + + def func(df: DataFrame) -> Series: + if col_name not in df.columns: + msg = ( + f"Column '{col_name}' not found in given DataFrame.\n\n" + f"Hint: did you mean one of {df.columns.tolist()} instead?" + ) + raise ValueError(msg) + return df[col_name] + + return Expr(func) From fe78aa249390300c7838e06c37c156c79d3f56f3 Mon Sep 17 00:00:00 2001 From: Marco Gorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Sat, 16 Aug 2025 17:10:04 +0100 Subject: [PATCH 06/30] test repr --- pandas/core/col.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/pandas/core/col.py b/pandas/core/col.py index a9e17538a9c8b..d37888ae53a6d 100644 --- a/pandas/core/col.py +++ b/pandas/core/col.py @@ -28,13 +28,14 @@ "__rtruediv__": "/", "__floordiv__": "//", "__rfloordiv__": "//", + "__mod__": "%", + "__rmod__": "%", "__ge__": ">=", "__gt__": ">", "__le__": "<=", "__lt__": "<", "__eq__": "==", "__ne__": "!=", - "__mod__": "%", } @@ -136,6 +137,9 @@ def __ne__(self, other: object) -> Expr: # type: ignore[override] def __mod__(self, other: Any) -> Expr: return self._with_binary_op("__mod__", other) + def __rmod__(self, other: Any) -> Expr: + return self._with_binary_op("__rmod__", other) + # Everything else def __getattr__(self, attr: str, /) -> Callable[..., Expr]: def func(df: DataFrame, *args: Any, **kwargs: Any) -> Any: @@ -253,4 +257,4 @@ def func(df: DataFrame) -> Series: raise ValueError(msg) return df[col_name] - return Expr(func) + return Expr(func, f"col({col_name!r})") From 04044af08120f85095e8fbab2c22403972e30204 Mon Sep 17 00:00:00 2001 From: Marco Gorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Sat, 16 Aug 2025 17:16:18 +0100 Subject: [PATCH 07/30] test namespaces --- pandas/core/col.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/core/col.py b/pandas/core/col.py index d37888ae53a6d..bd9068599007e 100644 --- a/pandas/core/col.py +++ b/pandas/core/col.py @@ -17,6 +17,7 @@ from pandas import DataFrame +# Used only for generating the str repr of expressions. OP_SYMBOLS = { "__add__": "+", "__radd__": "+", From a95aeb425de70563f3662613767ecf23a3a384de Mon Sep 17 00:00:00 2001 From: Marco Gorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Sat, 16 Aug 2025 17:30:43 +0100 Subject: [PATCH 08/30] docs --- pandas/__init__.py | 3 +- pandas/core/col.py | 88 +++++++++++++++++++++++++++++++--------------- 2 files changed, 62 insertions(+), 29 deletions(-) diff --git a/pandas/__init__.py b/pandas/__init__.py index cc786d1141c48..2eb0def1e7ce7 100644 --- a/pandas/__init__.py +++ b/pandas/__init__.py @@ -105,7 +105,7 @@ Series, DataFrame, ) -from pandas.core.col import col +from pandas.core.col import col, Expr from pandas.core.dtypes.dtypes import SparseDtype @@ -247,6 +247,7 @@ "DatetimeTZDtype", "ExcelFile", "ExcelWriter", + "Expr", "Flags", "Float32Dtype", "Float64Dtype", diff --git a/pandas/core/col.py b/pandas/core/col.py index bd9068599007e..f5c94a704ad8e 100644 --- a/pandas/core/col.py +++ b/pandas/core/col.py @@ -9,8 +9,6 @@ Any, ) -from pandas.core.dtypes.common import is_scalar - from pandas.core.series import Series if TYPE_CHECKING: @@ -18,7 +16,7 @@ # Used only for generating the str repr of expressions. -OP_SYMBOLS = { +_OP_SYMBOLS = { "__add__": "+", "__radd__": "+", "__sub__": "-", @@ -40,51 +38,49 @@ } -def parse_args(df: DataFrame, *args: Any) -> tuple[Series]: - return tuple([x._func(df) if isinstance(x, Expr) else x for x in args]) +def _parse_args(df: DataFrame, *args: Any) -> tuple[Series]: + # Parse `args`, evaluating any expressions we encounter. + return tuple([x(df) if isinstance(x, Expr) else x for x in args]) -def parse_kwargs(df: DataFrame, **kwargs: Any) -> dict[Hashable, Series]: +def _parse_kwargs(df: DataFrame, **kwargs: Any) -> dict[Hashable, Series]: + # Parse `kwargs`, evaluating any expressions we encounter. return { - key: val._func(df) if isinstance(val, Expr) else val - for key, val in kwargs.items() + key: val(df) if isinstance(val, Expr) else val for key, val in kwargs.items() } class Expr: + """ + Class representing a deferred column. + + This is not meant to be instantiated directly. Instead, use :meth:`pandas.col`. + """ + def __init__( self, func: Callable[[DataFrame], Any], repr_str: str | None = None ) -> None: self._func = func self._repr_str = repr_str - def __call__(self, df: DataFrame) -> Series: - result = self._func(df) - if not (isinstance(result, Series) or is_scalar(result)): - msg = ( - "Expected function which returns Series or scalar, " - f"got function which returns: {type(result)}" - ) - raise TypeError(msg) - return result + def __call__(self, df: DataFrame) -> Any: + return self._func(df) def _with_binary_op(self, op: str, other: Any) -> Expr: - op_symbol = OP_SYMBOLS.get(op, op) + op_symbol = _OP_SYMBOLS.get(op, op) if isinstance(other, Expr): if op.startswith("__r"): repr_str = f"({other._repr_str} {op_symbol} {self._repr_str})" else: repr_str = f"({self._repr_str} {op_symbol} {other._repr_str})" - return Expr( - lambda df: getattr(self._func(df), op)(other._func(df)), repr_str - ) + return Expr(lambda df: getattr(self(df), op)(other(df)), repr_str) else: if op.startswith("__r"): repr_str = f"({other!r} {op_symbol} {self._repr_str})" else: repr_str = f"({self._repr_str} {op_symbol} {other!r})" - return Expr(lambda df: getattr(self._func(df), op)(other), repr_str) + return Expr(lambda df: getattr(self(df), op)(other), repr_str) # Binary ops def __add__(self, other: Any) -> Expr: @@ -144,8 +140,8 @@ def __rmod__(self, other: Any) -> Expr: # Everything else def __getattr__(self, attr: str, /) -> Callable[..., Expr]: def func(df: DataFrame, *args: Any, **kwargs: Any) -> Any: - parsed_args = parse_args(df, *args) - parsed_kwargs = parse_kwargs(df, **kwargs) + parsed_args = _parse_args(df, *args) + parsed_kwargs = _parse_kwargs(df, **kwargs) return getattr(self(df), attr)(*parsed_args, **parsed_kwargs) def wrapper(*args: Any, **kwargs: Any) -> Expr: @@ -205,18 +201,21 @@ def __init__(self, func: Expr, namespace: str) -> None: self._func = func self._namespace = namespace + def __call__(self, df: DataFrame) -> Any: + return self._func(df) + def __getattr__(self, attr: str) -> Any: if isinstance(getattr(getattr(Series, self._namespace), attr), property): repr_str = f"{self._func._repr_str}.{self._namespace}.{attr}" return Expr( - lambda df: getattr(getattr(self._func(df), self._namespace), attr), + lambda df: getattr(getattr(self(df), self._namespace), attr), repr_str, ) def func(df: DataFrame, *args: Any, **kwargs: Any) -> Any: - parsed_args = parse_args(df, *args) - parsed_kwargs = parse_kwargs(df, **kwargs) - return getattr(getattr(self._func(df), self._namespace), attr)( + parsed_args = _parse_args(df, *args) + parsed_kwargs = _parse_kwargs(df, **kwargs) + return getattr(getattr(self(df), self._namespace), attr)( *parsed_args, **parsed_kwargs ) @@ -245,6 +244,39 @@ def wrapper(*args: Any, **kwargs: Any) -> Expr: def col(col_name: Hashable) -> Expr: + """ + Generate deferred objected representing a dataframe's column. + + Any place which accepts ``lambda df: df[col_name]``, such as + :meth:`DataFrame.assign` or :meth:`DataFrame.loc`, can also accept + ``pd.col(col_name)``. + + Arguments + --------- + col_name : Hashable + Column name. + + Returns + ------- + Expr + + Examples + -------- + + You can use `col` in `assign`. + + >>> df = pd.DataFrame({"name": ["beluga", "narwhal"], "speed": [100, 110]}) + >>> df.assign(name_titlecase=pd.col("name").str.title()) + name speed name_titlecase + 0 beluga 100 Beluga + 1 narwhal 110 Narwhal + + You can also use it for filtering. + + >>> df.loc[pd.col("speed") > 105] + name speed + 1 narwhal 110 + """ if not isinstance(col_name, Hashable): msg = f"Expected Hashable, got: {type(col_name)}" raise TypeError(msg) From 4dc8e55e378766dfb7d3672c338ef8ef3bd8a89d Mon Sep 17 00:00:00 2001 From: Marco Gorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Sat, 16 Aug 2025 17:39:04 +0100 Subject: [PATCH 09/30] reference in dsintro --- doc/source/user_guide/dsintro.rst | 6 ++++++ doc/source/whatsnew/v3.0.0.rst | 19 +++++++++++++++++-- pandas/core/frame.py | 7 +++++++ 3 files changed, 30 insertions(+), 2 deletions(-) diff --git a/doc/source/user_guide/dsintro.rst b/doc/source/user_guide/dsintro.rst index 89981786d60b5..919dafb291b86 100644 --- a/doc/source/user_guide/dsintro.rst +++ b/doc/source/user_guide/dsintro.rst @@ -553,6 +553,12 @@ a function of one argument to be evaluated on the DataFrame being assigned to. iris.assign(sepal_ratio=lambda x: (x["SepalWidth"] / x["SepalLength"])).head() +or, using :meth:`pandas.col`: + +.. ipython:: python + + iris.assign(sepal_ratio=pd.col("SepalWidth") / pd.col("SepalLength")).head() + :meth:`~pandas.DataFrame.assign` **always** returns a copy of the data, leaving the original DataFrame untouched. diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 7eb7605a47aa1..5f816ed2b1eee 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -16,8 +16,23 @@ Enhancements .. _whatsnew_300.enhancements.enhancement1: -Enhancement1 -^^^^^^^^^^^^ +``pd.col`` syntax can now be used in :meth:`DataFrame.assign` and :meth:`DataFrame.loc`. +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +You can now use ``pd.col`` to create callables for use in dataframe methods which accept them. For example, if you have a dataframe + +.. ipython:: python + df = pd.DataFrame({'a': [1, 1, 2], 'b': [4, 5, 6]) + +and you want to create a new column ``'c'`` by summing ``'a'`` and ``'b'``, then instead of + +.. ipython:: python + df.assign(c = lambda df: df['a'] + df['b']) + +you can now write: + +.. ipython:: python + df.assign(c = pd.col('a') + pd.col('b')) .. _whatsnew_300.enhancements.enhancement2: diff --git a/pandas/core/frame.py b/pandas/core/frame.py index ec8c8116e5aee..b95dba1694ca0 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -5304,6 +5304,13 @@ def assign(self, **kwargs) -> DataFrame: Portland 17.0 62.6 Berkeley 25.0 77.0 + or by using :meth:`pandas.col`: + + >>> df.assign(temp_f=pd.col("temp_c") * 9 / 5 + 32) + temp_c temp_f + Portland 17.0 62.6 + Berkeley 25.0 77.0 + You can create multiple columns within the same assign where one of the columns depends on another one defined within the same assign: From e2aeb4f3cdf142196a390ed984aa19d909f2574e Mon Sep 17 00:00:00 2001 From: Marco Gorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Sat, 16 Aug 2025 17:50:23 +0100 Subject: [PATCH 10/30] fixup link --- doc/source/whatsnew/v3.0.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index d127ff6987be4..49cdd3289d9f1 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -117,7 +117,7 @@ process in more detail. `PDEP-7: Consistent copy/view semantics in pandas with Copy-on-Write `__ -.. _whatsnew_300.enhancements.enhancement2: +.. _whatsnew_300.enhancements.col: ``pd.col`` syntax can now be used in :meth:`DataFrame.assign` and :meth:`DataFrame.loc`. ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ From fa3e7931f68488112da6bf294bddf5466a58f5b9 Mon Sep 17 00:00:00 2001 From: Marco Gorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Sat, 16 Aug 2025 18:17:25 +0100 Subject: [PATCH 11/30] fixup docs --- doc/source/whatsnew/v3.0.0.rst | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 49cdd3289d9f1..372e93b216e26 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -119,22 +119,25 @@ process in more detail. .. _whatsnew_300.enhancements.col: -``pd.col`` syntax can now be used in :meth:`DataFrame.assign` and :meth:`DataFrame.loc`. -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +``pd.col`` syntax can now be used in :meth:`DataFrame.assign` and :meth:`DataFrame.loc` +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ You can now use ``pd.col`` to create callables for use in dataframe methods which accept them. For example, if you have a dataframe .. ipython:: python - df = pd.DataFrame({'a': [1, 1, 2], 'b': [4, 5, 6]) + + df = pd.DataFrame({'a': [1, 1, 2], 'b': [4, 5, 6]}) and you want to create a new column ``'c'`` by summing ``'a'`` and ``'b'``, then instead of .. ipython:: python + df.assign(c = lambda df: df['a'] + df['b']) you can now write: .. ipython:: python + df.assign(c = pd.col('a') + pd.col('b')) New Deprecation Policy From 0bc918a4d11046295dcbbcbc9a77706c430c5de7 Mon Sep 17 00:00:00 2001 From: Marco Gorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Sat, 16 Aug 2025 18:19:11 +0100 Subject: [PATCH 12/30] fixup --- pandas/tests/api/test_api.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/tests/api/test_api.py b/pandas/tests/api/test_api.py index f5ba784470072..496d29c74aa0c 100644 --- a/pandas/tests/api/test_api.py +++ b/pandas/tests/api/test_api.py @@ -63,6 +63,7 @@ class TestPDApi(Base): "DatetimeIndex", "ExcelFile", "ExcelWriter", + "Expr", "Flags", "Grouper", "HDFStore", From a0939f926fc6d097f3422e6848546f95d0ce21f4 Mon Sep 17 00:00:00 2001 From: Marco Gorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Sat, 16 Aug 2025 19:44:40 +0100 Subject: [PATCH 13/30] add test file --- pandas/tests/test_col.py | 67 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 67 insertions(+) create mode 100644 pandas/tests/test_col.py diff --git a/pandas/tests/test_col.py b/pandas/tests/test_col.py new file mode 100644 index 0000000000000..d9292f15d7b7c --- /dev/null +++ b/pandas/tests/test_col.py @@ -0,0 +1,67 @@ +from datetime import datetime + +import pytest + +import pandas as pd +import pandas._testing as tm + + +@pytest.mark.parametrize( + ("expr", "expected_values", "expected_str"), + [ + (pd.col("a"), [1, 2], "col('a')"), + (pd.col("a") * 2, [2, 4], "(col('a') * 2)"), + (pd.col("a").sum(), [3, 3], "col('a').sum()"), + (pd.col("a") + 1, [2, 3], "(col('a') + 1)"), + (1 + pd.col("a"), [2, 3], "(1 + col('a'))"), + (pd.col("a") - 1, [0, 1], "(col('a') - 1)"), + (1 - pd.col("a"), [0, -1], "(1 - col('a'))"), + (pd.col("a") * 1, [1, 2], "(col('a') * 1)"), + (1 * pd.col("a"), [1, 2], "(1 * col('a'))"), + (pd.col("a") / 1, [1.0, 2.0], "(col('a') / 1)"), + (1 / pd.col("a"), [1.0, 0.5], "(1 / col('a'))"), + (pd.col("a") // 1, [1, 2], "(col('a') // 1)"), + (1 // pd.col("a"), [1, 0], "(1 // col('a'))"), + (pd.col("a") % 1, [0, 0], "(col('a') % 1)"), + (1 % pd.col("a"), [0, 1], "(1 % col('a'))"), + (pd.col("a") > 1, [False, True], "(col('a') > 1)"), + (pd.col("a") >= 1, [True, True], "(col('a') >= 1)"), + (pd.col("a") < 1, [False, False], "(col('a') < 1)"), + (pd.col("a") <= 1, [True, False], "(col('a') <= 1)"), + (pd.col("a") == 1, [True, False], "(col('a') == 1)"), + ], +) +def test_col_simple( + expr: pd.Expr, expected_values: list[object], expected_str: str +) -> None: + df = pd.DataFrame({"a": [1, 2], "b": [3, 4]}) + result = df.assign(c=expr) + expected = pd.DataFrame({"a": [1, 2], "b": [3, 4], "c": expected_values}) + tm.assert_frame_equal(result, expected) + assert str(expr) == expected_str + + +@pytest.mark.parametrize( + ("expr", "expected_values", "expected_str"), + [ + (pd.col("a").dt.year, [2020], "col('a').dt.year"), + (pd.col("a").dt.strftime("%B"), ["January"], "col('a').dt.strftime('%B')"), + (pd.col("b").str.upper(), ["FOO"], "col('b').str.upper()"), + ], +) +def test_namespaces( + expr: pd.Expr, expected_values: list[object], expected_str: str +) -> None: + df = pd.DataFrame({"a": [datetime(2020, 1, 1)], "b": ["foo"]}) + result = df.assign(c=expr) + expected = pd.DataFrame( + {"a": [datetime(2020, 1, 1)], "b": ["foo"], "c": expected_values} + ) + tm.assert_frame_equal(result, expected, check_dtype=False) + assert str(expr) == expected_str + + +def test_invalid() -> None: + df = pd.DataFrame({"a": [1, 2]}) + with pytest.raises(ValueError, match="did you mean"): + df.assign(c=pd.col("b").mean()) From a7039822c17b6abf0f9b329e640ce99731c4521d Mon Sep 17 00:00:00 2001 From: Marco Gorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Sun, 17 Aug 2025 10:07:31 +0100 Subject: [PATCH 14/30] simplify, support custom series extensions too --- pandas/core/col.py | 28 +++------------------------- 1 file changed, 3 insertions(+), 25 deletions(-) diff --git a/pandas/core/col.py b/pandas/core/col.py index f5c94a704ad8e..75ab0a35bbc4a 100644 --- a/pandas/core/col.py +++ b/pandas/core/col.py @@ -139,6 +139,9 @@ def __rmod__(self, other: Any) -> Expr: # Everything else def __getattr__(self, attr: str, /) -> Callable[..., Expr]: + if attr in Series._accessors: + return NamespaceExpr(self, attr) + def func(df: DataFrame, *args: Any, **kwargs: Any) -> Any: parsed_args = _parse_args(df, *args) parsed_kwargs = _parse_kwargs(df, **kwargs) @@ -170,31 +173,6 @@ def wrapper(*args: Any, **kwargs: Any) -> Expr: def __repr__(self) -> str: return self._repr_str or "Expr(...)" - # Namespaces - @property - def dt(self) -> NamespaceExpr: - return NamespaceExpr(self, "dt") - - @property - def str(self) -> NamespaceExpr: - return NamespaceExpr(self, "str") - - @property - def cat(self) -> NamespaceExpr: - return NamespaceExpr(self, "cat") - - @property - def list(self) -> NamespaceExpr: - return NamespaceExpr(self, "list") - - @property - def sparse(self) -> NamespaceExpr: - return NamespaceExpr(self, "sparse") - - @property - def struct(self) -> NamespaceExpr: - return NamespaceExpr(self, "struct") - class NamespaceExpr: def __init__(self, func: Expr, namespace: str) -> None: From 48228cc9ad7fb89559cd5a249019bc70b7e98d33 Mon Sep 17 00:00:00 2001 From: Marco Gorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Sun, 17 Aug 2025 10:10:51 +0100 Subject: [PATCH 15/30] test accessor --- pandas/tests/test_col.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/pandas/tests/test_col.py b/pandas/tests/test_col.py index d9292f15d7b7c..f6f173d182bde 100644 --- a/pandas/tests/test_col.py +++ b/pandas/tests/test_col.py @@ -4,6 +4,7 @@ import pandas as pd import pandas._testing as tm +from pandas.tests.test_register_accessor import ensure_removed @pytest.mark.parametrize( @@ -65,3 +66,20 @@ def test_invalid() -> None: df = pd.DataFrame({"a": [1, 2]}) with pytest.raises(ValueError, match="did you mean"): df.assign(c=pd.col("b").mean()) + + +def test_custom_accessor() -> None: + df = pd.DataFrame({"a": [1, 2, 3]}) + + class XYZAccessor: + def __init__(self, pandas_obj): + self._obj = pandas_obj + + def mean(self): + return self._obj.mean() + + with ensure_removed(pd.Series, "xyz"): + pd.api.extensions.register_series_accessor("xyz")(XYZAccessor) + result = df.assign(b=pd.col("a").xyz.mean()) + expected = pd.DataFrame({"a": [1, 2, 3], "b": [2.0, 2.0, 2.0]}) + tm.assert_frame_equal(result, expected) From d6f55a1ca223361b25e7d496751542cd29abf1fc Mon Sep 17 00:00:00 2001 From: Marco Gorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Sun, 17 Aug 2025 10:11:31 +0100 Subject: [PATCH 16/30] :pencil: fix typo --- pandas/core/col.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/col.py b/pandas/core/col.py index 75ab0a35bbc4a..4277022364c4c 100644 --- a/pandas/core/col.py +++ b/pandas/core/col.py @@ -223,7 +223,7 @@ def wrapper(*args: Any, **kwargs: Any) -> Expr: def col(col_name: Hashable) -> Expr: """ - Generate deferred objected representing a dataframe's column. + Generate deferred object representing a column of a `DataFrame`. Any place which accepts ``lambda df: df[col_name]``, such as :meth:`DataFrame.assign` or :meth:`DataFrame.loc`, can also accept From b2ed136611459cee05393ca889b831d899b23930 Mon Sep 17 00:00:00 2001 From: Marco Gorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Sun, 17 Aug 2025 10:34:08 +0100 Subject: [PATCH 17/30] typing --- pandas/core/col.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/col.py b/pandas/core/col.py index 4277022364c4c..3dd41eaae9f8f 100644 --- a/pandas/core/col.py +++ b/pandas/core/col.py @@ -138,7 +138,7 @@ def __rmod__(self, other: Any) -> Expr: return self._with_binary_op("__rmod__", other) # Everything else - def __getattr__(self, attr: str, /) -> Callable[..., Expr]: + def __getattr__(self, attr: str, /) -> Any: if attr in Series._accessors: return NamespaceExpr(self, attr) From c8f01934313bc5e440fa8f2ad5a136a3babeb66c Mon Sep 17 00:00:00 2001 From: Marco Gorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Sun, 17 Aug 2025 19:34:02 +0100 Subject: [PATCH 18/30] move Expr to api.typing --- pandas/__init__.py | 3 +-- pandas/api/typing/__init__.py | 2 ++ pandas/tests/test_col.py | 5 +++-- 3 files changed, 6 insertions(+), 4 deletions(-) diff --git a/pandas/__init__.py b/pandas/__init__.py index 2eb0def1e7ce7..cc786d1141c48 100644 --- a/pandas/__init__.py +++ b/pandas/__init__.py @@ -105,7 +105,7 @@ Series, DataFrame, ) -from pandas.core.col import col, Expr +from pandas.core.col import col from pandas.core.dtypes.dtypes import SparseDtype @@ -247,7 +247,6 @@ "DatetimeTZDtype", "ExcelFile", "ExcelWriter", - "Expr", "Flags", "Float32Dtype", "Float64Dtype", diff --git a/pandas/api/typing/__init__.py b/pandas/api/typing/__init__.py index c1178c72f3edc..ea5dc04329c9f 100644 --- a/pandas/api/typing/__init__.py +++ b/pandas/api/typing/__init__.py @@ -6,6 +6,7 @@ from pandas._libs.lib import NoDefault from pandas._libs.missing import NAType +from pandas.core.col import Expr from pandas.core.groupby import ( DataFrameGroupBy, SeriesGroupBy, @@ -41,6 +42,7 @@ "ExpandingGroupby", "ExponentialMovingWindow", "ExponentialMovingWindowGroupby", + "Expr", "FrozenList", "JsonReader", "NAType", diff --git a/pandas/tests/test_col.py b/pandas/tests/test_col.py index f6f173d182bde..d741fd468adeb 100644 --- a/pandas/tests/test_col.py +++ b/pandas/tests/test_col.py @@ -4,6 +4,7 @@ import pandas as pd import pandas._testing as tm +from pandas.api.typing import Expr from pandas.tests.test_register_accessor import ensure_removed @@ -33,7 +34,7 @@ ], ) def test_col_simple( - expr: pd.Expr, expected_values: list[object], expected_str: str + expr: Expr, expected_values: list[object], expected_str: str ) -> None: df = pd.DataFrame({"a": [1, 2], "b": [3, 4]}) result = df.assign(c=expr) @@ -51,7 +52,7 @@ def test_col_simple( ], ) def test_namespaces( - expr: pd.Expr, expected_values: list[object], expected_str: str + expr: Expr, expected_values: list[object], expected_str: str ) -> None: df = pd.DataFrame({"a": [datetime(2020, 1, 1)], "b": ["foo"]}) result = df.assign(c=expr) From e6ea343d979c8a9d085a243e59db3ade188ccd67 Mon Sep 17 00:00:00 2001 From: Marco Gorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Sun, 17 Aug 2025 19:40:22 +0100 Subject: [PATCH 19/30] move Expr to api/typing --- pandas/core/col.py | 3 +++ pandas/tests/api/test_api.py | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/pandas/core/col.py b/pandas/core/col.py index 3dd41eaae9f8f..eb836d03f2d8b 100644 --- a/pandas/core/col.py +++ b/pandas/core/col.py @@ -269,3 +269,6 @@ def func(df: DataFrame) -> Series: return df[col_name] return Expr(func, f"col({col_name!r})") + + +__all__ = ["Expr", "col"] diff --git a/pandas/tests/api/test_api.py b/pandas/tests/api/test_api.py index 496d29c74aa0c..d1e5ba401879c 100644 --- a/pandas/tests/api/test_api.py +++ b/pandas/tests/api/test_api.py @@ -63,7 +63,6 @@ class TestPDApi(Base): "DatetimeIndex", "ExcelFile", "ExcelWriter", - "Expr", "Flags", "Grouper", "HDFStore", @@ -262,6 +261,7 @@ class TestApi(Base): "ExpandingGroupby", "ExponentialMovingWindow", "ExponentialMovingWindowGroupby", + "Expr", "FrozenList", "JsonReader", "NaTType", From 96990d63eb983f2707e50d12941c08b55d401d3f Mon Sep 17 00:00:00 2001 From: Marco Gorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Sun, 17 Aug 2025 19:44:41 +0100 Subject: [PATCH 20/30] rename Expr to Expression --- pandas/api/typing/__init__.py | 4 +- pandas/core/col.py | 83 ++++++++++++++++++----------------- pandas/tests/api/test_api.py | 2 +- pandas/tests/test_col.py | 6 +-- 4 files changed, 49 insertions(+), 46 deletions(-) diff --git a/pandas/api/typing/__init__.py b/pandas/api/typing/__init__.py index ea5dc04329c9f..de6657b58ee80 100644 --- a/pandas/api/typing/__init__.py +++ b/pandas/api/typing/__init__.py @@ -6,7 +6,7 @@ from pandas._libs.lib import NoDefault from pandas._libs.missing import NAType -from pandas.core.col import Expr +from pandas.core.col import Expression from pandas.core.groupby import ( DataFrameGroupBy, SeriesGroupBy, @@ -42,7 +42,7 @@ "ExpandingGroupby", "ExponentialMovingWindow", "ExponentialMovingWindowGroupby", - "Expr", + "Expression", "FrozenList", "JsonReader", "NAType", diff --git a/pandas/core/col.py b/pandas/core/col.py index eb836d03f2d8b..3e17c89a1b7af 100644 --- a/pandas/core/col.py +++ b/pandas/core/col.py @@ -40,17 +40,18 @@ def _parse_args(df: DataFrame, *args: Any) -> tuple[Series]: # Parse `args`, evaluating any expressions we encounter. - return tuple([x(df) if isinstance(x, Expr) else x for x in args]) + return tuple([x(df) if isinstance(x, Expression) else x for x in args]) def _parse_kwargs(df: DataFrame, **kwargs: Any) -> dict[Hashable, Series]: # Parse `kwargs`, evaluating any expressions we encounter. return { - key: val(df) if isinstance(val, Expr) else val for key, val in kwargs.items() + key: val(df) if isinstance(val, Expression) else val + for key, val in kwargs.items() } -class Expr: +class Expression: """ Class representing a deferred column. @@ -66,94 +67,95 @@ def __init__( def __call__(self, df: DataFrame) -> Any: return self._func(df) - def _with_binary_op(self, op: str, other: Any) -> Expr: + def _with_binary_op(self, op: str, other: Any) -> Expression: op_symbol = _OP_SYMBOLS.get(op, op) - if isinstance(other, Expr): + if isinstance(other, Expression): if op.startswith("__r"): repr_str = f"({other._repr_str} {op_symbol} {self._repr_str})" else: repr_str = f"({self._repr_str} {op_symbol} {other._repr_str})" - return Expr(lambda df: getattr(self(df), op)(other(df)), repr_str) + return Expression(lambda df: getattr(self(df), op)(other(df)), repr_str) else: if op.startswith("__r"): repr_str = f"({other!r} {op_symbol} {self._repr_str})" else: repr_str = f"({self._repr_str} {op_symbol} {other!r})" - return Expr(lambda df: getattr(self(df), op)(other), repr_str) + return Expression(lambda df: getattr(self(df), op)(other), repr_str) # Binary ops - def __add__(self, other: Any) -> Expr: + def __add__(self, other: Any) -> Expression: return self._with_binary_op("__add__", other) - def __radd__(self, other: Any) -> Expr: + def __radd__(self, other: Any) -> Expression: return self._with_binary_op("__radd__", other) - def __sub__(self, other: Any) -> Expr: + def __sub__(self, other: Any) -> Expression: return self._with_binary_op("__sub__", other) - def __rsub__(self, other: Any) -> Expr: + def __rsub__(self, other: Any) -> Expression: return self._with_binary_op("__rsub__", other) - def __mul__(self, other: Any) -> Expr: + def __mul__(self, other: Any) -> Expression: return self._with_binary_op("__mul__", other) - def __rmul__(self, other: Any) -> Expr: + def __rmul__(self, other: Any) -> Expression: return self._with_binary_op("__rmul__", other) - def __truediv__(self, other: Any) -> Expr: + def __truediv__(self, other: Any) -> Expression: return self._with_binary_op("__truediv__", other) - def __rtruediv__(self, other: Any) -> Expr: + def __rtruediv__(self, other: Any) -> Expression: return self._with_binary_op("__rtruediv__", other) - def __floordiv__(self, other: Any) -> Expr: + def __floordiv__(self, other: Any) -> Expression: return self._with_binary_op("__floordiv__", other) - def __rfloordiv__(self, other: Any) -> Expr: + def __rfloordiv__(self, other: Any) -> Expression: return self._with_binary_op("__rfloordiv__", other) - def __ge__(self, other: Any) -> Expr: + def __ge__(self, other: Any) -> Expression: return self._with_binary_op("__ge__", other) - def __gt__(self, other: Any) -> Expr: + def __gt__(self, other: Any) -> Expression: return self._with_binary_op("__gt__", other) - def __le__(self, other: Any) -> Expr: + def __le__(self, other: Any) -> Expression: return self._with_binary_op("__le__", other) - def __lt__(self, other: Any) -> Expr: + def __lt__(self, other: Any) -> Expression: return self._with_binary_op("__lt__", other) - def __eq__(self, other: object) -> Expr: # type: ignore[override] + def __eq__(self, other: object) -> Expression: # type: ignore[override] return self._with_binary_op("__eq__", other) - def __ne__(self, other: object) -> Expr: # type: ignore[override] + def __ne__(self, other: object) -> Expression: # type: ignore[override] return self._with_binary_op("__ne__", other) - def __mod__(self, other: Any) -> Expr: + def __mod__(self, other: Any) -> Expression: return self._with_binary_op("__mod__", other) - def __rmod__(self, other: Any) -> Expr: + def __rmod__(self, other: Any) -> Expression: return self._with_binary_op("__rmod__", other) # Everything else def __getattr__(self, attr: str, /) -> Any: if attr in Series._accessors: - return NamespaceExpr(self, attr) + return NamespaceExpression(self, attr) def func(df: DataFrame, *args: Any, **kwargs: Any) -> Any: parsed_args = _parse_args(df, *args) parsed_kwargs = _parse_kwargs(df, **kwargs) return getattr(self(df), attr)(*parsed_args, **parsed_kwargs) - def wrapper(*args: Any, **kwargs: Any) -> Expr: + def wrapper(*args: Any, **kwargs: Any) -> Expression: # Create a readable representation for method calls args_repr = ", ".join( - repr(arg._repr_str if isinstance(arg, Expr) else arg) for arg in args + repr(arg._repr_str if isinstance(arg, Expression) else arg) + for arg in args ) kwargs_repr = ", ".join( - f"{k}={v._repr_str if isinstance(v, Expr) else v!r}" + f"{k}={v._repr_str if isinstance(v, Expression) else v!r}" for k, v in kwargs.items() ) @@ -166,7 +168,7 @@ def wrapper(*args: Any, **kwargs: Any) -> Expr: args_str = ", ".join(all_args) repr_str = f"{self._repr_str}.{attr}({args_str})" - return Expr(lambda df: func(df, *args, **kwargs), repr_str) + return Expression(lambda df: func(df, *args, **kwargs), repr_str) return wrapper @@ -174,8 +176,8 @@ def __repr__(self) -> str: return self._repr_str or "Expr(...)" -class NamespaceExpr: - def __init__(self, func: Expr, namespace: str) -> None: +class NamespaceExpression: + def __init__(self, func: Expression, namespace: str) -> None: self._func = func self._namespace = namespace @@ -185,7 +187,7 @@ def __call__(self, df: DataFrame) -> Any: def __getattr__(self, attr: str) -> Any: if isinstance(getattr(getattr(Series, self._namespace), attr), property): repr_str = f"{self._func._repr_str}.{self._namespace}.{attr}" - return Expr( + return Expression( lambda df: getattr(getattr(self(df), self._namespace), attr), repr_str, ) @@ -197,13 +199,14 @@ def func(df: DataFrame, *args: Any, **kwargs: Any) -> Any: *parsed_args, **parsed_kwargs ) - def wrapper(*args: Any, **kwargs: Any) -> Expr: + def wrapper(*args: Any, **kwargs: Any) -> Expression: # Create a readable representation for namespace method calls args_repr = ", ".join( - repr(arg._repr_str if isinstance(arg, Expr) else arg) for arg in args + repr(arg._repr_str if isinstance(arg, Expression) else arg) + for arg in args ) kwargs_repr = ", ".join( - f"{k}={v._repr_str if isinstance(v, Expr) else v!r}" + f"{k}={v._repr_str if isinstance(v, Expression) else v!r}" for k, v in kwargs.items() ) @@ -216,12 +219,12 @@ def wrapper(*args: Any, **kwargs: Any) -> Expr: args_str = ", ".join(all_args) repr_str = f"{self._func._repr_str}.{self._namespace}.{attr}({args_str})" - return Expr(lambda df: func(df, *args, **kwargs), repr_str) + return Expression(lambda df: func(df, *args, **kwargs), repr_str) return wrapper -def col(col_name: Hashable) -> Expr: +def col(col_name: Hashable) -> Expression: """ Generate deferred object representing a column of a `DataFrame`. @@ -268,7 +271,7 @@ def func(df: DataFrame) -> Series: raise ValueError(msg) return df[col_name] - return Expr(func, f"col({col_name!r})") + return Expression(func, f"col({col_name!r})") -__all__ = ["Expr", "col"] +__all__ = ["Expression", "col"] diff --git a/pandas/tests/api/test_api.py b/pandas/tests/api/test_api.py index d1e5ba401879c..2c26f77102df1 100644 --- a/pandas/tests/api/test_api.py +++ b/pandas/tests/api/test_api.py @@ -261,7 +261,7 @@ class TestApi(Base): "ExpandingGroupby", "ExponentialMovingWindow", "ExponentialMovingWindowGroupby", - "Expr", + "Expression", "FrozenList", "JsonReader", "NaTType", diff --git a/pandas/tests/test_col.py b/pandas/tests/test_col.py index d741fd468adeb..05ca654bd4461 100644 --- a/pandas/tests/test_col.py +++ b/pandas/tests/test_col.py @@ -4,7 +4,7 @@ import pandas as pd import pandas._testing as tm -from pandas.api.typing import Expr +from pandas.api.typing import Expression from pandas.tests.test_register_accessor import ensure_removed @@ -34,7 +34,7 @@ ], ) def test_col_simple( - expr: Expr, expected_values: list[object], expected_str: str + expr: Expression, expected_values: list[object], expected_str: str ) -> None: df = pd.DataFrame({"a": [1, 2], "b": [3, 4]}) result = df.assign(c=expr) @@ -52,7 +52,7 @@ def test_col_simple( ], ) def test_namespaces( - expr: Expr, expected_values: list[object], expected_str: str + expr: Expression, expected_values: list[object], expected_str: str ) -> None: df = pd.DataFrame({"a": [datetime(2020, 1, 1)], "b": ["foo"]}) result = df.assign(c=expr) From 548ee20ceb0daa06aa75d8a384c8c8b82c346d32 Mon Sep 17 00:00:00 2001 From: Marco Edward Gorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Sun, 17 Aug 2025 21:47:52 +0100 Subject: [PATCH 21/30] fix return type Co-authored-by: Irv Lustig --- pandas/core/col.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/col.py b/pandas/core/col.py index 3e17c89a1b7af..511ebb3932100 100644 --- a/pandas/core/col.py +++ b/pandas/core/col.py @@ -239,7 +239,7 @@ def col(col_name: Hashable) -> Expression: Returns ------- - Expr + `pandas.api.typing.Expression` Examples -------- From cfbd5a377ac73c07c6bd1ffc9279a9603cc22f4a Mon Sep 17 00:00:00 2001 From: Marco Gorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Tue, 19 Aug 2025 18:39:43 +0100 Subject: [PATCH 22/30] support NumPy ufuncs --- pandas/core/col.py | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/pandas/core/col.py b/pandas/core/col.py index 3e17c89a1b7af..a13f38d3fc6c2 100644 --- a/pandas/core/col.py +++ b/pandas/core/col.py @@ -138,6 +138,38 @@ def __mod__(self, other: Any) -> Expression: def __rmod__(self, other: Any) -> Expression: return self._with_binary_op("__rmod__", other) + def __array_ufunc__( + self, ufunc: Callable[..., Any], method: str, *inputs: Any, **kwargs: Any + ) -> Expression: + if method != "__call__": + msg = f"Only `__call__` ufuncs are currently supported, got: '{method}'" + raise NotImplementedError(msg) + + def func(df: DataFrame) -> Any: + parsed_inputs = _parse_args(df, *inputs) + parsed_kwargs = _parse_kwargs(df, *kwargs) + return ufunc(*parsed_inputs, **parsed_kwargs) + + inputs_repr = ", ".join( + arg._repr_str if isinstance(arg, Expression) else repr(arg) + for arg in inputs + ) + kwargs_repr = ", ".join( + f"{k}={v._repr_str if isinstance(v, Expression) else v!r}" + for k, v in kwargs.items() + ) + + all_args = [] + if inputs_repr: + all_args.append(inputs_repr) + if kwargs_repr: + all_args.append(kwargs_repr) + + args_str = ", ".join(all_args) + repr_str = f"{ufunc.__name__}({args_str})" + + return Expression(func, repr_str) + # Everything else def __getattr__(self, attr: str, /) -> Any: if attr in Series._accessors: From e74438c9f2629325676a79fdd3e02ffba2117bb7 Mon Sep 17 00:00:00 2001 From: Marco Gorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Tue, 19 Aug 2025 18:44:59 +0100 Subject: [PATCH 23/30] support NumPy ufuncs too --- pandas/core/col.py | 76 +++++++++++----------------------------- pandas/tests/test_col.py | 3 ++ 2 files changed, 24 insertions(+), 55 deletions(-) diff --git a/pandas/core/col.py b/pandas/core/col.py index a13f38d3fc6c2..f6e17febe8954 100644 --- a/pandas/core/col.py +++ b/pandas/core/col.py @@ -51,6 +51,24 @@ def _parse_kwargs(df: DataFrame, **kwargs: Any) -> dict[Hashable, Series]: } +def _pretty_print_args_kwargs(*args: Any, **kwargs: Any) -> str: + inputs_repr = ", ".join( + arg._repr_str if isinstance(arg, Expression) else repr(arg) for arg in args + ) + kwargs_repr = ", ".join( + f"{k}={v._repr_str if isinstance(v, Expression) else v!r}" + for k, v in kwargs.items() + ) + + all_args = [] + if inputs_repr: + all_args.append(inputs_repr) + if kwargs_repr: + all_args.append(kwargs_repr) + + return ", ".join(all_args) + + class Expression: """ Class representing a deferred column. @@ -141,31 +159,12 @@ def __rmod__(self, other: Any) -> Expression: def __array_ufunc__( self, ufunc: Callable[..., Any], method: str, *inputs: Any, **kwargs: Any ) -> Expression: - if method != "__call__": - msg = f"Only `__call__` ufuncs are currently supported, got: '{method}'" - raise NotImplementedError(msg) - def func(df: DataFrame) -> Any: parsed_inputs = _parse_args(df, *inputs) parsed_kwargs = _parse_kwargs(df, *kwargs) return ufunc(*parsed_inputs, **parsed_kwargs) - inputs_repr = ", ".join( - arg._repr_str if isinstance(arg, Expression) else repr(arg) - for arg in inputs - ) - kwargs_repr = ", ".join( - f"{k}={v._repr_str if isinstance(v, Expression) else v!r}" - for k, v in kwargs.items() - ) - - all_args = [] - if inputs_repr: - all_args.append(inputs_repr) - if kwargs_repr: - all_args.append(kwargs_repr) - - args_str = ", ".join(all_args) + args_str = _pretty_print_args_kwargs(*inputs, **kwargs) repr_str = f"{ufunc.__name__}({args_str})" return Expression(func, repr_str) @@ -181,23 +180,7 @@ def func(df: DataFrame, *args: Any, **kwargs: Any) -> Any: return getattr(self(df), attr)(*parsed_args, **parsed_kwargs) def wrapper(*args: Any, **kwargs: Any) -> Expression: - # Create a readable representation for method calls - args_repr = ", ".join( - repr(arg._repr_str if isinstance(arg, Expression) else arg) - for arg in args - ) - kwargs_repr = ", ".join( - f"{k}={v._repr_str if isinstance(v, Expression) else v!r}" - for k, v in kwargs.items() - ) - - all_args = [] - if args_repr: - all_args.append(args_repr) - if kwargs_repr: - all_args.append(kwargs_repr) - - args_str = ", ".join(all_args) + args_str = _pretty_print_args_kwargs(*args, **kwargs) repr_str = f"{self._repr_str}.{attr}({args_str})" return Expression(lambda df: func(df, *args, **kwargs), repr_str) @@ -232,25 +215,8 @@ def func(df: DataFrame, *args: Any, **kwargs: Any) -> Any: ) def wrapper(*args: Any, **kwargs: Any) -> Expression: - # Create a readable representation for namespace method calls - args_repr = ", ".join( - repr(arg._repr_str if isinstance(arg, Expression) else arg) - for arg in args - ) - kwargs_repr = ", ".join( - f"{k}={v._repr_str if isinstance(v, Expression) else v!r}" - for k, v in kwargs.items() - ) - - all_args = [] - if args_repr: - all_args.append(args_repr) - if kwargs_repr: - all_args.append(kwargs_repr) - - args_str = ", ".join(all_args) + args_str = _pretty_print_args_kwargs(*args, **kwargs) repr_str = f"{self._func._repr_str}.{self._namespace}.{attr}({args_str})" - return Expression(lambda df: func(df, *args, **kwargs), repr_str) return wrapper diff --git a/pandas/tests/test_col.py b/pandas/tests/test_col.py index 05ca654bd4461..d07c1e7c1a2e5 100644 --- a/pandas/tests/test_col.py +++ b/pandas/tests/test_col.py @@ -1,5 +1,6 @@ from datetime import datetime +import numpy as np import pytest import pandas as pd @@ -31,6 +32,8 @@ (pd.col("a") < 1, [False, False], "(col('a') < 1)"), (pd.col("a") <= 1, [True, False], "(col('a') <= 1)"), (pd.col("a") == 1, [True, False], "(col('a') == 1)"), + (np.log(pd.col("a")), [0.0, 0.6931471805599453], "log(col('a'))"), + (np.divide(pd.col("a"), pd.col("a")), [1.0, 1.0], "divide(col('a'), col('a'))"), ], ) def test_col_simple( From 83b70e8f201b415e447d28086f40559f678cd431 Mon Sep 17 00:00:00 2001 From: Marco Gorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Tue, 19 Aug 2025 19:04:24 +0100 Subject: [PATCH 24/30] simplify repr_str type --- pandas/core/col.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/pandas/core/col.py b/pandas/core/col.py index 3882133da6bef..3cc8cf4e063ba 100644 --- a/pandas/core/col.py +++ b/pandas/core/col.py @@ -76,9 +76,7 @@ class Expression: This is not meant to be instantiated directly. Instead, use :meth:`pandas.col`. """ - def __init__( - self, func: Callable[[DataFrame], Any], repr_str: str | None = None - ) -> None: + def __init__(self, func: Callable[[DataFrame], Any], repr_str: str) -> None: self._func = func self._repr_str = repr_str From 3b6906b8c1102fe2127e29b692fee46d0f9a50b0 Mon Sep 17 00:00:00 2001 From: Marco Gorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Tue, 19 Aug 2025 19:10:03 +0100 Subject: [PATCH 25/30] fix typing, avoid floating point inaccuracies --- pandas/core/col.py | 2 +- pandas/tests/test_col.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/col.py b/pandas/core/col.py index 3cc8cf4e063ba..94c615a2b0a03 100644 --- a/pandas/core/col.py +++ b/pandas/core/col.py @@ -43,7 +43,7 @@ def _parse_args(df: DataFrame, *args: Any) -> tuple[Series]: return tuple([x(df) if isinstance(x, Expression) else x for x in args]) -def _parse_kwargs(df: DataFrame, **kwargs: Any) -> dict[Hashable, Series]: +def _parse_kwargs(df: DataFrame, **kwargs: Any) -> dict[str, Any]: # Parse `kwargs`, evaluating any expressions we encounter. return { key: val(df) if isinstance(val, Expression) else val diff --git a/pandas/tests/test_col.py b/pandas/tests/test_col.py index d07c1e7c1a2e5..38193c7de3094 100644 --- a/pandas/tests/test_col.py +++ b/pandas/tests/test_col.py @@ -32,7 +32,7 @@ (pd.col("a") < 1, [False, False], "(col('a') < 1)"), (pd.col("a") <= 1, [True, False], "(col('a') <= 1)"), (pd.col("a") == 1, [True, False], "(col('a') == 1)"), - (np.log(pd.col("a")), [0.0, 0.6931471805599453], "log(col('a'))"), + (np.power(pd.col("a"), 2), [1, 4], "power(col('a'), 2)"), (np.divide(pd.col("a"), pd.col("a")), [1.0, 1.0], "divide(col('a'), col('a'))"), ], ) From 9fed80e592afd8dd8e109d859391a319d1e5c6ef Mon Sep 17 00:00:00 2001 From: Marco Gorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Tue, 19 Aug 2025 19:29:27 +0100 Subject: [PATCH 26/30] add to api reference --- doc/source/reference/general_functions.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/reference/general_functions.rst b/doc/source/reference/general_functions.rst index e93514de5f762..a76e51ace86d2 100644 --- a/doc/source/reference/general_functions.rst +++ b/doc/source/reference/general_functions.rst @@ -71,6 +71,7 @@ Top-level evaluation .. autosummary:: :toctree: api/ + col eval Datetime formats From edb0e3846e8c6430485c522ed3525a4955215464 Mon Sep 17 00:00:00 2001 From: Marco Gorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Tue, 19 Aug 2025 19:42:09 +0100 Subject: [PATCH 27/30] truncate output for wide dataframes --- pandas/core/col.py | 7 ++++++- pandas/tests/test_col.py | 16 +++++++++++++--- 2 files changed, 19 insertions(+), 4 deletions(-) diff --git a/pandas/core/col.py b/pandas/core/col.py index 94c615a2b0a03..c6bb96ad6ebfc 100644 --- a/pandas/core/col.py +++ b/pandas/core/col.py @@ -260,9 +260,14 @@ def col(col_name: Hashable) -> Expression: def func(df: DataFrame) -> Series: if col_name not in df.columns: + columns_list = df.columns.tolist() + if len(columns_list) > 10: + columns_hint = columns_list[:10] + ["..."] + else: + columns_hint = columns_list msg = ( f"Column '{col_name}' not found in given DataFrame.\n\n" - f"Hint: did you mean one of {df.columns.tolist()} instead?" + f"Hint: did you mean one of {columns_hint} instead?" ) raise ValueError(msg) return df[col_name] diff --git a/pandas/tests/test_col.py b/pandas/tests/test_col.py index 38193c7de3094..a2de912734962 100644 --- a/pandas/tests/test_col.py +++ b/pandas/tests/test_col.py @@ -67,9 +67,19 @@ def test_namespaces( def test_invalid() -> None: - df = pd.DataFrame({"a": [1, 2]}) - with pytest.raises(ValueError, match="did you mean"): - df.assign(c=pd.col("b").mean()) + df = pd.DataFrame({"a": [1, 2], "b": [3, 4]}) + with pytest.raises(ValueError, match=r"did you mean one of \['a', 'b'\] instead"): + df.assign(c=pd.col("c").mean()) + df = pd.DataFrame({f"col_{i}": [0] for i in range(11)}) + msg = ( + "did you mean one of " + r"\['col_0', 'col_1', 'col_2', 'col_3', " + "'col_4', 'col_5', 'col_6', 'col_7', " + r"'col_8', 'col_9', '\.\.\.'\] instead" + ) + "" + with pytest.raises(ValueError, match=msg): + df.assign(c=pd.col("c").mean()) def test_custom_accessor() -> None: From 72faba9d70e8e18b17aa38c09ef5804a054f6a60 Mon Sep 17 00:00:00 2001 From: Marco Gorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Tue, 19 Aug 2025 19:42:59 +0100 Subject: [PATCH 28/30] make `max_cols` variable --- pandas/core/col.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pandas/core/col.py b/pandas/core/col.py index c6bb96ad6ebfc..a104f026fd8ba 100644 --- a/pandas/core/col.py +++ b/pandas/core/col.py @@ -261,8 +261,9 @@ def col(col_name: Hashable) -> Expression: def func(df: DataFrame) -> Series: if col_name not in df.columns: columns_list = df.columns.tolist() - if len(columns_list) > 10: - columns_hint = columns_list[:10] + ["..."] + max_cols = 10 + if len(columns_list) > max_cols: + columns_hint = columns_list[:max_cols] + ["..."] else: columns_hint = columns_list msg = ( From b6f49616f2d5ebdfaf6b9cf32b2b5e4d1d7b3e8c Mon Sep 17 00:00:00 2001 From: Marco Gorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Tue, 19 Aug 2025 20:04:09 +0100 Subject: [PATCH 29/30] truncate based on message length rather than number of columns --- pandas/core/col.py | 20 ++++++++++---------- pandas/tests/test_col.py | 2 +- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/pandas/core/col.py b/pandas/core/col.py index a104f026fd8ba..ca72950e1feba 100644 --- a/pandas/core/col.py +++ b/pandas/core/col.py @@ -222,20 +222,21 @@ def wrapper(*args: Any, **kwargs: Any) -> Expression: def col(col_name: Hashable) -> Expression: """ - Generate deferred object representing a column of a `DataFrame`. + Generate deferred object representing a column of a DataFrame. Any place which accepts ``lambda df: df[col_name]``, such as :meth:`DataFrame.assign` or :meth:`DataFrame.loc`, can also accept ``pd.col(col_name)``. - Arguments - --------- + Parameters + ---------- col_name : Hashable Column name. Returns ------- `pandas.api.typing.Expression` + A deferred object representing a column of a DataFrame. Examples -------- @@ -260,15 +261,14 @@ def col(col_name: Hashable) -> Expression: def func(df: DataFrame) -> Series: if col_name not in df.columns: - columns_list = df.columns.tolist() - max_cols = 10 - if len(columns_list) > max_cols: - columns_hint = columns_list[:max_cols] + ["..."] - else: - columns_hint = columns_list + columns_str = str(df.columns.tolist()) + max_len = 90 + if len(columns_str) > max_len: + columns_str = columns_str[:max_len] + "...]" + msg = ( f"Column '{col_name}' not found in given DataFrame.\n\n" - f"Hint: did you mean one of {columns_hint} instead?" + f"Hint: did you mean one of {columns_str} instead?" ) raise ValueError(msg) return df[col_name] diff --git a/pandas/tests/test_col.py b/pandas/tests/test_col.py index a2de912734962..c884540abfed0 100644 --- a/pandas/tests/test_col.py +++ b/pandas/tests/test_col.py @@ -75,7 +75,7 @@ def test_invalid() -> None: "did you mean one of " r"\['col_0', 'col_1', 'col_2', 'col_3', " "'col_4', 'col_5', 'col_6', 'col_7', " - r"'col_8', 'col_9', '\.\.\.'\] instead" + r"'col_8', 'col_9',\.\.\.\] instead" ) "" with pytest.raises(ValueError, match=msg): From 3791cf63fcd6b50986a897acdc76f9befde1da49 Mon Sep 17 00:00:00 2001 From: Marco Gorelli <33491632+MarcoGorelli@users.noreply.github.com> Date: Tue, 19 Aug 2025 20:06:18 +0100 Subject: [PATCH 30/30] fixup docstring --- pandas/core/col.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/pandas/core/col.py b/pandas/core/col.py index ca72950e1feba..eec1d241df92d 100644 --- a/pandas/core/col.py +++ b/pandas/core/col.py @@ -238,6 +238,10 @@ def col(col_name: Hashable) -> Expression: `pandas.api.typing.Expression` A deferred object representing a column of a DataFrame. + See Also + -------- + DataFrame.query : Query columns of a dataframe using string expressions. + Examples --------