diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index feaea311db8..d64523a4a9f 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -24,13 +24,13 @@ repos: - id: rst-inline-touching-normal - id: text-unicode-replacement-char - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.14.3 + rev: v0.14.6 hooks: - id: ruff-check args: ["--fix", "--show-fixes"] - id: ruff-format - repo: https://github.com/keewis/blackdoc - rev: v0.4.5 + rev: v0.4.6 hooks: - id: blackdoc exclude: "generate_aggregations.py" @@ -76,6 +76,6 @@ repos: - id: validate-pyproject additional_dependencies: ["validate-pyproject-schema-store[all]"] - repo: https://github.com/adhtruong/mirrors-typos - rev: v1.39.0 + rev: v1.39.2 hooks: - id: typos diff --git a/doc/api/dataarray.rst b/doc/api/dataarray.rst index 9d4e81c8677..8e4c2e77e11 100644 --- a/doc/api/dataarray.rst +++ b/doc/api/dataarray.rst @@ -162,6 +162,7 @@ Aggregation DataArray.min DataArray.mean DataArray.median + DataArray.nunique DataArray.prod DataArray.sum DataArray.std diff --git a/doc/api/dataset.rst b/doc/api/dataset.rst index 733c9768d2f..0c8e1e49679 100644 --- a/doc/api/dataset.rst +++ b/doc/api/dataset.rst @@ -169,6 +169,7 @@ Aggregation Dataset.min Dataset.mean Dataset.median + Dataset.nunique Dataset.prod Dataset.sum Dataset.std diff --git a/doc/api/datatree.rst b/doc/api/datatree.rst index 8501440b7d7..487e47c5927 100644 --- a/doc/api/datatree.rst +++ b/doc/api/datatree.rst @@ -266,6 +266,7 @@ Aggregate data in all nodes in the subtree simultaneously. DataTree.min DataTree.mean DataTree.median + DataTree.nunique DataTree.prod DataTree.sum DataTree.std diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 677b2194a55..fe3b4f58cc0 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -22,6 +22,10 @@ New Features - :py:func:`combine_nested` now supports :py:class:`DataTree` objects (:pull:`10849`). By `Stephan Hoyer `_. +- Add a ``nunique`` reduction method (:issue:`9548`), which behaves like + :py:meth:`pandas.DataFrame.nunique` applied along specific dimensions. + By `Ewan Short `_. + Breaking Changes ~~~~~~~~~~~~~~~~ diff --git a/xarray/core/_aggregations.py b/xarray/core/_aggregations.py index adc064840de..ff5576bcbae 100644 --- a/xarray/core/_aggregations.py +++ b/xarray/core/_aggregations.py @@ -513,7 +513,7 @@ def mean( Notes ----- - Non-numeric variables will be removed prior to reducing. + Non-numeric variables will be removed prior to reducing. datetime64 and timedelta64 dtypes are treated as numeric for aggregation operations. Examples -------- @@ -618,7 +618,7 @@ def prod( Notes ----- - Non-numeric variables will be removed prior to reducing. + Non-numeric variables will be removed prior to reducing. datetime64 and timedelta64 dtypes are treated as numeric for aggregation operations. Examples -------- @@ -733,7 +733,7 @@ def sum( Notes ----- - Non-numeric variables will be removed prior to reducing. + Non-numeric variables will be removed prior to reducing. datetime64 and timedelta64 dtypes are treated as numeric for aggregation operations. Examples -------- @@ -845,7 +845,7 @@ def std( Notes ----- - Non-numeric variables will be removed prior to reducing. + Non-numeric variables will be removed prior to reducing. datetime64 and timedelta64 dtypes are treated as numeric for aggregation operations. Examples -------- @@ -957,7 +957,7 @@ def var( Notes ----- - Non-numeric variables will be removed prior to reducing. + Non-numeric variables will be removed prior to reducing. 
datetime64 and timedelta64 dtypes are treated as numeric for aggregation operations. Examples -------- @@ -1065,7 +1065,7 @@ def median( Notes ----- - Non-numeric variables will be removed prior to reducing. + Non-numeric variables will be removed prior to reducing. datetime64 and timedelta64 dtypes are treated as numeric for aggregation operations. Examples -------- @@ -1116,6 +1116,120 @@ def median( **kwargs, ) + def nunique( + self, + dim: Dims = None, + *, + skipna: bool | None = None, + equal_nan: bool | None = True, + keep_attrs: bool | None = None, + **kwargs: Any, + ) -> Self: + """ + Reduce this DataTree's data by applying ``nunique`` along some dimension(s). + + Parameters + ---------- + dim : str, Iterable of Hashable, "..." or None, default: None + Name of dimension[s] along which to apply ``nunique``. For e.g. ``dim="x"`` + or ``dim=["x", "y"]``. If "..." or None, will reduce over all dimensions. + skipna : bool or None, optional + If True, skip missing values (as marked by NaN). By default, only + skips missing values for float dtypes; other dtypes either do not + have a sentinel missing value (int) or ``skipna=True`` has not been + implemented (object, datetime64 or timedelta64). + equal_nan : bool or None, default: True + If ``skipna == False``, ``equal_nan`` determines whether null values + are counted as distinct values or not. Set ``equal_nan = True`` for + consistency with ``pandas.DataFrame.nunique``, or ``equal_nan = False`` + for consistency with the `Python array API `_. + keep_attrs : bool or None, optional + If True, ``attrs`` will be copied from the original + object to the new one. If False, the new object will be + returned without attributes. + **kwargs : Any + Additional keyword arguments passed on to the appropriate array + function for calculating ``nunique`` on this object's data. + These could include dask-specific kwargs like ``split_every``. + + Returns + ------- + reduced : DataTree + New DataTree with ``nunique`` applied to its data and the + indicated dimension(s) removed + + See Also + -------- + pandas.DataFrame.nunique + Dataset.nunique + DataArray.nunique + :ref:`agg` + User guide on reduction or aggregation operations. + + Notes + ----- + For dask arrays, there must be a single chunk in each dimension + nunique is being applied over. + + Examples + -------- + >>> dt = xr.DataTree( + ... xr.Dataset( + ... data_vars=dict(foo=("time", np.array([1, 2, 3, 0, 2, np.nan]))), + ... coords=dict( + ... time=( + ... "time", + ... pd.date_range("2001-01-01", freq="ME", periods=6), + ... ), + ... labels=("time", np.array(["a", "b", "c", "c", "b", "a"])), + ... ), + ... ), + ... ) + >>> dt + + Group: / + Dimensions: (time: 6) + Coordinates: + * time (time) datetime64[ns] 48B 2001-01-31 2001-02-28 ... 2001-06-30 + labels (time) >> dt.nunique() + + Group: / + Dimensions: () + Data variables: + foo int64 8B 5 + + Use ``skipna`` to control whether NaNs are ignored. + + >>> dt.nunique(skipna=False) + + Group: / + Dimensions: () + Data variables: + foo int64 8B 5 + + Use ``equal_nan`` to control whether NaNs are counted as distinct values. + + >>> dt.nunique(skipna=False, equal_nan=False) + + Group: / + Dimensions: () + Data variables: + foo int64 8B 5 + """ + return self.reduce( + duck_array_ops.nunique, + dim=dim, + skipna=skipna, + equal_nan=equal_nan, + numeric_only=False, + keep_attrs=keep_attrs, + **kwargs, + ) + def cumsum( self, dim: Dims = None, @@ -1164,7 +1278,7 @@ def cumsum( Notes ----- - Non-numeric variables will be removed prior to reducing. 
+ Non-numeric variables will be removed prior to reducing. datetime64 and timedelta64 dtypes are treated as numeric for aggregation operations. Note that the methods on the ``cumulative`` method are more performant (with numbagg installed) and better supported. ``cumsum`` and ``cumprod`` may be deprecated @@ -1269,7 +1383,7 @@ def cumprod( Notes ----- - Non-numeric variables will be removed prior to reducing. + Non-numeric variables will be removed prior to reducing. datetime64 and timedelta64 dtypes are treated as numeric for aggregation operations. Note that the methods on the ``cumulative`` method are more performant (with numbagg installed) and better supported. ``cumsum`` and ``cumprod`` may be deprecated @@ -1776,6 +1890,10 @@ def mean( :ref:`agg` User guide on reduction or aggregation operations. + Notes + ----- + Non-numeric variables will be removed prior to reducing. datetime64 and timedelta64 dtypes are treated as numeric for aggregation operations. + Examples -------- >>> da = xr.DataArray( @@ -1872,7 +1990,7 @@ def prod( Notes ----- - Non-numeric variables will be removed prior to reducing. + Non-numeric variables will be removed prior to reducing. datetime64 and timedelta64 dtypes are treated as numeric for aggregation operations. Examples -------- @@ -1979,7 +2097,7 @@ def sum( Notes ----- - Non-numeric variables will be removed prior to reducing. + Non-numeric variables will be removed prior to reducing. datetime64 and timedelta64 dtypes are treated as numeric for aggregation operations. Examples -------- @@ -2083,7 +2201,7 @@ def std( Notes ----- - Non-numeric variables will be removed prior to reducing. + Non-numeric variables will be removed prior to reducing. datetime64 and timedelta64 dtypes are treated as numeric for aggregation operations. Examples -------- @@ -2187,7 +2305,7 @@ def var( Notes ----- - Non-numeric variables will be removed prior to reducing. + Non-numeric variables will be removed prior to reducing. datetime64 and timedelta64 dtypes are treated as numeric for aggregation operations. Examples -------- @@ -2287,7 +2405,7 @@ def median( Notes ----- - Non-numeric variables will be removed prior to reducing. + Non-numeric variables will be removed prior to reducing. datetime64 and timedelta64 dtypes are treated as numeric for aggregation operations. Examples -------- @@ -2332,6 +2450,112 @@ def median( **kwargs, ) + def nunique( + self, + dim: Dims = None, + *, + skipna: bool | None = None, + equal_nan: bool | None = True, + keep_attrs: bool | None = None, + **kwargs: Any, + ) -> Self: + """ + Reduce this Dataset's data by applying ``nunique`` along some dimension(s). + + Parameters + ---------- + dim : str, Iterable of Hashable, "..." or None, default: None + Name of dimension[s] along which to apply ``nunique``. For e.g. ``dim="x"`` + or ``dim=["x", "y"]``. If "..." or None, will reduce over all dimensions. + skipna : bool or None, optional + If True, skip missing values (as marked by NaN). By default, only + skips missing values for float dtypes; other dtypes either do not + have a sentinel missing value (int) or ``skipna=True`` has not been + implemented (object, datetime64 or timedelta64). + equal_nan : bool or None, default: True + If ``skipna == False``, ``equal_nan`` determines whether null values + are counted as distinct values or not. Set ``equal_nan = True`` for + consistency with ``pandas.DataFrame.nunique``, or ``equal_nan = False`` + for consistency with the `Python array API `_. 
+ keep_attrs : bool or None, optional + If True, ``attrs`` will be copied from the original + object to the new one. If False, the new object will be + returned without attributes. + **kwargs : Any + Additional keyword arguments passed on to the appropriate array + function for calculating ``nunique`` on this object's data. + These could include dask-specific kwargs like ``split_every``. + + Returns + ------- + reduced : Dataset + New Dataset with ``nunique`` applied to its data and the + indicated dimension(s) removed + + See Also + -------- + pandas.DataFrame.nunique + DataArray.nunique + :ref:`agg` + User guide on reduction or aggregation operations. + + Notes + ----- + For dask arrays, there must be a single chunk in each dimension + nunique is being applied over. + + Examples + -------- + >>> da = xr.DataArray( + ... np.array([1, 2, 3, 0, 2, np.nan]), + ... dims="time", + ... coords=dict( + ... time=("time", pd.date_range("2001-01-01", freq="ME", periods=6)), + ... labels=("time", np.array(["a", "b", "c", "c", "b", "a"])), + ... ), + ... ) + >>> ds = xr.Dataset(dict(da=da)) + >>> ds + Size: 120B + Dimensions: (time: 6) + Coordinates: + * time (time) datetime64[ns] 48B 2001-01-31 2001-02-28 ... 2001-06-30 + labels (time) >> ds.nunique() + Size: 8B + Dimensions: () + Data variables: + da int64 8B 5 + + Use ``skipna`` to control whether NaNs are ignored. + + >>> ds.nunique(skipna=False) + Size: 8B + Dimensions: () + Data variables: + da int64 8B 5 + + Use ``equal_nan`` to control whether NaNs are counted as distinct values. + + >>> ds.nunique(skipna=False, equal_nan=False) + Size: 8B + Dimensions: () + Data variables: + da int64 8B 5 + """ + return self.reduce( + duck_array_ops.nunique, + dim=dim, + skipna=skipna, + equal_nan=equal_nan, + numeric_only=False, + keep_attrs=keep_attrs, + **kwargs, + ) + def cumsum( self, dim: Dims = None, @@ -2379,7 +2603,7 @@ def cumsum( Notes ----- - Non-numeric variables will be removed prior to reducing. + Non-numeric variables will be removed prior to reducing. datetime64 and timedelta64 dtypes are treated as numeric for aggregation operations. Note that the methods on the ``cumulative`` method are more performant (with numbagg installed) and better supported. ``cumsum`` and ``cumprod`` may be deprecated @@ -2477,7 +2701,7 @@ def cumprod( Notes ----- - Non-numeric variables will be removed prior to reducing. + Non-numeric variables will be removed prior to reducing. datetime64 and timedelta64 dtypes are treated as numeric for aggregation operations. Note that the methods on the ``cumulative`` method are more performant (with numbagg installed) and better supported. ``cumsum`` and ``cumprod`` may be deprecated @@ -2944,6 +3168,10 @@ def mean( :ref:`agg` User guide on reduction or aggregation operations. + Notes + ----- + Non-numeric variables will be removed prior to reducing. datetime64 and timedelta64 dtypes are treated as numeric for aggregation operations. + Examples -------- >>> da = xr.DataArray( @@ -3032,7 +3260,7 @@ def prod( Notes ----- - Non-numeric variables will be removed prior to reducing. + Non-numeric variables will be removed prior to reducing. datetime64 and timedelta64 dtypes are treated as numeric for aggregation operations. Examples -------- @@ -3129,7 +3357,7 @@ def sum( Notes ----- - Non-numeric variables will be removed prior to reducing. + Non-numeric variables will be removed prior to reducing. datetime64 and timedelta64 dtypes are treated as numeric for aggregation operations. 
Examples -------- @@ -3223,7 +3451,7 @@ def std( Notes ----- - Non-numeric variables will be removed prior to reducing. + Non-numeric variables will be removed prior to reducing. datetime64 and timedelta64 dtypes are treated as numeric for aggregation operations. Examples -------- @@ -3317,7 +3545,7 @@ def var( Notes ----- - Non-numeric variables will be removed prior to reducing. + Non-numeric variables will be removed prior to reducing. datetime64 and timedelta64 dtypes are treated as numeric for aggregation operations. Examples -------- @@ -3407,7 +3635,7 @@ def median( Notes ----- - Non-numeric variables will be removed prior to reducing. + Non-numeric variables will be removed prior to reducing. datetime64 and timedelta64 dtypes are treated as numeric for aggregation operations. Examples -------- @@ -3444,6 +3672,102 @@ def median( **kwargs, ) + def nunique( + self, + dim: Dims = None, + *, + skipna: bool | None = None, + equal_nan: bool | None = True, + keep_attrs: bool | None = None, + **kwargs: Any, + ) -> Self: + """ + Reduce this DataArray's data by applying ``nunique`` along some dimension(s). + + Parameters + ---------- + dim : str, Iterable of Hashable, "..." or None, default: None + Name of dimension[s] along which to apply ``nunique``. For e.g. ``dim="x"`` + or ``dim=["x", "y"]``. If "..." or None, will reduce over all dimensions. + skipna : bool or None, optional + If True, skip missing values (as marked by NaN). By default, only + skips missing values for float dtypes; other dtypes either do not + have a sentinel missing value (int) or ``skipna=True`` has not been + implemented (object, datetime64 or timedelta64). + equal_nan : bool or None, default: True + If ``skipna == False``, ``equal_nan`` determines whether null values + are counted as distinct values or not. Set ``equal_nan = True`` for + consistency with ``pandas.DataFrame.nunique``, or ``equal_nan = False`` + for consistency with the `Python array API `_. + keep_attrs : bool or None, optional + If True, ``attrs`` will be copied from the original + object to the new one. If False, the new object will be + returned without attributes. + **kwargs : Any + Additional keyword arguments passed on to the appropriate array + function for calculating ``nunique`` on this object's data. + These could include dask-specific kwargs like ``split_every``. + + Returns + ------- + reduced : DataArray + New DataArray with ``nunique`` applied to its data and the + indicated dimension(s) removed + + See Also + -------- + pandas.DataFrame.nunique + Dataset.nunique + :ref:`agg` + User guide on reduction or aggregation operations. + + Notes + ----- + For dask arrays, there must be a single chunk in each dimension + nunique is being applied over. + + Examples + -------- + >>> da = xr.DataArray( + ... np.array([1, 2, 3, 0, 2, np.nan]), + ... dims="time", + ... coords=dict( + ... time=("time", pd.date_range("2001-01-01", freq="ME", periods=6)), + ... labels=("time", np.array(["a", "b", "c", "c", "b", "a"])), + ... ), + ... ) + >>> da + Size: 48B + array([ 1., 2., 3., 0., 2., nan]) + Coordinates: + * time (time) datetime64[ns] 48B 2001-01-31 2001-02-28 ... 2001-06-30 + labels (time) >> da.nunique() + Size: 8B + array(5) + + Use ``skipna`` to control whether NaNs are ignored. + + >>> da.nunique(skipna=False) + Size: 8B + array(5) + + Use ``equal_nan`` to control whether NaNs are counted as distinct values. 
+ + >>> da.nunique(skipna=False, equal_nan=False) + Size: 8B + array(5) + """ + return self.reduce( + duck_array_ops.nunique, + dim=dim, + skipna=skipna, + equal_nan=equal_nan, + keep_attrs=keep_attrs, + **kwargs, + ) + def cumsum( self, dim: Dims = None, @@ -3491,7 +3815,7 @@ def cumsum( Notes ----- - Non-numeric variables will be removed prior to reducing. + Non-numeric variables will be removed prior to reducing. datetime64 and timedelta64 dtypes are treated as numeric for aggregation operations. Note that the methods on the ``cumulative`` method are more performant (with numbagg installed) and better supported. ``cumsum`` and ``cumprod`` may be deprecated @@ -3585,7 +3909,7 @@ def cumprod( Notes ----- - Non-numeric variables will be removed prior to reducing. + Non-numeric variables will be removed prior to reducing. datetime64 and timedelta64 dtypes are treated as numeric for aggregation operations. Note that the methods on the ``cumulative`` method are more performant (with numbagg installed) and better supported. ``cumsum`` and ``cumprod`` may be deprecated @@ -4223,6 +4547,8 @@ def mean( Pass flox-specific keyword arguments in ``**kwargs``. See the `flox documentation `_ for more. + Non-numeric variables will be removed prior to reducing. datetime64 and timedelta64 dtypes are treated as numeric for aggregation operations. + Examples -------- >>> da = xr.DataArray( @@ -4344,7 +4670,7 @@ def prod( Pass flox-specific keyword arguments in ``**kwargs``. See the `flox documentation `_ for more. - Non-numeric variables will be removed prior to reducing. + Non-numeric variables will be removed prior to reducing. datetime64 and timedelta64 dtypes are treated as numeric for aggregation operations. Examples -------- @@ -4479,7 +4805,7 @@ def sum( Pass flox-specific keyword arguments in ``**kwargs``. See the `flox documentation `_ for more. - Non-numeric variables will be removed prior to reducing. + Non-numeric variables will be removed prior to reducing. datetime64 and timedelta64 dtypes are treated as numeric for aggregation operations. Examples -------- @@ -4611,7 +4937,7 @@ def std( Pass flox-specific keyword arguments in ``**kwargs``. See the `flox documentation `_ for more. - Non-numeric variables will be removed prior to reducing. + Non-numeric variables will be removed prior to reducing. datetime64 and timedelta64 dtypes are treated as numeric for aggregation operations. Examples -------- @@ -4743,7 +5069,7 @@ def var( Pass flox-specific keyword arguments in ``**kwargs``. See the `flox documentation `_ for more. - Non-numeric variables will be removed prior to reducing. + Non-numeric variables will be removed prior to reducing. datetime64 and timedelta64 dtypes are treated as numeric for aggregation operations. Examples -------- @@ -4871,7 +5197,7 @@ def median( Pass flox-specific keyword arguments in ``**kwargs``. See the `flox documentation `_ for more. - Non-numeric variables will be removed prior to reducing. + Non-numeric variables will be removed prior to reducing. datetime64 and timedelta64 dtypes are treated as numeric for aggregation operations. Examples -------- @@ -4920,6 +5246,124 @@ def median( **kwargs, ) + def nunique( + self, + dim: Dims = None, + *, + skipna: bool | None = None, + equal_nan: bool | None = True, + keep_attrs: bool | None = None, + **kwargs: Any, + ) -> Dataset: + """ + Reduce this Dataset's data by applying ``nunique`` along some dimension(s). + + Parameters + ---------- + dim : str, Iterable of Hashable, "..." 
or None, default: None + Name of dimension[s] along which to apply ``nunique``. For e.g. ``dim="x"`` + or ``dim=["x", "y"]``. If None, will reduce over the GroupBy dimensions. + If "...", will reduce over all dimensions. + skipna : bool or None, optional + If True, skip missing values (as marked by NaN). By default, only + skips missing values for float dtypes; other dtypes either do not + have a sentinel missing value (int) or ``skipna=True`` has not been + implemented (object, datetime64 or timedelta64). + equal_nan : bool or None, default: True + If ``skipna == False``, ``equal_nan`` determines whether null values + are counted as distinct values or not. Set ``equal_nan = True`` for + consistency with ``pandas.DataFrame.nunique``, or ``equal_nan = False`` + for consistency with the `Python array API `_. + keep_attrs : bool or None, optional + If True, ``attrs`` will be copied from the original + object to the new one. If False, the new object will be + returned without attributes. + **kwargs : Any + Additional keyword arguments passed on to the appropriate array + function for calculating ``nunique`` on this object's data. + These could include dask-specific kwargs like ``split_every``. + + Returns + ------- + reduced : Dataset + New Dataset with ``nunique`` applied to its data and the + indicated dimension(s) removed + + See Also + -------- + pandas.DataFrame.nunique + Dataset.nunique + :ref:`groupby` + User guide on groupby operations. + + Notes + ----- + Use the ``flox`` package to significantly speed up groupby computations, + especially with dask arrays. Xarray will use flox by default if installed. + Pass flox-specific keyword arguments in ``**kwargs``. + See the `flox documentation `_ for more. + + For dask arrays, there must be a single chunk in each dimension + nunique is being applied over. + + Examples + -------- + >>> da = xr.DataArray( + ... np.array([1, 2, 3, 0, 2, np.nan]), + ... dims="time", + ... coords=dict( + ... time=("time", pd.date_range("2001-01-01", freq="ME", periods=6)), + ... labels=("time", np.array(["a", "b", "c", "c", "b", "a"])), + ... ), + ... ) + >>> ds = xr.Dataset(dict(da=da)) + >>> ds + Size: 120B + Dimensions: (time: 6) + Coordinates: + * time (time) datetime64[ns] 48B 2001-01-31 2001-02-28 ... 2001-06-30 + labels (time) >> ds.groupby("labels").nunique() + Size: 48B + Dimensions: (labels: 3) + Coordinates: + * labels (labels) object 24B 'a' 'b' 'c' + Data variables: + da (labels) int64 24B 2 1 2 + + Use ``skipna`` to control whether NaNs are ignored. + + >>> ds.groupby("labels").nunique(skipna=False) + Size: 48B + Dimensions: (labels: 3) + Coordinates: + * labels (labels) object 24B 'a' 'b' 'c' + Data variables: + da (labels) int64 24B 2 1 2 + + Use ``equal_nan`` to control whether NaNs are counted as distinct values. + + >>> ds.groupby("labels").nunique(skipna=False, equal_nan=False) + Size: 48B + Dimensions: (labels: 3) + Coordinates: + * labels (labels) object 24B 'a' 'b' 'c' + Data variables: + da (labels) int64 24B 2 1 2 + """ + return self.reduce( + duck_array_ops.nunique, + dim=dim, + skipna=skipna, + equal_nan=equal_nan, + numeric_only=False, + keep_attrs=keep_attrs, + **kwargs, + ) + def cumsum( self, dim: Dims = None, @@ -4973,7 +5417,7 @@ def cumsum( Pass flox-specific keyword arguments in ``**kwargs``. See the `flox documentation `_ for more. - Non-numeric variables will be removed prior to reducing. + Non-numeric variables will be removed prior to reducing. 
datetime64 and timedelta64 dtypes are treated as numeric for aggregation operations. Note that the methods on the ``cumulative`` method are more performant (with numbagg installed) and better supported. ``cumsum`` and ``cumprod`` may be deprecated @@ -5077,7 +5521,7 @@ def cumprod( Pass flox-specific keyword arguments in ``**kwargs``. See the `flox documentation `_ for more. - Non-numeric variables will be removed prior to reducing. + Non-numeric variables will be removed prior to reducing. datetime64 and timedelta64 dtypes are treated as numeric for aggregation operations. Note that the methods on the ``cumulative`` method are more performant (with numbagg installed) and better supported. ``cumsum`` and ``cumprod`` may be deprecated @@ -5719,6 +6163,8 @@ def mean( Pass flox-specific keyword arguments in ``**kwargs``. See the `flox documentation `_ for more. + Non-numeric variables will be removed prior to reducing. datetime64 and timedelta64 dtypes are treated as numeric for aggregation operations. + Examples -------- >>> da = xr.DataArray( @@ -5840,7 +6286,7 @@ def prod( Pass flox-specific keyword arguments in ``**kwargs``. See the `flox documentation `_ for more. - Non-numeric variables will be removed prior to reducing. + Non-numeric variables will be removed prior to reducing. datetime64 and timedelta64 dtypes are treated as numeric for aggregation operations. Examples -------- @@ -5975,7 +6421,7 @@ def sum( Pass flox-specific keyword arguments in ``**kwargs``. See the `flox documentation `_ for more. - Non-numeric variables will be removed prior to reducing. + Non-numeric variables will be removed prior to reducing. datetime64 and timedelta64 dtypes are treated as numeric for aggregation operations. Examples -------- @@ -6107,7 +6553,7 @@ def std( Pass flox-specific keyword arguments in ``**kwargs``. See the `flox documentation `_ for more. - Non-numeric variables will be removed prior to reducing. + Non-numeric variables will be removed prior to reducing. datetime64 and timedelta64 dtypes are treated as numeric for aggregation operations. Examples -------- @@ -6206,29 +6652,157 @@ def var( skips missing values for float dtypes; other dtypes either do not have a sentinel missing value (int) or ``skipna=True`` has not been implemented (object, datetime64 or timedelta64). - ddof : int, default: 0 - “Delta Degrees of Freedom”: the divisor used in the calculation is ``N - ddof``, - where ``N`` represents the number of elements. + ddof : int, default: 0 + “Delta Degrees of Freedom”: the divisor used in the calculation is ``N - ddof``, + where ``N`` represents the number of elements. + keep_attrs : bool or None, optional + If True, ``attrs`` will be copied from the original + object to the new one. If False, the new object will be + returned without attributes. + **kwargs : Any + Additional keyword arguments passed on to the appropriate array + function for calculating ``var`` on this object's data. + These could include dask-specific kwargs like ``split_every``. + + Returns + ------- + reduced : Dataset + New Dataset with ``var`` applied to its data and the + indicated dimension(s) removed + + See Also + -------- + numpy.var + dask.array.var + Dataset.var + :ref:`resampling` + User guide on resampling operations. + + Notes + ----- + Use the ``flox`` package to significantly speed up resampling computations, + especially with dask arrays. Xarray will use flox by default if installed. + Pass flox-specific keyword arguments in ``**kwargs``. + See the `flox documentation `_ for more. 
+ + Non-numeric variables will be removed prior to reducing. datetime64 and timedelta64 dtypes are treated as numeric for aggregation operations. + + Examples + -------- + >>> da = xr.DataArray( + ... np.array([1, 2, 3, 0, 2, np.nan]), + ... dims="time", + ... coords=dict( + ... time=("time", pd.date_range("2001-01-01", freq="ME", periods=6)), + ... labels=("time", np.array(["a", "b", "c", "c", "b", "a"])), + ... ), + ... ) + >>> ds = xr.Dataset(dict(da=da)) + >>> ds + Size: 120B + Dimensions: (time: 6) + Coordinates: + * time (time) datetime64[ns] 48B 2001-01-31 2001-02-28 ... 2001-06-30 + labels (time) >> ds.resample(time="3ME").var() + Size: 48B + Dimensions: (time: 3) + Coordinates: + * time (time) datetime64[ns] 24B 2001-01-31 2001-04-30 2001-07-31 + Data variables: + da (time) float64 24B 0.0 1.556 0.0 + + Use ``skipna`` to control whether NaNs are ignored. + + >>> ds.resample(time="3ME").var(skipna=False) + Size: 48B + Dimensions: (time: 3) + Coordinates: + * time (time) datetime64[ns] 24B 2001-01-31 2001-04-30 2001-07-31 + Data variables: + da (time) float64 24B 0.0 1.556 nan + + Specify ``ddof=1`` for an unbiased estimate. + + >>> ds.resample(time="3ME").var(skipna=True, ddof=1) + Size: 48B + Dimensions: (time: 3) + Coordinates: + * time (time) datetime64[ns] 24B 2001-01-31 2001-04-30 2001-07-31 + Data variables: + da (time) float64 24B nan 2.333 nan + """ + if ( + flox_available + and OPTIONS["use_flox"] + and contains_only_chunked_or_numpy(self._obj) + ): + return self._flox_reduce( + func="var", + dim=dim, + skipna=skipna, + ddof=ddof, + numeric_only=True, + # fill_value=fill_value, + keep_attrs=keep_attrs, + **kwargs, + ) + else: + return self.reduce( + duck_array_ops.var, + dim=dim, + skipna=skipna, + ddof=ddof, + numeric_only=True, + keep_attrs=keep_attrs, + **kwargs, + ) + + def median( + self, + dim: Dims = None, + *, + skipna: bool | None = None, + keep_attrs: bool | None = None, + **kwargs: Any, + ) -> Dataset: + """ + Reduce this Dataset's data by applying ``median`` along some dimension(s). + + Parameters + ---------- + dim : str, Iterable of Hashable, "..." or None, default: None + Name of dimension[s] along which to apply ``median``. For e.g. ``dim="x"`` + or ``dim=["x", "y"]``. If None, will reduce over the Resample dimensions. + If "...", will reduce over all dimensions. + skipna : bool or None, optional + If True, skip missing values (as marked by NaN). By default, only + skips missing values for float dtypes; other dtypes either do not + have a sentinel missing value (int) or ``skipna=True`` has not been + implemented (object, datetime64 or timedelta64). keep_attrs : bool or None, optional If True, ``attrs`` will be copied from the original object to the new one. If False, the new object will be returned without attributes. **kwargs : Any Additional keyword arguments passed on to the appropriate array - function for calculating ``var`` on this object's data. + function for calculating ``median`` on this object's data. These could include dask-specific kwargs like ``split_every``. Returns ------- reduced : Dataset - New Dataset with ``var`` applied to its data and the + New Dataset with ``median`` applied to its data and the indicated dimension(s) removed See Also -------- - numpy.var - dask.array.var - Dataset.var + numpy.median + dask.array.median + Dataset.median :ref:`resampling` User guide on resampling operations. @@ -6239,7 +6813,7 @@ def var( Pass flox-specific keyword arguments in ``**kwargs``. See the `flox documentation `_ for more. 
- Non-numeric variables will be removed prior to reducing. + Non-numeric variables will be removed prior to reducing. datetime64 and timedelta64 dtypes are treated as numeric for aggregation operations. Examples -------- @@ -6261,75 +6835,49 @@ def var( Data variables: da (time) float64 48B 1.0 2.0 3.0 0.0 2.0 nan - >>> ds.resample(time="3ME").var() + >>> ds.resample(time="3ME").median() Size: 48B Dimensions: (time: 3) Coordinates: * time (time) datetime64[ns] 24B 2001-01-31 2001-04-30 2001-07-31 Data variables: - da (time) float64 24B 0.0 1.556 0.0 + da (time) float64 24B 1.0 2.0 2.0 Use ``skipna`` to control whether NaNs are ignored. - >>> ds.resample(time="3ME").var(skipna=False) - Size: 48B - Dimensions: (time: 3) - Coordinates: - * time (time) datetime64[ns] 24B 2001-01-31 2001-04-30 2001-07-31 - Data variables: - da (time) float64 24B 0.0 1.556 nan - - Specify ``ddof=1`` for an unbiased estimate. - - >>> ds.resample(time="3ME").var(skipna=True, ddof=1) + >>> ds.resample(time="3ME").median(skipna=False) Size: 48B Dimensions: (time: 3) Coordinates: * time (time) datetime64[ns] 24B 2001-01-31 2001-04-30 2001-07-31 Data variables: - da (time) float64 24B nan 2.333 nan + da (time) float64 24B 1.0 2.0 nan """ - if ( - flox_available - and OPTIONS["use_flox"] - and contains_only_chunked_or_numpy(self._obj) - ): - return self._flox_reduce( - func="var", - dim=dim, - skipna=skipna, - ddof=ddof, - numeric_only=True, - # fill_value=fill_value, - keep_attrs=keep_attrs, - **kwargs, - ) - else: - return self.reduce( - duck_array_ops.var, - dim=dim, - skipna=skipna, - ddof=ddof, - numeric_only=True, - keep_attrs=keep_attrs, - **kwargs, - ) + return self.reduce( + duck_array_ops.median, + dim=dim, + skipna=skipna, + numeric_only=True, + keep_attrs=keep_attrs, + **kwargs, + ) - def median( + def nunique( self, dim: Dims = None, *, skipna: bool | None = None, + equal_nan: bool | None = True, keep_attrs: bool | None = None, **kwargs: Any, ) -> Dataset: """ - Reduce this Dataset's data by applying ``median`` along some dimension(s). + Reduce this Dataset's data by applying ``nunique`` along some dimension(s). Parameters ---------- dim : str, Iterable of Hashable, "..." or None, default: None - Name of dimension[s] along which to apply ``median``. For e.g. ``dim="x"`` + Name of dimension[s] along which to apply ``nunique``. For e.g. ``dim="x"`` or ``dim=["x", "y"]``. If None, will reduce over the Resample dimensions. If "...", will reduce over all dimensions. skipna : bool or None, optional @@ -6337,26 +6885,30 @@ def median( skips missing values for float dtypes; other dtypes either do not have a sentinel missing value (int) or ``skipna=True`` has not been implemented (object, datetime64 or timedelta64). + equal_nan : bool or None, default: True + If ``skipna == False``, ``equal_nan`` determines whether null values + are counted as distinct values or not. Set ``equal_nan = True`` for + consistency with ``pandas.DataFrame.nunique``, or ``equal_nan = False`` + for consistency with the `Python array API `_. keep_attrs : bool or None, optional If True, ``attrs`` will be copied from the original object to the new one. If False, the new object will be returned without attributes. **kwargs : Any Additional keyword arguments passed on to the appropriate array - function for calculating ``median`` on this object's data. + function for calculating ``nunique`` on this object's data. These could include dask-specific kwargs like ``split_every``. 
Returns ------- reduced : Dataset - New Dataset with ``median`` applied to its data and the + New Dataset with ``nunique`` applied to its data and the indicated dimension(s) removed See Also -------- - numpy.median - dask.array.median - Dataset.median + pandas.DataFrame.nunique + Dataset.nunique :ref:`resampling` User guide on resampling operations. @@ -6367,7 +6919,8 @@ def median( Pass flox-specific keyword arguments in ``**kwargs``. See the `flox documentation `_ for more. - Non-numeric variables will be removed prior to reducing. + For dask arrays, there must be a single chunk in each dimension + nunique is being applied over. Examples -------- @@ -6389,29 +6942,40 @@ def median( Data variables: da (time) float64 48B 1.0 2.0 3.0 0.0 2.0 nan - >>> ds.resample(time="3ME").median() + >>> ds.resample(time="3ME").nunique() Size: 48B Dimensions: (time: 3) Coordinates: * time (time) datetime64[ns] 24B 2001-01-31 2001-04-30 2001-07-31 Data variables: - da (time) float64 24B 1.0 2.0 2.0 + da (time) int64 24B 1 3 2 Use ``skipna`` to control whether NaNs are ignored. - >>> ds.resample(time="3ME").median(skipna=False) + >>> ds.resample(time="3ME").nunique(skipna=False) Size: 48B Dimensions: (time: 3) Coordinates: * time (time) datetime64[ns] 24B 2001-01-31 2001-04-30 2001-07-31 Data variables: - da (time) float64 24B 1.0 2.0 nan + da (time) int64 24B 1 3 2 + + Use ``equal_nan`` to control whether NaNs are counted as distinct values. + + >>> ds.resample(time="3ME").nunique(skipna=False, equal_nan=False) + Size: 48B + Dimensions: (time: 3) + Coordinates: + * time (time) datetime64[ns] 24B 2001-01-31 2001-04-30 2001-07-31 + Data variables: + da (time) int64 24B 1 3 2 """ return self.reduce( - duck_array_ops.median, + duck_array_ops.nunique, dim=dim, skipna=skipna, - numeric_only=True, + equal_nan=equal_nan, + numeric_only=False, keep_attrs=keep_attrs, **kwargs, ) @@ -6469,7 +7033,7 @@ def cumsum( Pass flox-specific keyword arguments in ``**kwargs``. See the `flox documentation `_ for more. - Non-numeric variables will be removed prior to reducing. + Non-numeric variables will be removed prior to reducing. datetime64 and timedelta64 dtypes are treated as numeric for aggregation operations. Note that the methods on the ``cumulative`` method are more performant (with numbagg installed) and better supported. ``cumsum`` and ``cumprod`` may be deprecated @@ -6573,7 +7137,7 @@ def cumprod( Pass flox-specific keyword arguments in ``**kwargs``. See the `flox documentation `_ for more. - Non-numeric variables will be removed prior to reducing. + Non-numeric variables will be removed prior to reducing. datetime64 and timedelta64 dtypes are treated as numeric for aggregation operations. Note that the methods on the ``cumulative`` method are more performant (with numbagg installed) and better supported. ``cumsum`` and ``cumprod`` may be deprecated @@ -7176,6 +7740,8 @@ def mean( Pass flox-specific keyword arguments in ``**kwargs``. See the `flox documentation `_ for more. + Non-numeric variables will be removed prior to reducing. datetime64 and timedelta64 dtypes are treated as numeric for aggregation operations. + Examples -------- >>> da = xr.DataArray( @@ -7288,7 +7854,7 @@ def prod( Pass flox-specific keyword arguments in ``**kwargs``. See the `flox documentation `_ for more. - Non-numeric variables will be removed prior to reducing. + Non-numeric variables will be removed prior to reducing. datetime64 and timedelta64 dtypes are treated as numeric for aggregation operations. 
Examples -------- @@ -7412,7 +7978,7 @@ def sum( Pass flox-specific keyword arguments in ``**kwargs``. See the `flox documentation `_ for more. - Non-numeric variables will be removed prior to reducing. + Non-numeric variables will be removed prior to reducing. datetime64 and timedelta64 dtypes are treated as numeric for aggregation operations. Examples -------- @@ -7533,7 +8099,7 @@ def std( Pass flox-specific keyword arguments in ``**kwargs``. See the `flox documentation `_ for more. - Non-numeric variables will be removed prior to reducing. + Non-numeric variables will be removed prior to reducing. datetime64 and timedelta64 dtypes are treated as numeric for aggregation operations. Examples -------- @@ -7654,7 +8220,7 @@ def var( Pass flox-specific keyword arguments in ``**kwargs``. See the `flox documentation `_ for more. - Non-numeric variables will be removed prior to reducing. + Non-numeric variables will be removed prior to reducing. datetime64 and timedelta64 dtypes are treated as numeric for aggregation operations. Examples -------- @@ -7771,7 +8337,7 @@ def median( Pass flox-specific keyword arguments in ``**kwargs``. See the `flox documentation `_ for more. - Non-numeric variables will be removed prior to reducing. + Non-numeric variables will be removed prior to reducing. datetime64 and timedelta64 dtypes are treated as numeric for aggregation operations. Examples -------- @@ -7812,6 +8378,114 @@ def median( **kwargs, ) + def nunique( + self, + dim: Dims = None, + *, + skipna: bool | None = None, + equal_nan: bool | None = True, + keep_attrs: bool | None = None, + **kwargs: Any, + ) -> DataArray: + """ + Reduce this DataArray's data by applying ``nunique`` along some dimension(s). + + Parameters + ---------- + dim : str, Iterable of Hashable, "..." or None, default: None + Name of dimension[s] along which to apply ``nunique``. For e.g. ``dim="x"`` + or ``dim=["x", "y"]``. If None, will reduce over the GroupBy dimensions. + If "...", will reduce over all dimensions. + skipna : bool or None, optional + If True, skip missing values (as marked by NaN). By default, only + skips missing values for float dtypes; other dtypes either do not + have a sentinel missing value (int) or ``skipna=True`` has not been + implemented (object, datetime64 or timedelta64). + equal_nan : bool or None, default: True + If ``skipna == False``, ``equal_nan`` determines whether null values + are counted as distinct values or not. Set ``equal_nan = True`` for + consistency with ``pandas.DataFrame.nunique``, or ``equal_nan = False`` + for consistency with the `Python array API `_. + keep_attrs : bool or None, optional + If True, ``attrs`` will be copied from the original + object to the new one. If False, the new object will be + returned without attributes. + **kwargs : Any + Additional keyword arguments passed on to the appropriate array + function for calculating ``nunique`` on this object's data. + These could include dask-specific kwargs like ``split_every``. + + Returns + ------- + reduced : DataArray + New DataArray with ``nunique`` applied to its data and the + indicated dimension(s) removed + + See Also + -------- + pandas.DataFrame.nunique + DataArray.nunique + :ref:`groupby` + User guide on groupby operations. + + Notes + ----- + Use the ``flox`` package to significantly speed up groupby computations, + especially with dask arrays. Xarray will use flox by default if installed. + Pass flox-specific keyword arguments in ``**kwargs``. + See the `flox documentation `_ for more. 
+ + For dask arrays, there must be a single chunk in each dimension + nunique is being applied over. + + Examples + -------- + >>> da = xr.DataArray( + ... np.array([1, 2, 3, 0, 2, np.nan]), + ... dims="time", + ... coords=dict( + ... time=("time", pd.date_range("2001-01-01", freq="ME", periods=6)), + ... labels=("time", np.array(["a", "b", "c", "c", "b", "a"])), + ... ), + ... ) + >>> da + Size: 48B + array([ 1., 2., 3., 0., 2., nan]) + Coordinates: + * time (time) datetime64[ns] 48B 2001-01-31 2001-02-28 ... 2001-06-30 + labels (time) >> da.groupby("labels").nunique() + Size: 24B + array([2, 1, 2]) + Coordinates: + * labels (labels) object 24B 'a' 'b' 'c' + + Use ``skipna`` to control whether NaNs are ignored. + + >>> da.groupby("labels").nunique(skipna=False) + Size: 24B + array([2, 1, 2]) + Coordinates: + * labels (labels) object 24B 'a' 'b' 'c' + + Use ``equal_nan`` to control whether NaNs are counted as distinct values. + + >>> da.groupby("labels").nunique(skipna=False, equal_nan=False) + Size: 24B + array([2, 1, 2]) + Coordinates: + * labels (labels) object 24B 'a' 'b' 'c' + """ + return self.reduce( + duck_array_ops.nunique, + dim=dim, + skipna=skipna, + equal_nan=equal_nan, + keep_attrs=keep_attrs, + **kwargs, + ) + def cumsum( self, dim: Dims = None, @@ -7865,7 +8539,7 @@ def cumsum( Pass flox-specific keyword arguments in ``**kwargs``. See the `flox documentation `_ for more. - Non-numeric variables will be removed prior to reducing. + Non-numeric variables will be removed prior to reducing. datetime64 and timedelta64 dtypes are treated as numeric for aggregation operations. Note that the methods on the ``cumulative`` method are more performant (with numbagg installed) and better supported. ``cumsum`` and ``cumprod`` may be deprecated @@ -7965,7 +8639,7 @@ def cumprod( Pass flox-specific keyword arguments in ``**kwargs``. See the `flox documentation `_ for more. - Non-numeric variables will be removed prior to reducing. + Non-numeric variables will be removed prior to reducing. datetime64 and timedelta64 dtypes are treated as numeric for aggregation operations. Note that the methods on the ``cumulative`` method are more performant (with numbagg installed) and better supported. ``cumsum`` and ``cumprod`` may be deprecated @@ -8564,6 +9238,8 @@ def mean( Pass flox-specific keyword arguments in ``**kwargs``. See the `flox documentation `_ for more. + Non-numeric variables will be removed prior to reducing. datetime64 and timedelta64 dtypes are treated as numeric for aggregation operations. + Examples -------- >>> da = xr.DataArray( @@ -8676,7 +9352,7 @@ def prod( Pass flox-specific keyword arguments in ``**kwargs``. See the `flox documentation `_ for more. - Non-numeric variables will be removed prior to reducing. + Non-numeric variables will be removed prior to reducing. datetime64 and timedelta64 dtypes are treated as numeric for aggregation operations. Examples -------- @@ -8800,7 +9476,7 @@ def sum( Pass flox-specific keyword arguments in ``**kwargs``. See the `flox documentation `_ for more. - Non-numeric variables will be removed prior to reducing. + Non-numeric variables will be removed prior to reducing. datetime64 and timedelta64 dtypes are treated as numeric for aggregation operations. Examples -------- @@ -8921,7 +9597,7 @@ def std( Pass flox-specific keyword arguments in ``**kwargs``. See the `flox documentation `_ for more. - Non-numeric variables will be removed prior to reducing. + Non-numeric variables will be removed prior to reducing. 
datetime64 and timedelta64 dtypes are treated as numeric for aggregation operations. Examples -------- @@ -9042,7 +9718,7 @@ def var( Pass flox-specific keyword arguments in ``**kwargs``. See the `flox documentation `_ for more. - Non-numeric variables will be removed prior to reducing. + Non-numeric variables will be removed prior to reducing. datetime64 and timedelta64 dtypes are treated as numeric for aggregation operations. Examples -------- @@ -9159,7 +9835,7 @@ def median( Pass flox-specific keyword arguments in ``**kwargs``. See the `flox documentation `_ for more. - Non-numeric variables will be removed prior to reducing. + Non-numeric variables will be removed prior to reducing. datetime64 and timedelta64 dtypes are treated as numeric for aggregation operations. Examples -------- @@ -9200,6 +9876,114 @@ def median( **kwargs, ) + def nunique( + self, + dim: Dims = None, + *, + skipna: bool | None = None, + equal_nan: bool | None = True, + keep_attrs: bool | None = None, + **kwargs: Any, + ) -> DataArray: + """ + Reduce this DataArray's data by applying ``nunique`` along some dimension(s). + + Parameters + ---------- + dim : str, Iterable of Hashable, "..." or None, default: None + Name of dimension[s] along which to apply ``nunique``. For e.g. ``dim="x"`` + or ``dim=["x", "y"]``. If None, will reduce over the Resample dimensions. + If "...", will reduce over all dimensions. + skipna : bool or None, optional + If True, skip missing values (as marked by NaN). By default, only + skips missing values for float dtypes; other dtypes either do not + have a sentinel missing value (int) or ``skipna=True`` has not been + implemented (object, datetime64 or timedelta64). + equal_nan : bool or None, default: True + If ``skipna == False``, ``equal_nan`` determines whether null values + are counted as distinct values or not. Set ``equal_nan = True`` for + consistency with ``pandas.DataFrame.nunique``, or ``equal_nan = False`` + for consistency with the `Python array API `_. + keep_attrs : bool or None, optional + If True, ``attrs`` will be copied from the original + object to the new one. If False, the new object will be + returned without attributes. + **kwargs : Any + Additional keyword arguments passed on to the appropriate array + function for calculating ``nunique`` on this object's data. + These could include dask-specific kwargs like ``split_every``. + + Returns + ------- + reduced : DataArray + New DataArray with ``nunique`` applied to its data and the + indicated dimension(s) removed + + See Also + -------- + pandas.DataFrame.nunique + DataArray.nunique + :ref:`resampling` + User guide on resampling operations. + + Notes + ----- + Use the ``flox`` package to significantly speed up resampling computations, + especially with dask arrays. Xarray will use flox by default if installed. + Pass flox-specific keyword arguments in ``**kwargs``. + See the `flox documentation `_ for more. + + For dask arrays, there must be a single chunk in each dimension + nunique is being applied over. + + Examples + -------- + >>> da = xr.DataArray( + ... np.array([1, 2, 3, 0, 2, np.nan]), + ... dims="time", + ... coords=dict( + ... time=("time", pd.date_range("2001-01-01", freq="ME", periods=6)), + ... labels=("time", np.array(["a", "b", "c", "c", "b", "a"])), + ... ), + ... ) + >>> da + Size: 48B + array([ 1., 2., 3., 0., 2., nan]) + Coordinates: + * time (time) datetime64[ns] 48B 2001-01-31 2001-02-28 ... 
2001-06-30 + labels (time) >> da.resample(time="3ME").nunique() + Size: 24B + array([1, 3, 2]) + Coordinates: + * time (time) datetime64[ns] 24B 2001-01-31 2001-04-30 2001-07-31 + + Use ``skipna`` to control whether NaNs are ignored. + + >>> da.resample(time="3ME").nunique(skipna=False) + Size: 24B + array([1, 3, 2]) + Coordinates: + * time (time) datetime64[ns] 24B 2001-01-31 2001-04-30 2001-07-31 + + Use ``equal_nan`` to control whether NaNs are counted as distinct values. + + >>> da.resample(time="3ME").nunique(skipna=False, equal_nan=False) + Size: 24B + array([1, 3, 2]) + Coordinates: + * time (time) datetime64[ns] 24B 2001-01-31 2001-04-30 2001-07-31 + """ + return self.reduce( + duck_array_ops.nunique, + dim=dim, + skipna=skipna, + equal_nan=equal_nan, + keep_attrs=keep_attrs, + **kwargs, + ) + def cumsum( self, dim: Dims = None, @@ -9253,7 +10037,7 @@ def cumsum( Pass flox-specific keyword arguments in ``**kwargs``. See the `flox documentation `_ for more. - Non-numeric variables will be removed prior to reducing. + Non-numeric variables will be removed prior to reducing. datetime64 and timedelta64 dtypes are treated as numeric for aggregation operations. Note that the methods on the ``cumulative`` method are more performant (with numbagg installed) and better supported. ``cumsum`` and ``cumprod`` may be deprecated @@ -9353,7 +10137,7 @@ def cumprod( Pass flox-specific keyword arguments in ``**kwargs``. See the `flox documentation `_ for more. - Non-numeric variables will be removed prior to reducing. + Non-numeric variables will be removed prior to reducing. datetime64 and timedelta64 dtypes are treated as numeric for aggregation operations. Note that the methods on the ``cumulative`` method are more performant (with numbagg installed) and better supported. ``cumsum`` and ``cumprod`` may be deprecated diff --git a/xarray/core/duck_array_ops.py b/xarray/core/duck_array_ops.py index b8a4011a72e..6188206b73d 100644 --- a/xarray/core/duck_array_ops.py +++ b/xarray/core/duck_array_ops.py @@ -183,7 +183,7 @@ def isnull(data): dtype = xp.bool_ if hasattr(xp, "bool_") else xp.bool return full_like(data, dtype=dtype, fill_value=False) # at this point, array should have dtype=object - elif isinstance(data, np.ndarray) or pd.api.types.is_extension_array_dtype(data): # noqa: TID251 + elif isinstance(data, np.ndarray) or pd.api.types.is_extension_array_dtype(data): return pandas_isnull(data) else: # Not reachable yet, but intended for use with other duck array @@ -276,10 +276,14 @@ def as_shared_dtype(scalars_or_arrays, xp=None): isinstance(x, type(extension_array_types[0])) for x in extension_array_types ): return [ - x - if not isna(x) - else PandasExtensionArray( - type(non_nans[0].array)._from_sequence([x], dtype=non_nans[0].dtype) + ( + x + if not isna(x) + else PandasExtensionArray( + type(non_nans[0].array)._from_sequence( + [x], dtype=non_nans[0].dtype + ) + ) ) for x in scalars_or_arrays ] @@ -386,6 +390,94 @@ def count(data, axis=None): return xp.sum(xp.logical_not(isnull(data)), axis=axis) +def _factorize(data): + """Helper function for nunique to factorize mixed type arrays to float.""" + if not isinstance(data, np.ndarray): + message = "nunique with object dtype only implemented for np.ndarray." 
+ raise NotImplementedError(message) + data = pd.factorize(data.reshape(-1))[0].reshape(data.shape) + data = data.astype(float) + data[data == -1] = np.nan + return data + + +def _permute_dims(data, axes): + """Helper function to get a suitable permute dims function.""" + xp = get_array_namespace(data) + if hasattr(xp, "permute_dims"): + return xp.permute_dims(data, axes) + elif hasattr(xp, "transpose"): + return xp.transpose(data, axes) + else: + raise NotImplementedError(f"Unknown transpose method for namespace {xp}") + + +def nunique(data, axis=None, skipna=True, equal_nan=True, **kwargs): + """ + Count the number of unique values in this array along the given dimensions + """ + + xp = get_array_namespace(data) + + if axis is None: + axis = list(range(data.ndim)) + elif isinstance(axis, (int, tuple)): + axis = [axis] if isinstance(axis, int) else list(axis) + if axis == []: + # Return unchanged so downstream aggregation functions work as expected. + return data + # Normalize negative axes + axis = [ax % data.ndim for ax in axis] + shape = data.shape + if is_duck_dask_array(data): + # Store the original chunksizes along axis before reshaping array + axis_chunksizes = [s for i, s in enumerate(data.chunksize) if i in axis] + + # If mixed type array, convert to float first + if is_duck_array(data) and data.dtype == np.object_: + data = _factorize(data) + + # Move axes to be aggregated to the end and stack. + # Note dask arrays will get rechunked in the natural way. + new_order = [i for i in range(len(shape)) if i not in axis] + axis + new_shape = [s for i, s in enumerate(shape) if i not in axis] + [-1] + data = xp.reshape(_permute_dims(data, new_order), new_shape) + + def nunique_chunk(data): + """Compute nunique for a single chunk.""" + sorted_data = xp.sort(data, axis=-1) + unique_counts = xp.not_equal(sorted_data[..., :-1], sorted_data[..., 1:]) + unique_counts = xp.sum(unique_counts, axis=-1) + 1 + + # Subtract off na values as required + if skipna or (not skipna and equal_nan): + na_counts = isnull(data).astype(int) + na_counts = xp.sum(na_counts, axis=-1) + if not skipna and equal_nan: + na_counts = xp.clip(na_counts - 1, 0, None) + unique_counts = unique_counts - na_counts + + return unique_counts + + if is_duck_dask_array(data): + # Use map_blocks to preserve lazy evaluation + import dask.array as da + + allow_rechunk = kwargs.get("allow_rechunk", False) + if len(data.chunks[-1]) != 1 and not allow_rechunk: + message = f"""nunique requires a single chunk along aggregated dimension(s), + but input array has shape {shape!s} with chunksize(s) {axis_chunksizes!s} + along dimension(s) {axis!s}. To fix, either rechunk your array manually, + or pass ``allow_rechunk=True``, but be aware this may significantly increase + memory usage.""" + raise ValueError(message) + + kwargs = {"output_dtypes": [int], "allow_rechunk": True, "vectorize": True} + return da.apply_gufunc(nunique_chunk, "(i)->()", data, **kwargs) + else: + return nunique_chunk(data) + + def sum_where(data, axis=None, dtype=None, where=None): xp = get_array_namespace(data) if where is not None: diff --git a/xarray/namedarray/_aggregations.py b/xarray/namedarray/_aggregations.py index c5726ef9251..03fe84b980d 100644 --- a/xarray/namedarray/_aggregations.py +++ b/xarray/namedarray/_aggregations.py @@ -1,5 +1,4 @@ """Mixin classes with reduction operations.""" - # This file was generated using xarray.util.generate_aggregations. Do not edit manually. 
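For reference, the heart of ``nunique_chunk`` above is a sort-and-compare trick: after sorting along the last axis, each boundary between unequal neighbours marks one new unique value, and since ``NaN != NaN`` every NaN is initially counted as distinct before the correction step. A minimal standalone NumPy sketch of that logic (``nunique_1d`` is an illustrative name, not part of the patch):

    import numpy as np

    def nunique_1d(values, skipna=True, equal_nan=True):
        s = np.sort(values)  # equal values become adjacent; NaNs sort to the end
        # One unique value per boundary between unequal neighbours, plus the first.
        count = int(np.sum(s[:-1] != s[1:])) + 1
        n_nan = int(np.isnan(s).sum())
        if skipna:
            count -= n_nan  # discard NaNs entirely
        elif equal_nan:
            count -= max(n_nan - 1, 0)  # collapse all NaNs into a single value
        return count

    data = np.array([1.0, 2.0, 3.0, 0.0, 2.0, np.nan])
    nunique_1d(data)  # 4
    nunique_1d(data, skipna=False)  # 5
    nunique_1d(data, skipna=False, equal_nan=False)  # 5 (only one NaN present)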
from __future__ import annotations @@ -352,6 +351,10 @@ def mean( :ref:`agg` User guide on reduction or aggregation operations. + Notes + ----- + Non-numeric variables will be removed prior to reducing. datetime64 and timedelta64 dtypes are treated as numeric for aggregation operations. + Examples -------- >>> from xarray.namedarray.core import NamedArray @@ -426,7 +429,7 @@ def prod( Notes ----- - Non-numeric variables will be removed prior to reducing. + Non-numeric variables will be removed prior to reducing. datetime64 and timedelta64 dtypes are treated as numeric for aggregation operations. Examples -------- @@ -509,7 +512,7 @@ def sum( Notes ----- - Non-numeric variables will be removed prior to reducing. + Non-numeric variables will be removed prior to reducing. datetime64 and timedelta64 dtypes are treated as numeric for aggregation operations. Examples -------- @@ -589,7 +592,7 @@ def std( Notes ----- - Non-numeric variables will be removed prior to reducing. + Non-numeric variables will be removed prior to reducing. datetime64 and timedelta64 dtypes are treated as numeric for aggregation operations. Examples -------- @@ -669,7 +672,7 @@ def var( Notes ----- - Non-numeric variables will be removed prior to reducing. + Non-numeric variables will be removed prior to reducing. datetime64 and timedelta64 dtypes are treated as numeric for aggregation operations. Examples -------- @@ -745,7 +748,7 @@ def median( Notes ----- - Non-numeric variables will be removed prior to reducing. + Non-numeric variables will be removed prior to reducing. datetime64 and timedelta64 dtypes are treated as numeric for aggregation operations. Examples -------- @@ -772,6 +775,88 @@ def median( **kwargs, ) + def nunique( + self, + dim: Dims = None, + *, + skipna: bool | None = None, + equal_nan: bool | None = True, + **kwargs: Any, + ) -> Self: + """ + Reduce this NamedArray's data by applying ``nunique`` along some dimension(s). + + Parameters + ---------- + dim : str, Iterable of Hashable, "..." or None, default: None + Name of dimension[s] along which to apply ``nunique``. For e.g. ``dim="x"`` + or ``dim=["x", "y"]``. If "..." or None, will reduce over all dimensions. + skipna : bool or None, optional + If True, skip missing values (as marked by NaN). By default, only + skips missing values for float dtypes; other dtypes either do not + have a sentinel missing value (int) or ``skipna=True`` has not been + implemented (object, datetime64 or timedelta64). + equal_nan : bool or None, default: True + If ``skipna == False``, ``equal_nan`` determines whether null values + are counted as distinct values or not. Set ``equal_nan = True`` for + consistency with ``pandas.DataFrame.nunique``, or ``equal_nan = False`` + for consistency with the `Python array API `_. + **kwargs : Any + Additional keyword arguments passed on to the appropriate array + function for calculating ``nunique`` on this object's data. + These could include dask-specific kwargs like ``split_every``. + + Returns + ------- + reduced : NamedArray + New NamedArray with ``nunique`` applied to its data and the + indicated dimension(s) removed + + See Also + -------- + pandas.DataFrame.nunique + Dataset.nunique + DataArray.nunique + :ref:`agg` + User guide on reduction or aggregation operations. + + Notes + ----- + For dask arrays, there must be a single chunk in each dimension + nunique is being applied over. 
+
+        Examples
+        --------
+        >>> from xarray.namedarray.core import NamedArray
+        >>> na = NamedArray("x", np.array([1, 2, 3, 0, 2, np.nan]))
+        >>> na
+        <xarray.NamedArray (x: 6)> Size: 48B
+        array([ 1.,  2.,  3.,  0.,  2., nan])
+
+        >>> na.nunique()
+        <xarray.NamedArray ()> Size: 8B
+        array(5)
+
+        Use ``skipna`` to control whether NaNs are ignored.
+
+        >>> na.nunique(skipna=False)
+        <xarray.NamedArray ()> Size: 8B
+        array(5)
+
+        Use ``equal_nan`` to control whether NaNs are counted as distinct values.
+
+        >>> na.nunique(skipna=False, equal_nan=False)
+        <xarray.NamedArray ()> Size: 8B
+        array(5)
+        """
+        return self.reduce(
+            duck_array_ops.nunique,
+            dim=dim,
+            skipna=skipna,
+            equal_nan=equal_nan,
+            **kwargs,
+        )
+
     def cumsum(
         self,
         dim: Dims = None,
@@ -815,7 +900,7 @@ def cumsum(
 
         Notes
         -----
-        Non-numeric variables will be removed prior to reducing.
+        Non-numeric variables will be removed prior to reducing. datetime64 and timedelta64 dtypes are treated as numeric for aggregation operations.
 
         Note that the methods on the ``cumulative`` method are more performant
         (with numbagg installed) and better supported. ``cumsum`` and ``cumprod`` may be deprecated
@@ -889,7 +974,7 @@ def cumprod(
 
         Notes
         -----
-        Non-numeric variables will be removed prior to reducing.
+        Non-numeric variables will be removed prior to reducing. datetime64 and timedelta64 dtypes are treated as numeric for aggregation operations.
 
         Note that the methods on the ``cumulative`` method are more performant
         (with numbagg installed) and better supported. ``cumsum`` and ``cumprod`` may be deprecated
diff --git a/xarray/tests/test_dataarray.py b/xarray/tests/test_dataarray.py
index 5eec7b8a2fd..d0bab53330d 100644
--- a/xarray/tests/test_dataarray.py
+++ b/xarray/tests/test_dataarray.py
@@ -4916,6 +4916,45 @@ def line(x, a, b):
 
         assert_allclose(fit.curvefit_coefficients, expected)
 
+    @pytest.mark.parametrize("skipna", [True, False])
+    @pytest.mark.parametrize("dim", ["c", None, ("b", "c")])
+    def test_nunique(self, skipna, dim):
+        x = np.array(
+            [
+                [
+                    [np.nan, np.nan, 2.0, np.nan],
+                    [np.nan, 5.0, 6.0, np.nan],
+                    [8.0, 9.0, 10.0, np.nan],
+                ],
+                [
+                    [np.nan, 13.0, 14.0, 15.0],
+                    [np.nan, 17.0, 18.0, np.nan],
+                    [np.nan, 21.0, np.nan, np.nan],
+                ],
+            ]
+        )
+        coords = {
+            "a": range(x.shape[0]),
+            "b": range(x.shape[1]),
+            "c": range(x.shape[2]),
+        }
+        da = DataArray(x, coords=coords)
+
+        coords_1 = {"a": range(x.shape[0]), "b": range(x.shape[1])}
+        coords_3 = {"a": range(x.shape[0])}
+
+        expected_results = {
+            (True, "c"): DataArray([[1, 2, 3], [3, 2, 1]], coords=coords_1),
+            (True, None): DataArray(12),
+            (True, ("b", "c")): DataArray([6, 6], coords=coords_3),
+            (False, "c"): DataArray([[2, 3, 4], [4, 3, 2]], coords=coords_1),
+            (False, None): DataArray(13),
+            (False, ("b", "c")): DataArray([7, 7], coords=coords_3),
+        }
+
+        result = da.nunique(dim=dim, skipna=skipna)
+        assert_identical(result, expected_results[(skipna, dim)])
+
 
 class TestReduce:
     @pytest.fixture(autouse=True)
diff --git a/xarray/tests/test_dataset.py b/xarray/tests/test_dataset.py
index e677430dfbf..209920b8f0d 100644
--- a/xarray/tests/test_dataset.py
+++ b/xarray/tests/test_dataset.py
@@ -7482,6 +7482,67 @@ def test_query(self, backend, engine, parser) -> None:
 
 
 # pytest tests — new tests should go here, rather than in the class.
+@pytest.mark.parametrize("skipna", [True, False])
+@pytest.mark.parametrize("dim", [("c", "dim_0", "dim_1"), None, ("a", "b")])
+def test_nunique(skipna, dim):
+    # Create test data
+    x = np.array(
+        [
+            [
+                [np.nan, np.nan, 2.0, np.nan],
+                [np.nan, 5.0, 6.0, np.nan],
+                [8.0, 9.0, 10.0, np.nan],
+            ],
+            [
+                [np.nan, 13.0, 14.0, 15.0],
+                [np.nan, 17.0, 18.0, np.nan],
+                [np.nan, 21.0, np.nan, np.nan],
+            ],
+        ]
+    )
+    coords = {"a": range(x.shape[0]), "b": range(x.shape[1]), "c": range(x.shape[2])}
+    da_1 = DataArray(x, coords=coords)
+    da_2 = DataArray(x)
+    ds = Dataset({"da_1": da_1, "da_2": da_2})
+
+    # Specify the coordinates and arrays we expect for each test case
+    coords_1 = {"a": range(x.shape[0]), "b": range(x.shape[1])}
+    coords_3 = {"c": range(x.shape[2])}
+    arr_1 = np.array([[1, 2, 3], [3, 2, 1]])
+    arr_3 = np.array([1, 5, 5, 1])
+    expected_results = {
+        (True, ("c", "dim_0", "dim_1")): (arr_1, coords_1, arr_3, ["dim_2"]),
+        (True, None): (12, None, 12, None),
+        (True, ("a", "b")): (arr_3, coords_3, x, None),
+        (False, ("c", "dim_0", "dim_1")): (arr_1 + 1, coords_1, arr_3 + 1, ["dim_2"]),
+        (False, None): (13, None, 13, None),
+        (False, ("a", "b")): (arr_3 + 1, coords_3, x, None),
+    }
+
+    # Get the expected result for the current parameters
+    expected_result = expected_results[(skipna, dim)]
+    expected_ds = Dataset(
+        {
+            "da_1": DataArray(expected_result[0], coords=expected_result[1]),
+            "da_2": DataArray(expected_result[2], dims=expected_result[3]),
+        }
+    )
+
+    # Get the actual result and compare
+    result = ds.nunique(dim=dim, skipna=skipna)
+    assert_identical(result, expected_ds)
+
+
+@pytest.mark.parametrize("skipna", [True, False])
+def test_nunique_pandas(skipna):
+    get_col = lambda: np.random.randint(0, 100, size=100)
+    get_da = lambda: xr.DataArray(get_col(), coords={"x": np.arange(100)})
+    ds = xr.Dataset({"a": get_da(), "b": get_da(), "c": get_da(), "d": get_da()})
+    xr_result = ds.nunique(skipna=skipna).to_array().values
+    pd_result = ds.to_dataframe().nunique(dropna=skipna).values
+    assert_array_equal(xr_result, pd_result)
+
+
 @pytest.mark.parametrize("parser", ["pandas", "python"])
 def test_eval(ds, parser) -> None:
     """Currently much more minimal testing that `query` above, and much of the setup
diff --git a/xarray/tests/test_datatree.py b/xarray/tests/test_datatree.py
index 0cd888f5782..21e1a6e0435 100644
--- a/xarray/tests/test_datatree.py
+++ b/xarray/tests/test_datatree.py
@@ -2312,6 +2312,18 @@ def test_subtree(self) -> None:
         actual = tree.children["child"].mean()
         assert_identical(expected, actual)
 
+    def test_nunique(self) -> None:
+        arr = np.array([[1, 2, 2], [3, 3, 3]])
+        da = xr.DataArray(arr, coords={"x": [0, 1], "y": [0, 1, 2]})
+        ds = xr.Dataset({"a": da})
+        dt = DataTree.from_dict({"root": ds, "root/child": 2 * ds})
+        expected_da = xr.DataArray(np.array([2, 1]), coords={"x": [0, 1]})
+        expected_ds = xr.Dataset({"a": expected_da})
+        expected_dt = DataTree.from_dict(
+            {"root": expected_ds, "root/child": expected_ds}
+        )
+        assert_identical(expected_dt, dt.nunique(dim="y"))
+
 
 class TestOps:
     def test_unary_op(self) -> None:
diff --git a/xarray/tests/test_duck_array_ops.py b/xarray/tests/test_duck_array_ops.py
index 83c7c2bb207..4390e5d17c8 100644
--- a/xarray/tests/test_duck_array_ops.py
+++ b/xarray/tests/test_duck_array_ops.py
@@ -24,6 +24,7 @@
     least_squares,
     mean,
     np_timedelta64_to_float,
+    nunique,
     pd_timedelta_to_float,
     push,
     py_timedelta_to_float,
@@ -165,6 +166,45 @@ def test_count(self):
 
         assert 1 == count(np.datetime64("2000-01-01"))
 
+    @pytest.mark.parametrize("equal_nan", [True, False])
+    @pytest.mark.parametrize("mixed_type", [True, False])
+    @pytest.mark.parametrize("string_array", [True, False])
+    @pytest.mark.parametrize("skipna", [True, False])
+    @pytest.mark.parametrize("axis", [2, None, (1, 2)])
+    def test_nunique(self, axis, skipna, equal_nan, string_array, mixed_type):
+        expected_results = {
+            (True, True, 2): np.array([[1, 2, 3], [3, 2, 1]]),
+            (True, True, None): np.array(12),
+            (True, True, (1, 2)): np.array([6, 6]),
+            (True, False, 2): np.array([[2, 3, 4], [4, 3, 2]]),
+            (True, False, None): np.array(13),
+            (True, False, (1, 2)): np.array([7, 7]),
+            (False, True, 2): np.array([[1, 2, 3], [3, 2, 1]]),
+            (False, True, None): np.array(12),
+            (False, True, (1, 2)): np.array([6, 6]),
+            (False, False, 2): np.array([[4, 4, 4], [4, 4, 4]]),
+            (False, False, None): np.array(24),
+            (False, False, (1, 2)): np.array([12, 12]),
+        }
+        x = self.x.copy()
+        if string_array:
+            # Convert to str
+            x = x.astype(str)
+            # Convert to object and put nans back in
+            x = x.astype(object)
+            x[x == "nan"] = np.nan
+        if mixed_type:
+            x = x.astype(object)
+            x[(x == 10.0) | (x == "10.0")] = True
+            x[(x == 2.0) | (x == "2.0")] = np.sum
+        # Object arrays currently only supported for np.ndarray
+        if (mixed_type or string_array) and not isinstance(x, np.ndarray):
+            with pytest.raises(NotImplementedError):
+                nunique(x, axis=axis, skipna=skipna, equal_nan=equal_nan)
+            return
+        result = nunique(x, axis=axis, skipna=skipna, equal_nan=equal_nan)
+        assert_array_equal(result, expected_results[(equal_nan, skipna, axis)])
+
     def test_where_type_promotion(self):
         result = where(np.array([True, False]), np.array([1, 2]), np.array(["a", "b"]))
         assert_array_equal(result, np.array([1, "b"], dtype=object))
@@ -263,6 +303,10 @@ def setUp(self):
             chunks=(2, 1, 2),
         )
 
+    def test_nunique_dask_lazy(self):
+        with raise_if_dask_computes():
+            nunique(self.x, axis=0)
+
 
 def test_cumsum_1d():
     inputs = np.array([0, 1, 2, 3])
diff --git a/xarray/util/generate_aggregations.py b/xarray/util/generate_aggregations.py
index e386b96f63d..57e4535dacf 100644
--- a/xarray/util/generate_aggregations.py
+++ b/xarray/util/generate_aggregations.py
@@ -194,6 +194,12 @@ def {method}(
     have a sentinel missing value (int) or ``skipna=True`` has not been
     implemented (object, datetime64 or timedelta64)."""
 
+_EQUAL_NAN_DOCSTRING = """equal_nan : bool or None, default: True
+    If ``skipna == False``, ``equal_nan`` determines whether null values
+    are counted as distinct values or not. Set ``equal_nan = True`` for
+    consistency with ``pandas.DataFrame.nunique``, or ``equal_nan = False``
+    for consistency with the `Python array API <https://data-apis.org/array-api/latest/>`_."""
+
 _MINCOUNT_DOCSTRING = """min_count : int or None, optional
     The required number of valid values to perform the operation. If
     fewer than min_count non-NA values are present the result will be
@@ -226,6 +232,8 @@ def {method}(
 _CUM_NOTES = """Note that the methods on the ``cumulative`` method are more performant
 (with numbagg installed) and better supported. ``cumsum`` and ``cumprod`` may be deprecated
 in the future."""
+_NUNIQUE_NOTES = """For dask arrays, there must be a single chunk in each dimension
+nunique is being applied over."""
 
 
 class ExtraKwarg(NamedTuple):
@@ -239,28 +247,45 @@ class ExtraKwarg(NamedTuple):
     docs=_SKIPNA_DOCSTRING,
     kwarg="skipna: bool | None = None,",
     call="skipna=skipna,",
-    example="""\n
-        Use ``skipna`` to control whether NaNs are ignored.
-
-        >>> {calculation}(skipna=False)""",
+    example=(
+        "\n \n"
+        "        Use ``skipna`` to control whether NaNs are ignored.\n"
+        "        \n"
+        "        >>> {calculation}(skipna=False)"
+    ),
+)
+equal_nan = ExtraKwarg(
+    docs=_EQUAL_NAN_DOCSTRING,
+    kwarg="equal_nan: bool | None = True,",
+    call="equal_nan=equal_nan,",
+    example=(
+        "\n \n"
+        "        Use ``equal_nan`` to control whether NaNs are counted as distinct values.\n"
+        "        \n"
+        "        >>> {calculation}(skipna=False, equal_nan=False)"
+    ),
 )
 min_count = ExtraKwarg(
     docs=_MINCOUNT_DOCSTRING,
     kwarg="min_count: int | None = None,",
     call="min_count=min_count,",
-    example="""\n
-        Specify ``min_count`` for finer control over when NaNs are ignored.
-
-        >>> {calculation}(skipna=True, min_count=2)""",
+    example=(
+        "\n \n"
+        "        Specify ``min_count`` for finer control over when NaNs are ignored.\n"
+        "        \n"
+        "        >>> {calculation}(skipna=True, min_count=2)"
+    ),
 )
 ddof = ExtraKwarg(
     docs=_DDOF_DOCSTRING,
     kwarg="ddof: int = 0,",
     call="ddof=ddof,",
-    example="""\n
-        Specify ``ddof=1`` for an unbiased estimate.
-
-        >>> {calculation}(skipna=True, ddof=1)""",
+    example=(
+        "\n \n"
+        "        Specify ``ddof=1`` for an unbiased estimate.\n"
+        "        \n"
+        "        >>> {calculation}(skipna=True, ddof=1)"
+    ),
 )
@@ -424,11 +449,11 @@ def generate_example(self, method):
         else:
             extra_examples = ""
 
+        blank_line = 8 * " "
         return f"""
        Examples
        --------{created}
-        >>> {self.datastructure.example_var_name}
-
+        >>> {self.datastructure.example_var_name}\n{blank_line}
        >>> {calculation}(){extra_examples}"""
@@ -444,7 +469,12 @@ def generate_code(self, method, has_keep_attrs):
 
     # median isn't enabled yet, because it would break if a single group was present in multiple
     # chunks. The non-flox code path will just rechunk every group to a single chunk and execute the median
-    method_is_not_flox_supported = method.name in ("median", "cumsum", "cumprod")
+    method_is_not_flox_supported = method.name in (
+        "median",
+        "cumsum",
+        "cumprod",
+        "nunique",
+    )
     if method_is_not_flox_supported:
         indent = 12
     else:
@@ -530,6 +560,12 @@ def generate_code(self, method, has_keep_attrs):
     Method(
         "median", extra_kwargs=(skipna,), numeric_only=True, min_flox_version="0.9.2"
     ),
+    Method(
+        "nunique",
+        extra_kwargs=(skipna, equal_nan),
+        see_also_modules=("pandas.DataFrame",),
+        additional_notes=_NUNIQUE_NOTES,
+    ),
     # Cumulatives:
     Method(
        "cumsum",
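
Editor's note on the algorithm: ``nunique_chunk`` in the duck_array_ops hunk counts distinct values by sorting along the last axis and counting adjacent inequalities, then compensates for NaN (which never compares equal to itself, so every NaN looks "distinct" to the sort-based count). The following standalone NumPy sketch mirrors that counting step for float input; the function name ``nunique_1d_blocks`` and the example values are illustrative only and not part of the patch.

import numpy as np


def nunique_1d_blocks(data, skipna=True, equal_nan=True):
    # Sort along the last axis; for float dtypes NaNs sort to the end.
    sorted_data = np.sort(data, axis=-1)
    # Each adjacent inequality marks the start of a new distinct value;
    # the first element always counts, hence the +1.
    counts = np.sum(sorted_data[..., :-1] != sorted_data[..., 1:], axis=-1) + 1
    # NaN != NaN, so k NaNs inflate the count by k. Subtract them off,
    # keeping one NaN as a distinct value when skipna=False and equal_nan=True.
    if skipna or (not skipna and equal_nan):
        na_counts = np.sum(np.isnan(data), axis=-1)
        if not skipna and equal_nan:
            na_counts = np.clip(na_counts - 1, 0, None)
        counts = counts - na_counts
    return counts


x = np.array([[1.0, 2.0, 2.0, np.nan, np.nan]])
print(nunique_1d_blocks(x))                                 # [2]
print(nunique_1d_blocks(x, skipna=False))                   # [3]
print(nunique_1d_blocks(x, skipna=False, equal_nan=False))  # [4]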
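For reviewers trying the feature out, here is a minimal usage sketch of the new user-facing API. It assumes this patch is applied; the expected values follow from the semantics exercised in the tests above.

import numpy as np
import xarray as xr

da = xr.DataArray(
    np.array([[1.0, 2.0, 2.0], [np.nan, np.nan, 3.0]]),
    dims=("x", "y"),
)

# Distinct non-NaN values per row.
print(da.nunique(dim="y", skipna=True).values)  # [2 1]

# With skipna=False, the default equal_nan=True counts NaN once,
# matching pandas.DataFrame.nunique(dropna=False).
print(da.nunique(dim="y", skipna=False).values)  # [2 2]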
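The dask path adds one constraint worth seeing end to end: the guard in duck_array_ops.nunique rejects arrays with more than one chunk along the aggregated axes unless ``allow_rechunk=True`` is passed. A sketch of what that looks like from the user side, assuming the patch and dask are installed:

import dask.array as darr
import numpy as np
import xarray as xr

arr = xr.DataArray(
    darr.from_array(np.arange(12.0).reshape(3, 4), chunks=(3, 2)),
    dims=("x", "y"),
)

# Two chunks along "y": the guard raises at call time, pointing the user
# at a manual rechunk or allow_rechunk=True.
try:
    arr.nunique(dim="y")
except ValueError as err:
    print(err)

# With a single chunk along "y", the reduction stays lazy until compute().
print(arr.chunk({"y": -1}).nunique(dim="y").compute().values)  # [4 4 4]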