Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v3.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -611,6 +611,7 @@ Other Deprecations
- Deprecated :meth:`Timestamp.utcfromtimestamp`, use ``Timestamp.fromtimestamp(ts, "UTC")`` instead (:issue:`56680`)
- Deprecated :meth:`Timestamp.utcnow`, use ``Timestamp.now("UTC")`` instead (:issue:`56680`)
- Deprecated ``pd.core.internals.api.maybe_infer_ndim`` (:issue:`40226`)
- Deprecated allowing constructing or casting to :class:`Categorical` with non-NA values that are not present in the specified ``dtype.categories`` (:issue:`40996`)
- Deprecated allowing non-keyword arguments in :meth:`DataFrame.all`, :meth:`DataFrame.min`, :meth:`DataFrame.max`, :meth:`DataFrame.sum`, :meth:`DataFrame.prod`, :meth:`DataFrame.mean`, :meth:`DataFrame.median`, :meth:`DataFrame.sem`, :meth:`DataFrame.var`, :meth:`DataFrame.std`, :meth:`DataFrame.skew`, :meth:`DataFrame.kurt`, :meth:`Series.all`, :meth:`Series.min`, :meth:`Series.max`, :meth:`Series.sum`, :meth:`Series.prod`, :meth:`Series.mean`, :meth:`Series.median`, :meth:`Series.sem`, :meth:`Series.var`, :meth:`Series.std`, :meth:`Series.skew`, and :meth:`Series.kurt`. (:issue:`57087`)
- Deprecated allowing non-keyword arguments in :meth:`Series.to_markdown` except ``buf``. (:issue:`57280`)
- Deprecated allowing non-keyword arguments in :meth:`Series.to_string` except ``buf``. (:issue:`57280`)
Expand Down
66 changes: 57 additions & 9 deletions pandas/core/arrays/categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
cast,
overload,
)
import warnings

import numpy as np

Expand All @@ -23,6 +24,7 @@
)
from pandas._libs.arrays import NDArrayBacked
from pandas.compat.numpy import function as nv
from pandas.util._exceptions import find_stack_level
from pandas.util._validators import validate_bool_kwarg

from pandas.core.dtypes.cast import (
Expand Down Expand Up @@ -479,7 +481,11 @@ def __init__(
elif isinstance(values.dtype, CategoricalDtype):
old_codes = extract_array(values)._codes
codes = recode_for_categories(
old_codes, values.dtype.categories, dtype.categories, copy=copy
old_codes,
values.dtype.categories,
dtype.categories,
copy=copy,
warn=True,
)

else:
Expand Down Expand Up @@ -535,7 +541,13 @@ def _from_scalars(cls, scalars, *, dtype: DtypeObj) -> Self:
# The _from_scalars strictness doesn't make much sense in this case.
raise NotImplementedError

res = cls._from_sequence(scalars, dtype=dtype)
with warnings.catch_warnings():
warnings.filterwarnings(
"ignore",
"Constructing a Categorical with a dtype and values",
FutureWarning,
)
res = cls._from_sequence(scalars, dtype=dtype)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What happens here once the deprecation is enforced?


# if there are any non-category elements in scalars, these will be
# converted to NAs in res.
Expand Down Expand Up @@ -576,6 +588,15 @@ def astype(self, dtype: AstypeArg, copy: bool = True) -> ArrayLike:
dtype = self.dtype.update_dtype(dtype)
self = self.copy() if copy else self
result = self._set_dtype(dtype, copy=False)
wrong = result.isna() & ~self.isna()
if wrong.any():
warnings.warn(
"Constructing a Categorical with a dtype and values containing "
"non-null entries not in that dtype's categories is deprecated "
"and will raise in a future version.",
FutureWarning,
stacklevel=find_stack_level(),
)

elif isinstance(dtype, ExtensionDtype):
return super().astype(dtype, copy=copy)
Expand Down Expand Up @@ -670,14 +691,16 @@ def _from_inferred_categories(
if known_categories:
# Recode from observation order to dtype.categories order.
categories = dtype.categories
codes = recode_for_categories(inferred_codes, cats, categories, copy=False)
codes = recode_for_categories(
inferred_codes, cats, categories, copy=False, warn=True
)
elif not cats.is_monotonic_increasing:
# Sort categories and recode for unknown categories.
unsorted = cats.copy()
categories = cats.sort_values()

codes = recode_for_categories(
inferred_codes, unsorted, categories, copy=False
inferred_codes, unsorted, categories, copy=False, warn=True
)
dtype = CategoricalDtype(categories, ordered=False)
else:
Expand Down Expand Up @@ -1156,7 +1179,7 @@ def set_categories(
codes = cat._codes
else:
codes = recode_for_categories(
cat.codes, cat.categories, new_dtype.categories, copy=False
cat.codes, cat.categories, new_dtype.categories, copy=False, warn=False
)
NDArrayBacked.__init__(cat, codes, new_dtype)
return cat
Expand Down Expand Up @@ -3004,11 +3027,25 @@ def _get_codes_for_values(
If `values` is known to be a Categorical, use recode_for_categories instead.
"""
codes = categories.get_indexer_for(values)
wrong = (codes == -1) & ~isna(values)
if wrong.any():
warnings.warn(
"Constructing a Categorical with a dtype and values containing "
"non-null entries not in that dtype's categories is deprecated "
"and will raise in a future version.",
FutureWarning,
stacklevel=find_stack_level(),
)
return coerce_indexer_dtype(codes, categories)


def recode_for_categories(
codes: np.ndarray, old_categories, new_categories, *, copy: bool
codes: np.ndarray,
old_categories,
new_categories,
*,
copy: bool = True,
warn: bool = False,
) -> np.ndarray:
"""
Convert a set of codes to a new set of categories
Expand All @@ -3019,6 +3056,8 @@ def recode_for_categories(
old_categories, new_categories : Index
copy: bool, default True
Whether to copy if the codes are unchanged.
warn : bool, default False
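Whether to warn when any of old_categories is missing from new_categories, since codes pointing at such categories are silently mapped to -1 (NA).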

Returns
-------
Expand All @@ -3043,9 +3082,18 @@ def recode_for_categories(
return codes.copy()
return codes

indexer = coerce_indexer_dtype(
new_categories.get_indexer_for(old_categories), new_categories
)
codes_in_old_cats = new_categories.get_indexer_for(old_categories)
if warn:
wrong = codes_in_old_cats == -1
if wrong.any():
warnings.warn(
"Constructing a Categorical with a dtype and values containing "
"non-null entries not in that dtype's categories is deprecated "
"and will raise in a future version.",
FutureWarning,
stacklevel=find_stack_level(),
)
indexer = coerce_indexer_dtype(codes_in_old_cats, new_categories)
new_codes = take_nd(indexer, codes, fill_value=-1)
return new_codes

Expand Down
2 changes: 1 addition & 1 deletion pandas/core/groupby/ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -721,7 +721,7 @@ def groups(self) -> dict[Hashable, Index]:
return self.groupings[0].groups
result_index, ids = self.result_index_and_ids
values = result_index._values
categories = Categorical(ids, categories=range(len(result_index)))
categories = Categorical.from_codes(ids, categories=range(len(result_index)))
result = {
# mypy is not aware that group has to be an integer
values[group]: self.axis.take(axis_ilocs) # type: ignore[call-overload]
Expand Down
6 changes: 6 additions & 0 deletions pandas/core/indexes/category.py
Original file line number Diff line number Diff line change
Expand Up @@ -258,6 +258,12 @@ def _is_dtype_compat(self, other: Index) -> Categorical:
else:
values = other

codes = self.categories.get_indexer(values)
if ((codes == -1) & ~values.isna()).any():
# GH#37667 see test_equals_non_category
raise TypeError(
"categories must match existing categories when appending"
)
cat = Categorical(other, dtype=self.dtype)
other = CategoricalIndex(cat)
if not other.isin(values).all():
Expand Down
12 changes: 10 additions & 2 deletions pandas/tests/arrays/categorical/test_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -289,8 +289,16 @@ def test_set_categories(self):
],
)
def test_set_categories_many(self, values, categories, new_categories, ordered):
c = Categorical(values, categories)
expected = Categorical(values, new_categories, ordered)
msg = "Constructing a Categorical with a dtype and values containing"

warn1 = FutureWarning if set(values).difference(categories) else None
with tm.assert_produces_warning(warn1, match=msg):
c = Categorical(values, categories)

warn2 = FutureWarning if set(values).difference(new_categories) else None
with tm.assert_produces_warning(warn2, match=msg):
expected = Categorical(values, new_categories, ordered)

result = c.set_categories(new_categories, ordered=ordered)
tm.assert_categorical_equal(result, expected)

Expand Down
7 changes: 5 additions & 2 deletions pandas/tests/arrays/categorical/test_astype.py
Original file line number Diff line number Diff line change
Expand Up @@ -121,8 +121,11 @@ def test_astype_category(self, dtype_ordered, ordered):

# non-standard categories
dtype = CategoricalDtype(list("adc"), dtype_ordered)
result = cat.astype(dtype)
expected = Categorical(data, dtype=dtype)
msg = "Constructing a Categorical with a dtype and values containing"
with tm.assert_produces_warning(FutureWarning, match=msg):
result = cat.astype(dtype)
with tm.assert_produces_warning(FutureWarning, match=msg):
expected = Categorical(data, dtype=dtype)
tm.assert_categorical_equal(result, expected)

if dtype_ordered is False:
Expand Down
65 changes: 44 additions & 21 deletions pandas/tests/arrays/categorical/test_constructors.py
Original file line number Diff line number Diff line change
Expand Up @@ -228,14 +228,15 @@ def test_constructor(self):
# two arrays
# - when the first is an integer dtype and the second is not
# - when the resulting codes are all -1/NaN
with tm.assert_produces_warning(None):
msg = "Constructing a Categorical with a dtype and values containing"
with tm.assert_produces_warning(FutureWarning, match=msg):
Categorical([0, 1, 2, 0, 1, 2], categories=["a", "b", "c"])

with tm.assert_produces_warning(None):
with tm.assert_produces_warning(FutureWarning, match=msg):
Categorical([0, 1, 2, 0, 1, 2], categories=[3, 4, 5])

# the next ones are from the old docs
with tm.assert_produces_warning(None):
with tm.assert_produces_warning(FutureWarning, match=msg):
Categorical([0, 1, 2, 0, 1, 2], [1, 2, 3])
cat = Categorical([1, 2], categories=[1, 2, 3])

Expand All @@ -247,12 +248,16 @@ def test_constructor_with_existing_categories(self):
# GH25318: constructing with pd.Series used to bogusly skip recoding
# categories
c0 = Categorical(["a", "b", "c", "a"])
c1 = Categorical(["a", "b", "c", "a"], categories=["b", "c"])
msg = "Constructing a Categorical with a dtype and values containing"
with tm.assert_produces_warning(FutureWarning, match=msg):
c1 = Categorical(["a", "b", "c", "a"], categories=["b", "c"])

c2 = Categorical(c0, categories=c1.categories)
with tm.assert_produces_warning(FutureWarning, match=msg):
c2 = Categorical(c0, categories=c1.categories)
tm.assert_categorical_equal(c1, c2)

c3 = Categorical(Series(c0), categories=c1.categories)
with tm.assert_produces_warning(FutureWarning, match=msg):
c3 = Categorical(Series(c0), categories=c1.categories)
tm.assert_categorical_equal(c1, c3)

def test_constructor_not_sequence(self):
Expand Down Expand Up @@ -430,10 +435,13 @@ def test_constructor_dtype_and_others_raises(self):

@pytest.mark.parametrize("categories", [None, ["a", "b"], ["a", "c"]])
def test_constructor_str_category(self, categories, ordered):
result = Categorical(
["a", "b"], categories=categories, ordered=ordered, dtype="category"
)
expected = Categorical(["a", "b"], categories=categories, ordered=ordered)
warn = FutureWarning if categories == ["a", "c"] else None
msg = "Constructing a Categorical with a dtype and values containing"
with tm.assert_produces_warning(warn, match=msg):
result = Categorical(
["a", "b"], categories=categories, ordered=ordered, dtype="category"
)
expected = Categorical(["a", "b"], categories=categories, ordered=ordered)
tm.assert_categorical_equal(result, expected)

def test_constructor_str_unknown(self):
Expand All @@ -450,10 +458,12 @@ def test_constructor_np_strs(self):
def test_constructor_from_categorical_with_dtype(self):
dtype = CategoricalDtype(["a", "b", "c"], ordered=True)
values = Categorical(["a", "b", "d"])
result = Categorical(values, dtype=dtype)
msg = "Constructing a Categorical with a dtype and values containing"
with tm.assert_produces_warning(FutureWarning, match=msg):
result = Categorical(values, dtype=dtype)
# We use dtype.categories, not values.categories
expected = Categorical(
["a", "b", "d"], categories=["a", "b", "c"], ordered=True
["a", "b", None], categories=["a", "b", "c"], ordered=True
)
tm.assert_categorical_equal(result, expected)

Expand All @@ -470,16 +480,19 @@ def test_constructor_from_categorical_with_unknown_dtype(self):
def test_constructor_from_categorical_string(self):
values = Categorical(["a", "b", "d"])
# use categories, ordered
result = Categorical(
values, categories=["a", "b", "c"], ordered=True, dtype="category"
)
msg = "Constructing a Categorical with a dtype and values containing"
with tm.assert_produces_warning(FutureWarning, match=msg):
result = Categorical(
values, categories=["a", "b", "c"], ordered=True, dtype="category"
)
expected = Categorical(
["a", "b", "d"], categories=["a", "b", "c"], ordered=True
["a", "b", None], categories=["a", "b", "c"], ordered=True
)
tm.assert_categorical_equal(result, expected)

# No string
result = Categorical(values, categories=["a", "b", "c"], ordered=True)
with tm.assert_produces_warning(FutureWarning, match=msg):
result = Categorical(values, categories=["a", "b", "c"], ordered=True)
tm.assert_categorical_equal(result, expected)

def test_constructor_with_categorical_categories(self):
Expand Down Expand Up @@ -661,17 +674,25 @@ def test_from_inferred_categories_dtype(self):
cats = ["a", "b", "d"]
codes = np.array([0, 1, 0, 2], dtype="i8")
dtype = CategoricalDtype(["c", "b", "a"], ordered=True)
result = Categorical._from_inferred_categories(cats, codes, dtype)
msg = "Constructing a Categorical with a dtype and values containing"
with tm.assert_produces_warning(
FutureWarning, match=msg, check_stacklevel=False
):
result = Categorical._from_inferred_categories(cats, codes, dtype)
expected = Categorical(
["a", "b", "a", "d"], categories=["c", "b", "a"], ordered=True
["a", "b", "a", None], categories=["c", "b", "a"], ordered=True
)
tm.assert_categorical_equal(result, expected)

def test_from_inferred_categories_coerces(self):
cats = ["1", "2", "bad"]
codes = np.array([0, 0, 1, 2], dtype="i8")
dtype = CategoricalDtype([1, 2])
result = Categorical._from_inferred_categories(cats, codes, dtype)
msg = "Constructing a Categorical with a dtype and values containing"
with tm.assert_produces_warning(
FutureWarning, match=msg, check_stacklevel=False
):
result = Categorical._from_inferred_categories(cats, codes, dtype)
expected = Categorical([1, 1, 2, np.nan])
tm.assert_categorical_equal(result, expected)

Expand Down Expand Up @@ -722,7 +743,9 @@ def test_interval(self):

# extra
values = pd.interval_range(8, 11, periods=3)
cat = Categorical(values, categories=idx)
msg = "Constructing a Categorical with a dtype and values containing"
with tm.assert_produces_warning(FutureWarning, match=msg):
cat = Categorical(values, categories=idx)
expected_codes = np.array([8, 9, -1], dtype="int8")
tm.assert_numpy_array_equal(cat.codes, expected_codes)
tm.assert_index_equal(cat.categories, idx)
Expand Down
Loading
Loading