Skip to content

Commit ce66719

Browse files
committed
# BUG: Fix inconsistent results for pd.api.types.is_string_dtype() with Categorical series vs dtype
## Description

This PR fixes an inconsistency in `pd.api.types.is_string_dtype()` when it is passed a Categorical series directly versus the dtype of that series.

Currently:

```python
import pandas as pd

series = pd.Categorical(['A', 'B', 'C'])
print(f"is_string_dtype(series): {pd.api.types.is_string_dtype(series)}")  # True
print(f"is_string_dtype(series.dtype): {pd.api.types.is_string_dtype(series.dtype)}")  # False
```

The issue is that when a Categorical series is passed, the function correctly checks whether the categories are strings, but when a CategoricalDtype is passed directly, it does not inspect the categories and returns False.

## Fix

The fix adds explicit handling for CategoricalDtype in both cases (series and dtype) to ensure consistent behavior.

## Test Plan

Added a new test, `test_is_string_dtype_categorical`, to `pandas/tests/dtypes/test_common.py`. It verifies that a Categorical series and its dtype give the same result, for both string and non-string categories.

Fixes #62109
1 parent 23aae9f commit ce66719

File tree

2 files changed

+24
-0
lines changed

2 files changed

+24
-0
lines changed

pandas/core/dtypes/common.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -635,6 +635,13 @@ def is_string_dtype(arr_or_dtype) -> bool:
635635
>>> is_string_dtype(pd.Series([1, 2], dtype=object))
636636
False
637637
"""
638+
# Handle Categorical consistently whether passed as array or dtype
639+
if hasattr(arr_or_dtype, "dtype") and isinstance(_get_dtype(arr_or_dtype), CategoricalDtype):
640+
return is_all_strings(arr_or_dtype)
641+
elif isinstance(arr_or_dtype, CategoricalDtype):
642+
# For CategoricalDtype, check if categories are strings
643+
return arr_or_dtype.categories.inferred_type == "string"
644+
638645
if hasattr(arr_or_dtype, "dtype") and _get_dtype(arr_or_dtype).kind == "O":
639646
return is_all_strings(arr_or_dtype)
640647

pandas/tests/dtypes/test_common.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -337,6 +337,23 @@ def test_is_string_dtype_nullable(nullable_string_dtype):
337337
assert com.is_string_dtype(pd.array(["a", "b"], dtype=nullable_string_dtype))
338338

339339

340+
def test_is_string_dtype_categorical():
    # GH#62109 - is_string_dtype must give the same answer for a Categorical
    # and for its dtype: True when the categories are strings, False otherwise.

    # Categories inferred from string values. This has the same dtype as the
    # explicit-categories case below, so it must also be reported as string.
    cat_inferred = pd.Categorical(["A", "B", "C"])
    assert com.is_string_dtype(cat_inferred)
    assert com.is_string_dtype(cat_inferred.dtype)

    # Explicitly supplied string categories
    cat_string = pd.Categorical(["A", "B", "C"], categories=["A", "B", "C"])
    assert com.is_string_dtype(cat_string)
    assert com.is_string_dtype(cat_string.dtype)

    # Non-string categories
    cat_int = pd.Categorical([1, 2, 3], categories=[1, 2, 3])
    assert not com.is_string_dtype(cat_int)
    assert not com.is_string_dtype(cat_int.dtype)
355+
356+
340357
integer_dtypes: list = []
341358

342359

0 commit comments

Comments
 (0)