Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v3.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -1083,6 +1083,7 @@ Reshaping
- Bug in :meth:`DataFrame.join` when a :class:`DataFrame` with a :class:`MultiIndex` would raise an ``AssertionError`` when :attr:`MultiIndex.names` contained ``None``. (:issue:`58721`)
- Bug in :meth:`DataFrame.merge` where merging on a column containing only ``NaN`` values resulted in an out-of-bounds array access (:issue:`59421`)
- Bug in :meth:`DataFrame.unstack` producing incorrect results when ``sort=False`` (:issue:`54987`, :issue:`55516`)
- Bug in :meth:`DataFrame.unstack` raising an error with indexes containing ``NaN`` with ``sort=False`` (:issue:`61221`)
- Bug in :meth:`DataFrame.merge` when merging two :class:`DataFrame` on ``intc`` or ``uintc`` types on Windows (:issue:`60091`, :issue:`58713`)
- Bug in :meth:`DataFrame.pivot_table` incorrectly subaggregating results when called without an ``index`` argument (:issue:`58722`)
- Bug in :meth:`DataFrame.pivot_table` incorrectly ignoring the ``values`` argument when also supplied to the ``index`` or ``columns`` parameters (:issue:`57876`, :issue:`61292`)
Expand Down
36 changes: 28 additions & 8 deletions pandas/core/reshape/reshape.py
Original file line number Diff line number Diff line change
Expand Up @@ -128,8 +128,11 @@ def __init__(

self.level = self.index._get_level_number(level)

# when index includes `nan`, need to lift levels/strides by 1
self.lift = 1 if -1 in self.index.codes[self.level] else 0
# `nan` values have code `-1`, when sorting, we lift to assign them
# at index 0
self.has_nan = -1 in self.index.codes[self.level]
should_lift = self.has_nan and self.sort
self.lift = 1 if should_lift else 0

# Note: the "pop" below alters these in-place.
self.new_index_levels = list(self.index.levels)
Expand All @@ -138,8 +141,16 @@ def __init__(
self.removed_name = self.new_index_names.pop(self.level)
self.removed_level = self.new_index_levels.pop(self.level)
self.removed_level_full = index.levels[self.level]
self.unique_nan_index: int = -1
if not self.sort:
unique_codes = unique(self.index.codes[self.level])
unique_codes: np.ndarray = unique(self.index.codes[self.level])
if self.has_nan:
# drop nan codes, because they are not represented in level
nan_mask = unique_codes == -1

unique_codes = unique_codes[~nan_mask]
self.unique_nan_index = np.flatnonzero(nan_mask)[0]

self.removed_level = self.removed_level.take(unique_codes)
self.removed_level_full = self.removed_level_full.take(unique_codes)

Expand Down Expand Up @@ -210,7 +221,7 @@ def _make_selectors(self) -> None:
ngroups = len(obs_ids)

comp_index = ensure_platform_int(comp_index)
stride = self.index.levshape[self.level] + self.lift
stride = self.index.levshape[self.level] + self.has_nan
self.full_shape = ngroups, stride

selector = self.sorted_labels[-1] + stride * comp_index + self.lift
Expand Down Expand Up @@ -362,13 +373,13 @@ def get_new_values(self, values, fill_value=None):

def get_new_columns(self, value_columns: Index | None):
if value_columns is None:
if self.lift == 0:
if not self.has_nan:
return self.removed_level._rename(name=self.removed_name)

lev = self.removed_level.insert(0, item=self.removed_level._na_value)
return lev.rename(self.removed_name)

stride = len(self.removed_level) + self.lift
stride = len(self.removed_level) + self.has_nan
width = len(value_columns)
propagator = np.repeat(np.arange(width), stride)

Expand Down Expand Up @@ -401,12 +412,21 @@ def _repeater(self) -> np.ndarray:
if len(self.removed_level_full) != len(self.removed_level):
# In this case, we remap the new codes to the original level:
repeater = self.removed_level_full.get_indexer(self.removed_level)
if self.lift:
if self.has_nan:
# insert nan index at first position
repeater = np.insert(repeater, 0, -1)
else:
# Otherwise, we just use each level item exactly once:
stride = len(self.removed_level) + self.lift
stride = len(self.removed_level) + self.has_nan
repeater = np.arange(stride) - self.lift
if self.has_nan and not self.sort:
assert self.unique_nan_index > -1, (
"`unique_nan_index` not properly initialized"
)
# assign -1 where should be nan according to the unique values.
repeater[self.unique_nan_index] = -1
# compensate for the removed index level
repeater[self.unique_nan_index + 1 :] -= 1

return repeater

Expand Down
37 changes: 37 additions & 0 deletions pandas/tests/frame/test_stack_unstack.py
Original file line number Diff line number Diff line change
Expand Up @@ -1386,6 +1386,43 @@ def test_unstack_sort_false(frame_or_series, dtype):
tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize(
"levels2, expected_columns",
[
(
[None, 1, 2, 3],
[("value", np.nan), ("value", 1), ("value", 2), ("value", 3)],
),
(
[1, None, 2, 3],
[("value", 1), ("value", np.nan), ("value", 2), ("value", 3)],
),
(
[1, 2, None, 3],
[("value", 1), ("value", 2), ("value", np.nan), ("value", 3)],
),
(
[1, 2, 3, None],
[("value", 1), ("value", 2), ("value", 3), ("value", np.nan)],
),
],
ids=["nan=first", "nan=second", "nan=third", "nan=last"],
)
def test_unstack_sort_false_nan(levels2, expected_columns):
# GH#61221
levels1 = ["b", "a"]
index = MultiIndex.from_product([levels1, levels2], names=["level1", "level2"])
df = DataFrame({"value": [0, 1, 2, 3, 4, 5, 6, 7]}, index=index)
result = df.unstack(level="level2", sort=False)
expected_data = [[0, 4], [1, 5], [2, 6], [3, 7]]
expected = DataFrame(
dict(zip(expected_columns, expected_data)),
index=Index(["b", "a"], name="level1"),
columns=MultiIndex.from_tuples(expected_columns, names=[None, "level2"]),
)
tm.assert_frame_equal(result, expected)


def test_unstack_fill_frame_object():
# GH12815 Test unstacking with object.
data = Series(["a", "b", "c", "a"], dtype="object")
Expand Down
Loading