Skip to content
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v3.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -1083,6 +1083,7 @@ Reshaping
- Bug in :meth:`DataFrame.join` when a :class:`DataFrame` with a :class:`MultiIndex` would raise an ``AssertionError`` when :attr:`MultiIndex.names` contained ``None``. (:issue:`58721`)
- Bug in :meth:`DataFrame.merge` where merging on a column containing only ``NaN`` values resulted in an out-of-bounds array access (:issue:`59421`)
- Bug in :meth:`DataFrame.unstack` producing incorrect results when ``sort=False`` (:issue:`54987`, :issue:`55516`)
- Bug in :meth:`DataFrame.unstack` raising an error with indexes containing ``NaN`` with ``sort=False`` (:issue:`61221`)
- Bug in :meth:`DataFrame.merge` when merging two :class:`DataFrame` on ``intc`` or ``uintc`` types on Windows (:issue:`60091`, :issue:`58713`)
- Bug in :meth:`DataFrame.pivot_table` incorrectly subaggregating results when called without an ``index`` argument (:issue:`58722`)
- Bug in :meth:`DataFrame.pivot_table` incorrectly ignoring the ``values`` argument when also supplied to the ``index`` or ``columns`` parameters (:issue:`57876`, :issue:`61292`)
Expand Down
36 changes: 28 additions & 8 deletions pandas/core/reshape/reshape.py
Original file line number Diff line number Diff line change
Expand Up @@ -128,8 +128,11 @@ def __init__(

self.level = self.index._get_level_number(level)

# when index includes `nan`, need to lift levels/strides by 1
self.lift = 1 if -1 in self.index.codes[self.level] else 0
# `nan` values have code `-1`, when sorting, we lift to assign them
# at index 0
self.has_nan = -1 in self.index.codes[self.level]
should_lift = self.has_nan and self.sort
self.lift = 1 if should_lift else 0

# Note: the "pop" below alters these in-place.
self.new_index_levels = list(self.index.levels)
Expand All @@ -138,8 +141,16 @@ def __init__(
self.removed_name = self.new_index_names.pop(self.level)
self.removed_level = self.new_index_levels.pop(self.level)
self.removed_level_full = index.levels[self.level]
self.unique_nan_index: int = -1
if not self.sort:
unique_codes = unique(self.index.codes[self.level])
unique_codes: np.ndarray = unique(self.index.codes[self.level])
if self.has_nan:
# drop nan codes, because they are not represented in level
nan_mask = unique_codes == -1

unique_codes = unique_codes[~nan_mask]
self.unique_nan_index = np.flatnonzero(nan_mask)[0]

self.removed_level = self.removed_level.take(unique_codes)
self.removed_level_full = self.removed_level_full.take(unique_codes)

Expand Down Expand Up @@ -210,7 +221,7 @@ def _make_selectors(self) -> None:
ngroups = len(obs_ids)

comp_index = ensure_platform_int(comp_index)
stride = self.index.levshape[self.level] + self.lift
stride = self.index.levshape[self.level] + self.has_nan
self.full_shape = ngroups, stride

selector = self.sorted_labels[-1] + stride * comp_index + self.lift
Expand Down Expand Up @@ -362,13 +373,13 @@ def get_new_values(self, values, fill_value=None):

def get_new_columns(self, value_columns: Index | None):
if value_columns is None:
if self.lift == 0:
if not self.has_nan:
return self.removed_level._rename(name=self.removed_name)

lev = self.removed_level.insert(0, item=self.removed_level._na_value)
return lev.rename(self.removed_name)

stride = len(self.removed_level) + self.lift
stride = len(self.removed_level) + self.has_nan
width = len(value_columns)
propagator = np.repeat(np.arange(width), stride)

Expand Down Expand Up @@ -401,12 +412,21 @@ def _repeater(self) -> np.ndarray:
if len(self.removed_level_full) != len(self.removed_level):
# In this case, we remap the new codes to the original level:
repeater = self.removed_level_full.get_indexer(self.removed_level)
if self.lift:
if self.has_nan:
# insert nan index at first position
repeater = np.insert(repeater, 0, -1)
else:
# Otherwise, we just use each level item exactly once:
stride = len(self.removed_level) + self.lift
stride = len(self.removed_level) + self.has_nan
repeater = np.arange(stride) - self.lift
if self.has_nan and not self.sort:
assert self.unique_nan_index > -1, (
"`unique_nan_index` not properly initialized"
)
# assign -1 where should be nan according to the unique values.
repeater[self.unique_nan_index] = -1
# compensate for the removed index level
repeater[self.unique_nan_index + 1 :] -= 1

return repeater

Expand Down
40 changes: 40 additions & 0 deletions pandas/tests/frame/test_stack_unstack.py
Original file line number Diff line number Diff line change
Expand Up @@ -1386,6 +1386,46 @@ def test_unstack_sort_false(frame_or_series, dtype):
tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize(
"levels2, expected_columns, expected_data",
[
(
Index([None, 1, 2, 3]),
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nit: Could you make the Index call in the body of the test since it's the same across all parameters?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So the parametrized argument change would be

- Index([None, 1, 2, 3])
+ [None, 1, 2, 3]

or do you expect something different?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes correct.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

After applying the suggested change I had to alter the expected_columns values, the columns from .unstack() became integers instead of floats.

I also removed expected_data from the parametrization and put it into the test body, as it was the same for all tests.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I am sorry, I ended up not calling Index in the test, that's why it is no longer float. But I can still reproduce the error that I was seeing in the issue with these new tests:

git restore --source=main pandas/core/reshape/reshape.pypytest pandas/tests/frame/test_stack_unstack.py::test_unstack_sort_false_nan
...
FAILED pandas/tests/frame/test_stack_unstack.py::test_unstack_sort_false_nan[nan=first] - IndexError: index 8 is out of bounds for axis 0 with size 8
FAILED pandas/tests/frame/test_stack_unstack.py::test_unstack_sort_false_nan[nan=second] - IndexError: index 8 is out of bounds for axis 0 with size 8
FAILED pandas/tests/frame/test_stack_unstack.py::test_unstack_sort_false_nan[nan=third] - IndexError: index 8 is out of bounds for axis 0 with size 8
FAILED pandas/tests/frame/test_stack_unstack.py::test_unstack_sort_false_nan[nan=last] - IndexError: index 8 is out of bounds for axis 0 with size 8

[("value", np.nan), ("value", 1.0), ("value", 2.0), ("value", 3.0)],
[[0, 4], [1, 5], [2, 6], [3, 7]],
),
(
Index([1, None, 2, 3]),
[("value", 1.0), ("value", np.nan), ("value", 2.0), ("value", 3.0)],
[[0, 4], [1, 5], [2, 6], [3, 7]],
),
(
Index([1, 2, None, 3]),
[("value", 1.0), ("value", 2.0), ("value", np.nan), ("value", 3.0)],
[[0, 4], [1, 5], [2, 6], [3, 7]],
),
(
Index([1, 2, 3, None]),
[("value", 1.0), ("value", 2.0), ("value", 3.0), ("value", np.nan)],
[[0, 4], [1, 5], [2, 6], [3, 7]],
),
],
ids=["nan=first", "nan=second", "nan=third", "nan=last"],
)
def test_unstack_sort_false_nan(levels2, expected_columns, expected_data):
# GH#61221
levels1 = ["b", "a"]
index = MultiIndex.from_product([levels1, levels2], names=["level1", "level2"])
df = DataFrame({"value": [0, 1, 2, 3, 4, 5, 6, 7]}, index=index)
result = df.unstack(level="level2", sort=False)
expected = DataFrame(
dict(zip(expected_columns, expected_data)),
index=Index(["b", "a"], name="level1"),
columns=MultiIndex.from_tuples(expected_columns, names=[None, "level2"]),
)
tm.assert_frame_equal(result, expected)


def test_unstack_fill_frame_object():
# GH12815 Test unstacking with object.
data = Series(["a", "b", "c", "a"], dtype="object")
Expand Down
Loading