diff --git a/pandas/core/sorting.py b/pandas/core/sorting.py
index 930704e6f62f4..ee958b15ee78d 100644
--- a/pandas/core/sorting.py
+++ b/pandas/core/sorting.py
@@ -680,14 +680,48 @@ def compress_group_index(
     space can be huge, so this function compresses it, by computing offsets
     (comp_ids) into the list of unique labels (obs_group_ids).
     """
-    if len(group_index) and np.all(group_index[1:] >= group_index[:-1]):
+    import sys
+
+    # Use numpy-based approach for Python 3.14+ to avoid hashtable issues
+    is_sorted = len(group_index) and np.all(
+        group_index[1:] >= group_index[:-1]
+    )
+    if sys.version_info >= (3, 14) or is_sorted:
         # GH 53806: fast path for sorted group_index
+        # GH 63314: also use for Python 3.14+ due to hashtable behavior changes
+        if len(group_index) == 0:
+            empty_arr = np.array([], dtype=np.int64)
+            return ensure_int64(empty_arr), ensure_int64(empty_arr)
+
+        # Sort if needed.
+        # NOTE(review): with sort=False this branch still returns
+        # obs_group_ids in sorted order, unlike the hashtable path's
+        # first-appearance order — confirm groupby(sort=False) callers.
+        if not is_sorted:  # reuse the check above; avoid a second O(n) scan
+            sorted_idx = np.argsort(group_index, kind="stable")
+            sorted_group_index = group_index[sorted_idx]
+            unsort_idx = np.empty_like(sorted_idx)
+            unsort_idx[sorted_idx] = np.arange(len(sorted_idx))
+        else:
+            sorted_group_index = group_index
+            unsort_idx = None
+
         unique_mask = np.concatenate(
-            [group_index[:1] > -1, group_index[1:] != group_index[:-1]]
+            [
+                sorted_group_index[:1] > -1,
+                sorted_group_index[1:] != sorted_group_index[:-1],
+            ]
         )
-        comp_ids = unique_mask.cumsum()
-        comp_ids -= 1
-        obs_group_ids = group_index[unique_mask]
+        comp_ids_sorted = unique_mask.cumsum() - 1
+        obs_group_ids = sorted_group_index[unique_mask]
+
+        if unsort_idx is not None:
+            comp_ids = comp_ids_sorted[unsort_idx]
+        else:
+            comp_ids = comp_ids_sorted
+
+        if sort and not np.all(obs_group_ids[1:] >= obs_group_ids[:-1]):
+            obs_group_ids, comp_ids = _reorder_by_uniques(obs_group_ids, comp_ids)
     else:
         size_hint = len(group_index)
         table = hashtable.Int64HashTable(size_hint)
diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py
index 5f47fb8f992d0..ecfcb289777cf 100644
--- a/pandas/tests/reshape/test_pivot.py
+++ b/pandas/tests/reshape/test_pivot.py
@@ -2959,3 +2959,42 @@ def test_pivot_empty_dataframe_period_dtype(self, freq):
         )
 
         tm.assert_frame_equal(result, expected)
+
+    def test_pivot_table_large_dataset_no_duplicates(self):
+        # GH 63314: pivot_table with large datasets should not produce
+        # duplicate indices. This test ensures the Python 3.14 fix works.
+        n_indices = 10000
+        metrics = ["apple", "banana", "coconut"]
+
+        data = [
+            {"idx": f"id_{i}", "metric": metric, "value": i * 10 + len(metric)}
+            for i in range(n_indices)
+            for metric in metrics
+        ]
+
+        df = DataFrame(data)
+
+        result = df.pivot_table(
+            index=["idx"],
+            columns="metric",
+            values="value",
+            aggfunc="first",
+        )
+
+        # Verify no duplicate indices in the result
+        n_unique = len(result.index.unique())
+        assert len(result.index) == n_unique, (
+            f"Expected {n_unique} unique indices, got {len(result.index)}"
+        )
+
+        # Verify we have the expected number of rows
+        assert len(result) == n_indices, (
+            f"Expected {n_indices} rows, got {len(result)}"
+        )
+
+        # Verify all expected indices are present
+        expected_indices = {f"id_{i}" for i in range(n_indices)}
+        actual_indices = set(result.index)
+        assert expected_indices == actual_indices, (
+            "Result indices don't match expected indices"
+        )