diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst
index 9210f1e0082f0..f6e960513765f 100644
--- a/doc/source/whatsnew/v3.0.0.rst
+++ b/doc/source/whatsnew/v3.0.0.rst
@@ -1148,6 +1148,7 @@ Other
 - Bug in :meth:`Series.diff` allowing non-integer values for the ``periods`` argument. (:issue:`56607`)
 - Bug in :meth:`Series.dt` methods in :class:`ArrowDtype` that were returning incorrect values. (:issue:`57355`)
 - Bug in :meth:`Series.isin` raising ``TypeError`` when series is large (>10**6) and ``values`` contains NA (:issue:`60678`)
+- Bug in :meth:`Series.kurt` and :meth:`Series.skew` incorrectly returning zero for low-variance arrays (:issue:`57972`)
 - Bug in :meth:`Series.map` with a ``timestamp[pyarrow]`` dtype or ``duration[pyarrow]`` dtype incorrectly returning all-``NaN`` entries (:issue:`61231`)
 - Bug in :meth:`Series.mode` where an exception was raised when taking the mode with nullable types with no null values in the series. (:issue:`58926`)
 - Bug in :meth:`Series.rank` that doesn't preserve missing values for nullable integers when ``na_option='keep'``. (:issue:`56976`)
diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py
index 58bcc60f9274e..7bcf4371a0bcd 100644
--- a/pandas/core/nanops.py
+++ b/pandas/core/nanops.py
@@ -1273,12 +1273,13 @@ def nanskew(
     m2 = adjusted2.sum(axis, dtype=np.float64)
     m3 = adjusted3.sum(axis, dtype=np.float64)
 
-    # floating point error
-    #
-    # #18044 in _libs/windows.pyx calc_skew follow this behavior
-    # to fix the fperr to treat m2 <1e-14 as zero
-    m2 = _zero_out_fperr(m2)
-    m3 = _zero_out_fperr(m3)
+    # floating point error; see the detailed comment in nankurt
+    max_abs = np.abs(values).max(axis, initial=0.0)
+    eps = np.finfo(m2.dtype).eps
+    constant_tolerance2 = ((eps * max_abs) ** 2) * count
+    constant_tolerance3 = ((eps * max_abs) ** 3) * count
+    m2 = _zero_out_fperr(m2, constant_tolerance2)
+    m3 = _zero_out_fperr(m3, constant_tolerance3)
 
     with np.errstate(invalid="ignore", divide="ignore"):
         result = (count * (count - 1) ** 0.5 / (count - 2)) * (m3 / m2**1.5)
@@ -1361,18 +1362,40 @@ def nankurt(
     m2 = adjusted2.sum(axis, dtype=np.float64)
     m4 = adjusted4.sum(axis, dtype=np.float64)
 
+    # Floating point errors can accumulate in the summations due to rounding.
+    # The check below is similar to the constant-input check in SciPy
+    # https://github.com/scipy/scipy/blob/04d6d9c460b1fed83f2919ecec3d743cfa2e8317/scipy/stats/_stats_py.py#L1429
+    # with a few modifications: we bound with the maximum value instead of the
+    # mean, and we adapt the bound because SciPy works with the mean while we
+    # work with the sum for `m2`. We need an upper bound on the error below
+    # which the data is treated as constant. Let:
+    #   x: true value in data
+    #   y: its floating point representation
+    #   e: relative approximation error (machine epsilon)
+    #   n: number of observations in the array
+    #
+    # Then (see https://en.wikipedia.org/wiki/Machine_epsilon):
+    #   |x - y| / |x| <= e
+    #   (|x - y| / |x|)² <= e²
+    #   Σ (|x - y| / |x|)² <= ne²
+    #
+    # Assuming the floating point error in m2 is dominated by the summation,
+    #   |m2 - m2_computed| / |m2| <= ne²
+    #   |m2 - m2_computed| <= n |m2| e²
+    #
+    # and we use max(x²) to estimate |m2|, giving the tolerance n * (e * max|x|)².
+    max_abs = np.abs(values).max(axis, initial=0.0)
+    eps = np.finfo(m2.dtype).eps
+    constant_tolerance2 = ((eps * max_abs) ** 2) * count
+    constant_tolerance4 = ((eps * max_abs) ** 4) * count
+    m2 = _zero_out_fperr(m2, constant_tolerance2)
+    m4 = _zero_out_fperr(m4, constant_tolerance4)
+
     with np.errstate(invalid="ignore", divide="ignore"):
         adj = 3 * (count - 1) ** 2 / ((count - 2) * (count - 3))
         numerator = count * (count + 1) * (count - 1) * m4
         denominator = (count - 2) * (count - 3) * m2**2
 
-    # floating point error
-    #
-    # #18044 in _libs/windows.pyx calc_kurt follow this behavior
-    # to fix the fperr to treat denom <1e-14 as zero
-    numerator = _zero_out_fperr(numerator)
-    denominator = _zero_out_fperr(denominator)
-
     if not isinstance(denominator, np.ndarray):
         # if ``denom`` is a scalar, check these corner cases first before
         # doing division
@@ -1587,12 +1610,12 @@ def check_below_min_count(
     return False
 
 
-def _zero_out_fperr(arg):
+def _zero_out_fperr(arg, tol: float | np.ndarray):
     # #18044 reference this behavior to fix rolling skew/kurt issue
     if isinstance(arg, np.ndarray):
-        return np.where(np.abs(arg) < 1e-14, 0, arg)
+        return np.where(np.abs(arg) < tol, 0, arg)
     else:
-        return arg.dtype.type(0) if np.abs(arg) < 1e-14 else arg
+        return arg.dtype.type(0) if np.abs(arg) < tol else arg
 
 
 @disallow("M8", "m8")
diff --git a/pandas/tests/test_nanops.py b/pandas/tests/test_nanops.py
index e7ed8e855a762..6788f2056bb9a 100644
--- a/pandas/tests/test_nanops.py
+++ b/pandas/tests/test_nanops.py
@@ -1051,6 +1051,23 @@ def test_nans_skipna(self, samples, actual_skew):
         skew = nanops.nanskew(samples, skipna=True)
         tm.assert_almost_equal(skew, actual_skew)
 
+    @pytest.mark.parametrize(
+        "initial_data, nobs",
+        [
+            ([-2.05191341e-05, -4.10391103e-05], 27),
+            ([-2.05191341e-10, -4.10391103e-10], 27),
+            ([-2.05191341e-05, -4.10391103e-05], 10_000),
+            ([-2.05191341e-10, -4.10391103e-10], 10_000),
+        ],
+    )
+    def test_low_variance(self, initial_data, nobs):
+        st = pytest.importorskip("scipy.stats")
+        data = np.zeros((nobs,), dtype=np.float64)
+        data[: len(initial_data)] = initial_data
+        skew = nanops.nanskew(data)
+        expected = st.skew(data, bias=False)
+        tm.assert_almost_equal(skew, expected)
+
     @property
     def prng(self):
         return np.random.default_rng(2)
@@ -1072,7 +1089,7 @@ def test_constant_series(self, val):
         # xref GH 11974
         data = val * np.ones(300)
         kurt = nanops.nankurt(data)
-        assert kurt == 0.0
+        tm.assert_equal(kurt, 0.0)
 
     def test_all_finite(self):
         alpha, beta = 0.3, 0.1
@@ -1102,6 +1119,24 @@ def test_nans_skipna(self, samples, actual_kurt):
         kurt = nanops.nankurt(samples, skipna=True)
         tm.assert_almost_equal(kurt, actual_kurt)
 
+    @pytest.mark.parametrize(
+        "initial_data, nobs",
+        [
+            ([-2.05191341e-05, -4.10391103e-05], 27),
+            ([-2.05191341e-10, -4.10391103e-10], 27),
+            ([-2.05191341e-05, -4.10391103e-05], 10_000),
+            ([-2.05191341e-10, -4.10391103e-10], 10_000),
+        ],
+    )
+    def test_low_variance(self, initial_data, nobs):
+        # GH#57972
+        st = pytest.importorskip("scipy.stats")
+        data = np.zeros((nobs,), dtype=np.float64)
+        data[: len(initial_data)] = initial_data
+        kurt = nanops.nankurt(data)
+        expected = st.kurtosis(data, bias=False)
+        tm.assert_almost_equal(kurt, expected)
+
     @property
     def prng(self):
         return np.random.default_rng(2)
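
As a quick illustration of why the cutoff needs to depend on the data, here is a standalone NumPy sketch (not pandas code; the names `m2`, `tolerance2`, etc. are only for this example). For one of the low-variance inputs used in the new tests, the old fixed `1e-14` cutoff zeroes out `m2` even though the data is not constant, while the data-dependent bound `n * (eps * max|x|)**2` does not:

# Standalone sketch: compare the old fixed cutoff with the data-dependent
# tolerance on one of the low-variance cases added in test_nanops.py.
import numpy as np

data = np.zeros(27, dtype=np.float64)
data[:2] = [-2.05191341e-10, -4.10391103e-10]

n = data.size
m2 = ((data - data.mean()) ** 2).sum()  # second central moment as a sum, mirroring nanops

eps = np.finfo(np.float64).eps
max_abs = np.abs(data).max()
tolerance2 = ((eps * max_abs) ** 2) * n  # same form as constant_tolerance2 in the patch

print(m2 < 1e-14)       # True  -> old cutoff wrongly treats the data as constant
print(m2 < tolerance2)  # False -> new tolerance keeps the tiny but real variance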