2 changes: 2 additions & 0 deletions src/model_diagnostics/scoring/__init__.py
@@ -9,6 +9,7 @@
PoissonDeviance,
SquaredError,
decompose,
    compute_score,
)

__all__ = [
@@ -22,4 +23,5 @@
"PoissonDeviance",
"SquaredError",
"decompose",
"compute_score"
]
271 changes: 270 additions & 1 deletion src/model_diagnostics/scoring/scoring.py
@@ -11,7 +11,7 @@
import numpy.typing as npt
import polars as pl
from scipy import special
-from scipy.stats import expectile
+from scipy.stats import expectile, t
from sklearn.isotonic import IsotonicRegression as IsotonicRegression_skl

from model_diagnostics._utils._array import (
@@ -61,6 +61,21 @@ def __call__(
The average score.
"""
return np.average(self.score_per_obs(y_obs, y_pred), weights=weights)

    def confidence_interval(
        self,
        y_obs: npt.ArrayLike,
        y_pred: npt.ArrayLike,
        confidence_level: float = 0.95,
        weights: Optional[npt.ArrayLike] = None,
    ) -> np.ndarray:
        """Two-sided confidence interval for the mean score.

        Based on the Student t distribution applied to the standard error of
        the mean score, e.g. `confidence_level=0.95`.

        Returns
        -------
        interval : ndarray of shape (2,)
            Lower and upper bound of the confidence interval.
        """
        score_per_obs = self.score_per_obs(y_obs, y_pred)
        avg_score = np.average(score_per_obs, weights=weights)
        df = len(score_per_obs) - 1
        # Squared standard error of the mean score; for unweighted data this
        # equals the sample variance divided by the number of observations.
        squared_std_error = np.average(
            (score_per_obs - avg_score) ** 2, weights=weights
        ) / max(1, df)
        return avg_score + np.asarray(
            t.interval(confidence_level, df=df, scale=np.sqrt(squared_std_error))
        )
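    # Worked example for the method above (illustrative comment, not part of
    # the test suite): for score_per_obs = [1, 1, 0, 1] the mean score is
    # 0.75, the squared standard error is 0.75 / (4 * 3) = 0.0625 and, with
    # df=3, the 95% t-interval is 0.75 ± 3.1824 * 0.25 ≈ [-0.0456, 1.5456].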

@abstractmethod
def score_per_obs(
@@ -935,3 +950,257 @@ def decompose(
df = df.drop("model")

return df


def compute_score(
y_obs: npt.ArrayLike,
y_pred: npt.ArrayLike,
weights: Optional[npt.ArrayLike] = None,
confidence_level: Optional[float] = None,
*,
scoring_function: Callable[..., Any], # TODO: make type hint stricter
functional: Optional[str] = None,
level: Optional[float] = None,
) -> pl.DataFrame:
r"""Additive decomposition of scores.

The score is decomposed as
`score = miscalibration - discrimination + uncertainty`.
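    For the example below, this reads `0.75 = 0.625 - 0.125 + 0.25`.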

Parameters
----------
y_obs : array-like of shape (n_obs)
Observed values of the response variable.
y_pred : array-like of shape (n_obs) or (n_obs, n_models)
Predicted values of the `functional` of interest, e.g. the conditional
expectation of the response, `E(Y|X)`.
    weights : array-like of shape (n_obs) or None
        Case weights.
    confidence_level : float or None
        If not `None`, a two-sided confidence interval for the score at this
        level, e.g. `0.95`, is computed as well. Currently only implemented
        for the mean functional.
scoring_function : callable
A scoring function with signature roughly
`fun(y_obs, y_pred, weights) -> float`.
functional : str or None
The target functional which `y_pred` aims to predict.
If `None`, then it will be inferred from `scoring_function.functional`.
Options are:

- `"mean"`. Argument `level` is neglected.
- `"median"`. Argument `level` is neglected.
- `"expectile"`
- `"quantile"`
level : float or None
Functionals like expectiles and quantiles have a level (often called alpha).
If `None`, then it will be inferred from `scoring_function.level`.

Returns
-------
decomposition : polars.DataFrame
The resulting score decomposition as a dataframe with columns:

- `miscalibration`
- `discrimination`
- `uncertainty`
    - `score`: the average score

    If `confidence_level` is not `None`, there is the additional column:

    - `score_interval`: lower and upper bound of the confidence interval for
      the score

    If `y_pred` contains several predictions, i.e. it is 2-dimensional with
    shape `(n_obs, n_pred)` and `n_pred > 1`, then there is the additional
    column:

    - `model`

Notes
-----
    To be precise, this function returns the decomposition of the score in
    terms of auto-miscalibration, auto-discrimination (or resolution) and
    uncertainty (or entropy), see `[FLM2022]` and references therein.
The key element is to estimate the recalibrated predictions, i.e. \(T(Y|m(X))\) for
the target functional \(T\) and model predictions \(m(X)\).
This is accomplished by isotonic regression, `[Dimitriadis2021]` and
`[Gneiting2021]`.

References
----------
`[FLM2022]`

: T. Fissler, C. Lorentzen, and M. Mayer.
"Model Comparison and Calibration Assessment". (2022)
    [arXiv:2202.12780](https://arxiv.org/abs/2202.12780).

`[Dimitriadis2021]`

: T. Dimitriadis, T. Gneiting, and A. I. Jordan.
"Stable reliability diagrams for probabilistic classifiers". (2021)
[doi:10.1073/pnas.2016191118](https://doi.org/10.1073/pnas.2016191118)

`[Gneiting2021]`

: T. Gneiting and J. Resin.
"Regression Diagnostics meets Forecast Evaluation: Conditional Calibration,
Reliability Diagrams, and Coefficient of Determination". (2021).
[arXiv:2108.03210](https://arxiv.org/abs/2108.03210).

Examples
--------
    >>> compute_score(y_obs=[0, 0, 1, 1], y_pred=[-1, 1, 1, 2],
    ...     scoring_function=SquaredError())
shape: (1, 4)
┌────────────────┬────────────────┬─────────────┬───────┐
│ miscalibration ┆ discrimination ┆ uncertainty ┆ score │
│ --- ┆ --- ┆ --- ┆ --- │
│ f64 ┆ f64 ┆ f64 ┆ f64 │
╞════════════════╪════════════════╪═════════════╪═══════╡
│ 0.625 ┆ 0.125 ┆ 0.25 ┆ 0.75 │
└────────────────┴────────────────┴─────────────┴───────┘
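
    With `confidence_level` set, e.g. `confidence_level=0.95`, the result has
    the additional column `score_interval` with the lower and upper bound of a
    Student t confidence interval for the score (mean functional only).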
"""
if functional is None:
if hasattr(scoring_function, "functional"):
functional = scoring_function.functional
else:
msg = (
"You set functional=None, but scoring_function has no attribute "
"functional."
)
raise ValueError(msg)
if level is None:
level = 0.5
if functional in ("expectile", "quantile"):
if hasattr(scoring_function, "level"):
level = float(scoring_function.level)
else:
msg = (
"You set level=None, but scoring_function has no attribute "
"level."
)
raise ValueError(msg)

allowed_functionals = ("mean", "median", "expectile", "quantile")
if functional not in allowed_functionals:
msg = (
f"The functional must be one of {allowed_functionals}, got "
f"{functional}."
)
raise ValueError(msg)
if functional in ("expectile", "quantile") and (level <= 0 or level >= 1):
msg = f"The level must fulfil 0 < level < 1, got {level}."
raise ValueError(msg)
    if confidence_level is not None and functional != "mean":
        msg = (
            "Confidence intervals for the score are currently implemented "
            "only for the mean functional."
        )
        raise ValueError(msg)

validate_same_first_dimension(y_obs, y_pred)
n_pred = length_of_second_dimension(y_pred)
pred_names, _ = get_sorted_array_names(y_pred)
y_o = np.asarray(y_obs)

if weights is None:
w = None
else:
validate_same_first_dimension(weights, y_o)
w = np.asarray(weights) # needed to satisfy mypy
if w.ndim > 1:
msg = f"The array weights must be 1-dimensional, got weights.ndim={w.ndim}."
raise ValueError(msg)

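    # The marginal prediction is the best constant prediction, i.e. the target
    # functional applied to the unconditional distribution of y_obs.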
if functional == "mean":
iso = IsotonicRegression_skl(y_min=None, y_max=None)
marginal = np.average(y_o, weights=w)
else:
iso = IsotonicRegression(functional=functional, level=level)
if functional == "expectile":
marginal = expectile(y_o, alpha=level, weights=w)
elif functional == "quantile":
marginal = 0.5 * (
quantile_lower(y_o, level=level) + quantile_upper(y_o, level=level)
)

if y_o[0] == marginal == y_o[-1]:
# y_o is constant. We need to check if y_o is allowed as argument to y_pred.
# For instance for the poisson deviance, y_o = 0 is allowed. But 0 is forbidden
# as a prediction.
try:
scoring_function(y_o[0], marginal)
except ValueError as exc:
msg = (
"Your y_obs is constant and lies outside the allowed range of y_pred "
"of your scoring function. Therefore, the score decomposition cannot "
"be applied."
)
raise ValueError(msg) from exc

# The recalibrated versions, further down, could contain min(y_obs) and that could
# be outside of the valid domain, e.g. y_pred = 0 for the Poisson deviance where
# y_obs=0 is allowed. We detect that here:
y_min = np.amin(y_o)
y_min_allowed = True
try:
scoring_function(y_o[:1], np.array([y_min]), None if w is None else w[:1])
except ValueError:
y_min_allowed = False

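    # The score of the best constant (marginal) prediction is the uncertainty
    # (or entropy) term of the decomposition.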
marginal = np.full_like(y_o, fill_value=marginal, dtype=float)
score_marginal = scoring_function(y_o, marginal, w)

df_list = []
for i in range(len(pred_names)):
# Loop over columns of y_pred.
x = y_pred if n_pred == 0 else get_second_dimension(y_pred, i)
iso.fit(x, y_o, sample_weight=w)
recalibrated = np.squeeze(iso.predict(x))
if not y_min_allowed and recalibrated[0] <= y_min:
# Oh dear, this needs quite some extra work:
# First index of value greater than y_min
idx1 = np.argmax(recalibrated > y_min)
val1 = recalibrated[idx1]
# First index of value greater than the value at idx1.
idx2 = np.argmax(recalibrated > val1)
# Note that val1 may already be the largest value of the array => idx2 = 0.
if idx2 == 0:
idx2 = recalibrated.shape[0]
# We merge the first 2 blocks of the isotonic regression as it violates
# our domain requirements.
re2 = recalibrated[:idx2]
w2 = None if w is None else w[:idx2]
if functional == "mean":
recalibrated[:idx2] = np.average(re2, weights=w2)
elif functional == "expectile":
recalibrated[:idx2] = expectile(re2, alpha=level, weights=w2)
elif functional == "quantile":
# Note, no scoring function known that could end up here.
lower = quantile_lower(re2, level=level)
upper = quantile_upper(re2, level=level)
recalibrated[:idx2] = 0.5 * (lower + upper)

        score = scoring_function(y_o, x, w)
        if confidence_level is not None:
            interval = scoring_function.confidence_interval(
                y_o, x, confidence_level, w
            )
try:
score_recalibrated = scoring_function(y_o, recalibrated, w)
except ValueError as exc:
msg = (
"The recalibrated predictions obtained from isotonic regression are "
"very likely outside the allowed range of y_pred of your scoring "
"function. Therefore, the score decomposition cannot be applied."
)
raise ValueError(msg) from exc

df = pl.DataFrame(
{
"model": pred_names[i],
"miscalibration": score - score_recalibrated,
"discrimination": score_marginal - score_recalibrated,
"uncertainty": score_marginal,
"score": score,
}
)
        if confidence_level is not None:
            df = df.with_columns(
                pl.Series("score_interval", [interval])
            )
df_list.append(df)

df = pl.concat(df_list)

# Remove column "model" for a single model.
if n_pred <= 1:
df = df.drop("model")

return df
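
A minimal usage sketch of the API added in this PR (assumes only the names
exported by the updated `__init__.py`; not part of the diff):

import numpy as np

from model_diagnostics.scoring import SquaredError, compute_score

y_obs = np.array([0.0, 0.0, 1.0, 1.0])
y_pred = np.array([-1.0, 1.0, 1.0, 2.0])

# Score decomposition plus a 95% confidence interval for the mean score. The
# resulting DataFrame has the columns miscalibration, discrimination,
# uncertainty, score and score_interval.
print(
    compute_score(
        y_obs, y_pred, confidence_level=0.95, scoring_function=SquaredError()
    )
)

# The interval is also available directly on the scoring function.
lo, hi = SquaredError().confidence_interval(y_obs, y_pred, confidence_level=0.95)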