Source code for flightrisk.eval.metrics

from __future__ import annotations

from collections.abc import Mapping
from dataclasses import dataclass

import numpy as np
import pandas as pd
from sklearn.metrics import (
    average_precision_score,
    brier_score_loss,
    roc_auc_score,
)


@dataclass(frozen=True)
class RiskMetrics:
    """Metrics emitted by the risk track.

    :param auc: Area under the ROC curve.
    :param pr_auc: Average precision (area under the precision-recall curve).
    :param brier: Brier score (lower is better).
    :param ece: Expected calibration error.
    :param decile_lift: Lift in the top decile relative to the overall churn rate.
    """

    auc: float
    pr_auc: float
    brier: float
    ece: float
    decile_lift: float

    def as_dict(self) -> Mapping[str, float]:
        """Return the metrics as a plain ``dict`` for MLflow logging.

        :returns: Mapping from metric name to value.
        """
        return {
            "auc": self.auc,
            "pr_auc": self.pr_auc,
            "brier": self.brier,
            "ece": self.ece,
            "decile_lift": self.decile_lift,
        }

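# Usage sketch (illustrative values only; ``mlflow`` is an assumed import and
# an active MLflow run is assumed, neither being part of this module):
#
#     metrics = RiskMetrics(auc=0.87, pr_auc=0.55, brier=0.11, ece=0.03, decile_lift=3.2)
#     mlflow.log_metrics(dict(metrics.as_dict()))
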
def expected_calibration_error(
    y_true: np.ndarray, y_prob: np.ndarray, *, n_bins: int = 20
) -> float:
    """Compute the expected calibration error with equal-width probability bins.

    :param y_true: Binary ground truth (0/1).
    :param y_prob: Predicted probabilities in ``[0, 1]``.
    :param n_bins: Number of equal-width probability bins.
    :returns: ECE in ``[0, 1]``.
    :raises ValueError: If ``y_true`` and ``y_prob`` differ in length or
        ``n_bins < 1``.
    """
    if n_bins < 1:
        raise ValueError("n_bins must be >= 1")
    if len(y_true) != len(y_prob):
        raise ValueError("y_true and y_prob must share the same length")
    bins = np.linspace(0.0, 1.0, n_bins + 1)
    idx = np.clip(np.digitize(y_prob, bins[1:-1]), 0, n_bins - 1)
    ece = 0.0
    n = len(y_true)
    for b in range(n_bins):
        mask = idx == b
        if not mask.any():
            continue
        weight = mask.sum() / n
        bin_acc = float(y_true[mask].mean())
        bin_conf = float(y_prob[mask].mean())
        ece += weight * abs(bin_acc - bin_conf)
    return float(ece)

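# Worked toy example (hand-checkable; the arrays are illustrative, not from
# any pipeline). With n_bins=2 the bins are [0, 0.5) and [0.5, 1]: bin 0
# holds probs {0.2, 0.4} (confidence 0.3, accuracy 0.5) and bin 1 holds
# {0.6, 0.8} (confidence 0.7, accuracy 1.0), so
# ECE = 0.5 * |0.5 - 0.3| + 0.5 * |1.0 - 0.7| = 0.25.
#
#     y_true = np.array([0, 1, 1, 1])
#     y_prob = np.array([0.2, 0.4, 0.6, 0.8])
#     expected_calibration_error(y_true, y_prob, n_bins=2)  # 0.25
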
def lift_at_decile(y_true: np.ndarray, y_prob: np.ndarray, *, decile: int = 1) -> float:
    """Compute the lift in the top ``decile`` (1 = top 10%) versus the base rate.

    :param y_true: Binary ground truth.
    :param y_prob: Predicted probabilities.
    :param decile: Decile rank, 1 for the top 10%, 2 for the top 20%, etc.
    :returns: Ratio of in-decile churn rate to overall churn rate.
    :raises ValueError: If ``decile`` falls outside ``[1, 10]``.
    """
    if not 1 <= decile <= 10:
        raise ValueError("decile must lie in [1, 10]")
    n = len(y_true)
    cutoff = max(int(np.ceil(n * decile / 10)), 1)
    order = np.argsort(-y_prob, kind="stable")
    top = order[:cutoff]
    base_rate = float(y_true.mean())
    if base_rate <= 0:
        return float("nan")
    return float(y_true[top].mean()) / base_rate

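# Toy example (illustrative arrays, assumed rather than taken from the
# module). With ten cases, decile=1 keeps the single highest-scored case;
# it is a true churner, and the overall churn rate is 2/10, so the lift is
# 1.0 / 0.2 = 5.0.
#
#     y_true = np.array([1, 0, 1, 0, 0, 0, 0, 0, 0, 0])
#     y_prob = np.array([0.9, 0.8, 0.7, 0.6, 0.5, 0.4, 0.3, 0.2, 0.1, 0.05])
#     lift_at_decile(y_true, y_prob, decile=1)  # 5.0
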
def risk_metrics(
    y_true: np.ndarray, y_prob: np.ndarray, *, n_calibration_bins: int = 20
) -> RiskMetrics:
    """Compute the standard suite of risk metrics in one call.

    :param y_true: Binary ground truth.
    :param y_prob: Predicted probabilities.
    :param n_calibration_bins: Number of bins for ECE.
    :returns: A :class:`RiskMetrics` bundle.
    """
    y_true_arr = np.asarray(y_true).astype(int)
    y_prob_arr = np.asarray(y_prob).astype(float)
    return RiskMetrics(
        auc=float(roc_auc_score(y_true_arr, y_prob_arr)),
        pr_auc=float(average_precision_score(y_true_arr, y_prob_arr)),
        brier=float(brier_score_loss(y_true_arr, y_prob_arr)),
        ece=expected_calibration_error(y_true_arr, y_prob_arr, n_bins=n_calibration_bins),
        decile_lift=lift_at_decile(y_true_arr, y_prob_arr, decile=1),
    )

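# Typical call (sketch; ``y_valid`` and ``p_valid`` stand in for a held-out
# split and are assumed names, not defined in this module):
#
#     bundle = risk_metrics(y_valid, p_valid, n_calibration_bins=10)
#     print(bundle.auc, bundle.decile_lift)
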
def calibration_table(y_true: np.ndarray, y_prob: np.ndarray, *, n_bins: int = 20) -> pd.DataFrame:
    """Return a per-bin calibration table for plotting reliability diagrams.

    :param y_true: Binary ground truth.
    :param y_prob: Predicted probabilities.
    :param n_bins: Number of equal-width bins.
    :returns: Frame with columns ``bin``, ``count``, ``mean_predicted``,
        ``empirical_rate``; empty bins are omitted.
    """
    bins = np.linspace(0.0, 1.0, n_bins + 1)
    idx = np.clip(np.digitize(y_prob, bins[1:-1]), 0, n_bins - 1)
    rows = []
    for b in range(n_bins):
        mask = idx == b
        if not mask.any():
            continue
        rows.append(
            {
                "bin": b,
                "count": int(mask.sum()),
                "mean_predicted": float(np.mean(y_prob[mask])),
                "empirical_rate": float(np.mean(y_true[mask])),
            }
        )
    return pd.DataFrame(rows)

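# Plotting sketch (matplotlib via pandas is an assumption here; the module
# itself only builds the frame). Because empty bins are skipped, the frame
# may hold fewer than ``n_bins`` rows.
#
#     table = calibration_table(y_true, y_prob, n_bins=10)
#     ax = table.plot(x="mean_predicted", y="empirical_rate", marker="o", legend=False)
#     ax.plot([0, 1], [0, 1], linestyle="--")  # perfect-calibration reference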