Source code for MetricsReloaded.metrics.calibration_measures

# Copyright (c) Carole Sudre
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#     http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
Calibration measures - :mod:`MetricsReloaded.metrics.calibration_measures`
==========================================================================

This module provides classes for calculating :ref:`calibration
<calibration>` measures.

.. _calibration:

Calculating calibration measures
--------------------------------

.. autoclass:: CalibrationMeasures
    :members:

"""


import numpy as np
from scipy.special import gamma

# from metrics.pairwise_measures import CacheFunctionOutput
from MetricsReloaded.utility.utils import (
    CacheFunctionOutput,
    max_x_at_y_more,
    max_x_at_y_less,
    min_x_at_y_more,
    min_x_at_y_less,
    trapezoidal_integration,
    one_hot_encode,
    median_heuristic
)


__all__ = [
    "CalibrationMeasures",
]


[docs]class CalibrationMeasures(object): def __init__( self, pred_proba, ref, case=None, measures=[], empty=False, dict_args={}, ): self.measures_dict = { "ece": (self.expectation_calibration_error, "ECE"), "bs": (self.brier_score, "BS"), "rbs": (self.root_brier_score, "RBS"), "ls": (self.logarithmic_score, "LS"), "cwece": (self.class_wise_expectation_calibration_error, "cwECE"), "ece_kde": (self.kernel_based_ece, "ECE-KDE"), "kce":(self.kernel_calibration_error, "KCE"), "nll":(self.negative_log_likelihood,"NLL") } self.pred = np.asarray(pred_proba) self.ref = np.asarray(ref) self.n_classes = self.pred.shape[1] self.one_hot_ref = one_hot_encode(ref, self.n_classes) self.case = case self.flag_empty = empty self.dict_args = dict_args self.measures = measures if measures is not None else self.measures_dict
[docs] def class_wise_expectation_calibration_error(self): r""" Class_wise version of the expectation calibration error Ananya Kumar, Percy S Liang, and Tengyu Ma. 2019. Verified uncertainty calibration. Advances in Neural Information Processing Systems 32 (2019). .. math:: cwECE = \dfrac{1}{K}\sum_{k=1}^{K}\sum_{i=1}^{N}\dfrac{\vert B_{i,k} \vert}{N} \left(y_{k}(B_{i,k}) - p_{k}(B_{i,k})\right) """ if "bins_ece" in self.dict_args: nbins = self.dict_args["bins_ece"] else: nbins = 10 step = 1.0 / nbins range_values = np.arange(0, 1.00001, step) list_values = [] numb_samples = self.pred.shape[0] class_pred = np.argmax(self.pred, 1) n_classes = self.pred.shape[1] for k in range(n_classes): list_values_k = [] for (l, u) in zip(range_values[:-1], range_values[1:]): pred_k = self.pred[:, k] ref_tmp = np.where( np.logical_and(pred_k > l, pred_k <= u), self.ref, np.ones_like(self.ref) * -1, ) ref_sel = ref_tmp[ref_tmp > -1] ref_selk = np.where( ref_sel == k, np.ones_like(ref_sel), np.zeros_like(ref_sel) ) nsamples = np.size(ref_sel) prop = np.sum(ref_selk) / nsamples pred_tmp = np.where( np.logical_and(pred_k > l, pred_k <= u), pred_k, np.ones_like(pred_k) * -1, ) pred_sel = pred_tmp[pred_tmp > -1] if nsamples == 0: list_values_k.append(0) else: list_values_k.append(nsamples * np.abs(prop - np.mean(pred_sel))) list_values.append(np.sum(np.asarray(list_values_k)) / numb_samples) cwece = np.sum(np.asarray(list_values)) / n_classes return cwece
[docs] def expectation_calibration_error(self): """ Derives the expectation calibration error in the case of binary task bins_ece is the key in the dictionary for the number of bins to consider Default is 10 """ if "bins_ece" in self.dict_args: nbins = self.dict_args["bins_ece"] else: nbins = 10 step = 1.0 / nbins range_values = np.arange(0, 1.00001, step) list_values = [] numb_samples = 0 pred_prob = self.pred[:,1] for (l, u) in zip(range_values[:-1], range_values[1:]): ref_tmp = np.where( np.logical_and(pred_prob > l, pred_prob <= u), self.ref, np.ones_like(self.ref) * -1, ) ref_sel = ref_tmp[ref_tmp > -1] nsamples = np.size(ref_sel) prop = np.sum(ref_sel) / nsamples pred_tmp = np.where( np.logical_and(pred_prob > l, pred_prob <= u), pred_prob, np.ones_like(pred_prob) * -1, ) pred_sel = pred_tmp[pred_tmp > -1] if nsamples == 0: list_values.append(0) else: list_values.append(nsamples * np.abs(prop - np.mean(pred_sel))) numb_samples += nsamples return np.sum(np.asarray(list_values)) / numb_samples
[docs] def brier_score(self): """ Calculation of the Brier score https://en.wikipedia.org/wiki/Brier_score here considering prediction probabilities as a vector of dimension N samples Glenn W Brier et al. 1950. Verification of forecasts expressed in terms of probability. Monthly weather review 78, 1 (1950), 1–3. :return: brier score (BS) """ bs = np.mean(np.sum(np.square(self.one_hot_ref - self.pred),1)) return bs
[docs] def root_brier_score(self): """ Gruber S. and Buettner F., Better Uncertainty Calibration via Proper Scores for Classification and Beyond, In Proceedings of the 36th International Conference on Neural Information Processing Systems, 2022 """ return np.sqrt(self.brier_score())
[docs] def logarithmic_score(self): """ Calculation of the logarithmic score https://en.wikipedia.org/wiki/Scoring_rule """ eps = 1e-10 log_pred = np.log(self.pred + eps) to_log = self.pred[np.arange(log_pred.shape[0]),self.ref] to_sum = log_pred[np.arange(log_pred.shape[0]),self.ref] ls = np.mean(to_sum) return ls
def distance_ij(self,i,j): pred_i = self.pred[i,:] pred_j = self.pred[j,:] distance = np.sqrt(np.sum(np.square(pred_i - pred_j))) return distance def kernel_calculation(self, i,j): distance = self.distance_ij(i,j) if 'bandwidth_kce' in self.dict_args.keys(): bandwidth = self.dict_args['bandwidth_kce'] else: bandwidth = median_heuristic(self.pred) value = np.exp(-distance/bandwidth) identity = np.ones([self.pred.shape[1], self.pred.shape[1]]) return value * identity
[docs] def kernel_calibration_error(self): """ Based on the paper Widmann, D., Lindsten, F., and Zachariah, D. Calibration tests in multi-class classification: A unifying framework. Advances in Neural Information Processing Systems, 32:12257–12267, 2019. """ one_hot_ref = one_hot_encode(self.ref, self.pred.shape[1]) numb_samples = self.pred.shape[0] sum_tot = 0 for i in range(0,numb_samples): for j in range(0,i): kernel = self.kernel_calculation(i,j) vect_i = one_hot_ref[i,:] - self.pred[i,:] vect_j = one_hot_ref[j,:] - self.pred[j,:] value_ij = np.matmul(vect_i, np.matmul(kernel,vect_j.T)) sum_tot += value_ij multiplicative_factor = np.math.factorial(numb_samples)/ (2 * np.math.factorial(numb_samples-2)) kce = 1/multiplicative_factor * sum_tot return kce
[docs] def top_label_classification_error(self): """ Calculation of the top-label classification error. Assumes pred_proba a matrix K x Numb observations with probability to be in class k for observation i in position (k,i) """ class_max = np.argmax(self.pred, 1) prob_pred_max = np.max(self.pred, 1) nclasses = self.pred.shape[1] numb_samples = self.pred.shape[0] prob = np.zeros([nclasses]) prob_ref_values, prob_ref_counts = np.unique(self.ref, return_counts=True) for k in range(nclasses): idx = np.where(prob_ref_values == k) if len(idx) == 0: prob[k] = 0 else: prob[k] = prob_ref_counts[idx[0]] / numb_samples prob_expected_max = prob[class_max] tce = np.sqrt(np.mean(np.square(prob_expected_max - prob_pred_max))) return tce
[docs] def kernel_based_ece(self): """ Calculates kernel based ECE Teodora Popordanoska, Raphael Sayer, and Matthew B Blaschko. 2022. A Consistent and Differentiable Lp Canonical Calibration Error Estimator. In Advances in Neural Information Processing Systems. :return: ece_kde """ ece_kde = 0 one_hot_ref = one_hot_encode(self.ref, self.pred.shape[1]) nclasses = self.pred.shape[1] numb_samples = self.pred.shape[0] norm_list = [] for j in range(numb_samples): new_list = [] new_vect = np.zeros([nclasses]) for i in range(numb_samples): if j != i: new_dir = self.dirichlet_kernel(j, i) new_list.append(new_dir) ref_tmp = one_hot_ref[i, :] new_add = ref_tmp * new_dir new_vect += new_add norm = np.sum(np.asarray(new_list)) final_vect = new_vect / norm norm_list.append(final_vect - self.pred[j, :]) full_array = np.vstack(norm_list) ece_kde = np.mean(np.sqrt(np.sum(np.square(full_array), 1))) return ece_kde
def gamma_ik(self, i, k): pred_ik = self.pred[i, k] if "bandwidth" in self.dict_args.keys(): h = self.dict_args["bandwidth"] else: h = 0.5 alpha_ik = pred_ik / h + 1 gamma_ik = gamma(alpha_ik) return gamma_ik def dirichlet_kernel(self, j, i): pred_i = self.pred[i, :] pred_j = self.pred[j, :] nclasses = self.pred.shape[1] if "bandwidth" in self.dict_args.keys(): h = self.dict_args["bandwidth"] else: h = 0.5 alpha_i = pred_i / h + 1 numerator = gamma(np.sum(alpha_i)) denominator = np.prod(gamma(alpha_i)) prod = 1 for k in range(nclasses): prod *= np.power(pred_j[k], alpha_i[k] - 1) kernel_value = numerator / denominator * prod return kernel_value
[docs] def negative_log_likelihood(self): """ Derives the negative log-likelihood defined as George Cybenko, Dianne P O’Leary, and Jorma Rissanen. 1998. The Mathematics of Information Coding, Extraction and Distribution. Vol. 107. Springer Science & Business Media. .. math:: -\sum_{i=1}{N} log(p_{i,k} | y_i=k) """ log_pred = np.log(self.pred) numb_samples = self.pred.shape[0] ll = np.sum(log_pred[range(numb_samples), self.ref]) nll = -1 * ll return nll
[docs] def to_dict_meas(self, fmt="{:.4f}"): """Given the selected metrics provides a dictionary with relevant metrics""" result_dict = {} for key in self.measures: result = self.measures_dict[key][0]() #result_dict[key] = fmt.format(result) result_dict[key] = result return result_dict