# Copyright (c) Carole Sudre
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Calibration measures - :mod:`MetricsReloaded.metrics.calibration_measures`
==========================================================================

This module provides classes for calculating :ref:`calibration
<calibration>` measures.

.. _calibration:

Calculating calibration measures
--------------------------------

.. autoclass:: CalibrationMeasures
    :members:
"""
import numpy as np
from scipy.special import gamma
# from metrics.pairwise_measures import CacheFunctionOutput
from MetricsReloaded.utility.utils import (
CacheFunctionOutput,
max_x_at_y_more,
max_x_at_y_less,
min_x_at_y_more,
min_x_at_y_less,
trapezoidal_integration,
one_hot_encode,
median_heuristic
)
# Public API of this module: only the measure class is exported by
# ``from ... import *``.
__all__ = [
"CalibrationMeasures",
]
class CalibrationMeasures:
    """
    Calibration measures computed from predicted probabilities and reference labels.

    :param pred_proba: array-like of predicted probabilities, indexed as
        ``[sample, class]`` (``shape[1]`` is taken as the number of classes)
    :param ref: array-like of reference labels, used as integer class indices
        into the columns of ``pred_proba``
    :param case: identifier stored on the instance; not read by any measure here
    :param measures: iterable of keys of ``measures_dict`` to evaluate in
        :meth:`to_dict_meas`; ``None`` selects every registered measure
    :param empty: flag stored as ``flag_empty``; not read by any measure here
    :param dict_args: optional dictionary of settings. Keys read by this class:
        ``bins_ece`` (number of bins for ECE and cwECE, default 10),
        ``bandwidth_kce`` (kernel bandwidth for KCE, default: median heuristic
        on the predictions) and ``bandwidth`` (Dirichlet kernel bandwidth for
        ECE-KDE, default 0.5)
    """

    def __init__(
        self,
        pred_proba,
        ref,
        case=None,
        measures=(),
        empty=False,
        dict_args=None,
    ):
        self.measures_dict = {
            "ece": (self.expectation_calibration_error, "ECE"),
            "bs": (self.brier_score, "BS"),
            "rbs": (self.root_brier_score, "RBS"),
            "ls": (self.logarithmic_score, "LS"),
            "cwece": (self.class_wise_expectation_calibration_error, "cwECE"),
            "ece_kde": (self.kernel_based_ece, "ECE-KDE"),
            "kce": (self.kernel_calibration_error, "KCE"),
            "nll": (self.negative_log_likelihood, "NLL"),
        }
        self.pred = np.asarray(pred_proba)
        self.ref = np.asarray(ref)
        self.n_classes = self.pred.shape[1]
        self.one_hot_ref = one_hot_encode(self.ref, self.n_classes)
        self.case = case
        self.flag_empty = empty
        # Immutable/None defaults in the signature avoid sharing one mutable
        # object between every instance created without these arguments.
        self.dict_args = {} if dict_args is None else dict_args
        if measures is None:
            self.measures = list(self.measures_dict)
        else:
            self.measures = list(measures)

    def _bin_edges(self):
        """
        Return the equally spaced bin edges on [0, 1] used by ECE and cwECE.

        The number of bins is read from ``dict_args["bins_ece"]`` (default 10).

        :return: array of ``nbins + 1`` edges, from exactly 0 to exactly 1
        :raises ValueError: if the requested number of bins is smaller than 1
        """
        nbins = int(self.dict_args.get("bins_ece", 10))
        if nbins < 1:
            raise ValueError("bins_ece must be at least 1")
        # linspace pins both end points, unlike a floating-point arange.
        return np.linspace(0.0, 1.0, nbins + 1)

    @staticmethod
    def _binned_gap(prob, hits, edges):
        """
        Accumulate ``|B| * |mean(hits in B) - mean(prob in B)|`` over all bins B.

        :param prob: 1-D array of probabilities that defines the binning
        :param hits: 1-D array of the same length whose bin-wise mean is the
            observed frequency
        :param edges: increasing array of bin edges
        :return: tuple ``(weighted_gap, n_binned)`` with the accumulated gap and
            the number of samples that fell into a bin
        """
        weighted_gap = 0.0
        n_binned = 0
        for idx, (low, high) in enumerate(zip(edges[:-1], edges[1:])):
            if idx == 0:
                # The first bin is closed on the left so that probabilities
                # exactly equal to 0 are not silently discarded.
                in_bin = np.logical_and(prob >= low, prob <= high)
            else:
                in_bin = np.logical_and(prob > low, prob <= high)
            count = int(np.count_nonzero(in_bin))
            if count == 0:
                # Empty bins contribute nothing; skipping them also avoids a
                # division by zero when averaging.
                continue
            observed = np.mean(hits[in_bin])
            confidence = np.mean(prob[in_bin])
            weighted_gap += count * np.abs(observed - confidence)
            n_binned += count
        return weighted_gap, n_binned

    def class_wise_expectation_calibration_error(self):
        r"""
        Class-wise version of the expectation calibration error

        Ananya Kumar, Percy S Liang, and Tengyu Ma. 2019. Verified uncertainty
        calibration. Advances in Neural Information Processing Systems 32 (2019).

        .. math::

            cwECE = \dfrac{1}{K}\sum_{k=1}^{K}\sum_{i=1}^{B}\dfrac{\vert B_{i,k} \vert}{N} \left\vert y_{k}(B_{i,k}) - p_{k}(B_{i,k})\right\vert

        with K classes, B bins and N samples. ``bins_ece`` is the key in
        ``dict_args`` for the number of bins (default 10).

        :return: cwECE (``nan`` when there are no samples)
        """
        edges = self._bin_edges()
        numb_samples = self.pred.shape[0]
        n_classes = self.pred.shape[1]
        if numb_samples == 0:
            return np.nan
        per_class = []
        for k in range(n_classes):
            # For class k the "hit" indicator is whether the reference is k.
            weighted_gap, _ = self._binned_gap(self.pred[:, k], self.ref == k, edges)
            per_class.append(weighted_gap / numb_samples)
        return np.sum(np.asarray(per_class)) / n_classes

    def expectation_calibration_error(self):
        """
        Derives the expectation calibration error in the case of binary task

        Column 1 of the predictions is binned and, in each bin, the mean
        reference label is compared with the mean predicted probability.
        ``bins_ece`` is the key in ``dict_args`` for the number of bins
        (default 10).

        :return: ECE (``nan`` when no sample falls into any bin)
        """
        prob_positive = self.pred[:, 1]
        weighted_gap, n_binned = self._binned_gap(
            prob_positive, self.ref, self._bin_edges()
        )
        if n_binned == 0:
            return np.nan
        return weighted_gap / n_binned

    def brier_score(self):
        """
        Calculation of the Brier score https://en.wikipedia.org/wiki/Brier_score

        Squared differences between the one-hot reference and the predicted
        probabilities are summed over classes and averaged over samples.

        Glenn W Brier et al. 1950. Verification of forecasts expressed in terms
        of probability. Monthly weather review 78, 1 (1950), 1–3.

        :return: brier score (BS)
        """
        squared_error = np.square(self.one_hot_ref - self.pred)
        return np.mean(np.sum(squared_error, 1))

    def root_brier_score(self):
        """
        Square root of the Brier score

        Gruber S. and Buettner F., Better Uncertainty Calibration via Proper Scores
        for Classification and Beyond, In Proceedings of the 36th International
        Conference on Neural Information Processing Systems, 2022

        :return: root brier score (RBS)
        """
        return np.sqrt(self.brier_score())

    def logarithmic_score(self):
        """
        Calculation of the logarithmic score https://en.wikipedia.org/wiki/Scoring_rule

        Mean over samples of the log-probability assigned to the reference
        class. A small constant (1e-10) is added before taking the logarithm so
        that zero probabilities give a finite value.

        :return: logarithmic score (LS)
        """
        eps = 1e-10
        log_pred = np.log(self.pred + eps)
        sample_idx = np.arange(log_pred.shape[0])
        return np.mean(log_pred[sample_idx, self.ref])

    def distance_ij(self, i, j):
        """
        Euclidean distance between the prediction vectors of samples i and j

        :param i: index of the first sample
        :param j: index of the second sample
        :return: distance
        """
        difference = self.pred[i, :] - self.pred[j, :]
        return np.sqrt(np.sum(np.square(difference)))

    def _kce_bandwidth(self):
        """
        Return the KCE kernel bandwidth.

        Taken from ``dict_args["bandwidth_kce"]`` when present, otherwise from
        ``median_heuristic`` applied to the predictions.
        """
        if "bandwidth_kce" in self.dict_args:
            return self.dict_args["bandwidth_kce"]
        return median_heuristic(self.pred)

    def kernel_calculation(self, i, j, bandwidth=None):
        r"""
        Matrix-valued kernel between the predictions of samples i and j

        .. math::

            k(x_i, x_j) = \exp(-\Vert x_i - x_j \Vert / \nu) I_{K}

        where :math:`\nu` is the bandwidth and :math:`I_K` the identity matrix
        of size the number of classes.

        :param i: index of the first sample
        :param j: index of the second sample
        :param bandwidth: kernel bandwidth; when ``None`` it is looked up with
            the ``bandwidth_kce`` option or the median heuristic
        :return: array of shape (K, K)
        """
        if bandwidth is None:
            bandwidth = self._kce_bandwidth()
        value = np.exp(-self.distance_ij(i, j) / bandwidth)
        # The kernel is a scaled identity. An all-ones matrix would always give
        # a zero KCE because each (one-hot - probability) row sums to zero.
        identity = np.eye(self.pred.shape[1])
        return value * identity

    def kernel_calibration_error(self):
        """
        Kernel calibration error (unbiased estimator over pairs of samples)

        Based on the paper Widmann, D., Lindsten, F., and Zachariah, D.
        Calibration tests in multi-class classification: A unifying framework.
        Advances in Neural Information Processing Systems, 32:12257–12267, 2019.

        :return: kce
        :raises ValueError: if fewer than two samples are available
        """
        numb_samples = self.pred.shape[0]
        if numb_samples < 2:
            raise ValueError("kernel_calibration_error requires at least two samples")
        # The bandwidth does not depend on the pair, so it is resolved once
        # rather than inside the double loop.
        bandwidth = self._kce_bandwidth()
        residual = self.one_hot_ref - self.pred
        sum_tot = 0
        for i in range(numb_samples):
            for j in range(i):
                kernel = self.kernel_calculation(i, j, bandwidth)
                sum_tot += np.matmul(residual[i, :], np.matmul(kernel, residual[j, :]))
        # Number of unordered pairs, n! / (2 (n-2)!) = n (n-1) / 2, computed
        # directly because factorials overflow for large n.
        n_pairs = numb_samples * (numb_samples - 1) / 2
        return sum_tot / n_pairs

    def top_label_classification_error(self):
        """
        Calculation of the top-label classification error.

        Predictions are read as a matrix samples x classes. For each sample the
        highest predicted probability is compared with the frequency, in the
        reference, of the class that received it; the result is the root mean
        square of these differences.

        :return: tce
        """
        class_max = np.argmax(self.pred, 1)
        prob_pred_max = np.max(self.pred, 1)
        nclasses = self.pred.shape[1]
        numb_samples = self.pred.shape[0]
        # bincount yields a zero for classes absent from the reference, which a
        # lookup through np.unique cannot do without special-casing.
        labels = np.asarray(self.ref).astype(int).ravel()
        counts = np.bincount(labels, minlength=nclasses)[:nclasses]
        prob = counts / numb_samples
        prob_expected_max = prob[class_max]
        return np.sqrt(np.mean(np.square(prob_expected_max - prob_pred_max)))

    def kernel_based_ece(self):
        """
        Calculates kernel based ECE

        For each sample j, the one-hot references of all other samples are
        averaged with Dirichlet kernel weights and compared with the prediction
        of j; the Euclidean norms of these differences are averaged.

        Teodora Popordanoska, Raphael Sayer, and Matthew B Blaschko. 2022. A
        Consistent and Differentiable Lp Canonical Calibration Error Estimator.
        In Advances in Neural Information Processing Systems.

        :return: ece_kde
        """
        nclasses = self.pred.shape[1]
        numb_samples = self.pred.shape[0]
        deviations = []
        for j in range(numb_samples):
            weights = []
            weighted_ref = np.zeros([nclasses])
            for i in range(numb_samples):
                if i == j:
                    # Leave-one-out: a sample does not weight itself.
                    continue
                weight = self.dirichlet_kernel(j, i)
                weights.append(weight)
                weighted_ref += self.one_hot_ref[i, :] * weight
            norm = np.sum(np.asarray(weights))
            deviations.append(weighted_ref / norm - self.pred[j, :])
        full_array = np.vstack(deviations)
        return np.mean(np.sqrt(np.sum(np.square(full_array), 1)))

    def _kde_bandwidth(self):
        """Return the Dirichlet kernel bandwidth (``dict_args["bandwidth"]``, default 0.5)."""
        return self.dict_args.get("bandwidth", 0.5)

    def gamma_ik(self, i, k):
        """
        Gamma function of the Dirichlet concentration of sample i for class k

        The concentration is ``pred[i, k] / h + 1`` with h the bandwidth.

        :param i: sample index
        :param k: class index
        :return: gamma value
        """
        alpha_ik = self.pred[i, k] / self._kde_bandwidth() + 1
        return gamma(alpha_ik)

    def dirichlet_kernel(self, j, i):
        """
        Dirichlet kernel evaluated at the prediction of j, centred on sample i

        The concentration parameters are ``pred[i, :] / h + 1`` with h the
        bandwidth; the value is the corresponding Dirichlet density at
        ``pred[j, :]``.

        :param j: index of the sample at which the kernel is evaluated
        :param i: index of the sample defining the concentration parameters
        :return: kernel value
        """
        pred_i = self.pred[i, :]
        pred_j = self.pred[j, :]
        alpha_i = pred_i / self._kde_bandwidth() + 1
        normaliser = gamma(np.sum(alpha_i)) / np.prod(gamma(alpha_i))
        return normaliser * np.prod(np.power(pred_j, alpha_i - 1))

    def negative_log_likelihood(self):
        r"""
        Derives the negative log-likelihood defined as

        .. math::

            -\sum_{i=1}^{N} log(p_{i,k} | y_i=k)

        George Cybenko, Dianne P O’Leary, and Jorma Rissanen. 1998. The
        Mathematics of Information Coding, Extraction and Distribution.
        Vol. 107. Springer Science & Business Media.

        No offset is added before the logarithm, so a zero probability for the
        reference class gives an infinite value.

        :return: nll
        """
        log_pred = np.log(self.pred)
        sample_idx = np.arange(self.pred.shape[0])
        return -1 * np.sum(log_pred[sample_idx, self.ref])

    def to_dict_meas(self, fmt="{:.4f}"):
        """
        Given the selected metrics provides a dictionary with relevant metrics

        :param fmt: format string kept for interface compatibility; results are
            returned as raw numbers and are not formatted
        :return: dictionary mapping each key of ``self.measures`` to its value
        """
        return {key: self.measures_dict[key][0]() for key in self.measures}