Spaces:

jordyvl
/

ece

Configuration error

App Files Files Community

jordyvl committed on Aug 11, 2022

Commit

0a06e4b

1 Parent(s): 0bca13b

fix for equal mass binning

Browse files

Files changed (3) hide show

ece.py +67 -23
resnet110_c10_logits.p +0 -0
test_resnet-cifar_logits.py +164 -0

ece.py CHANGED Viewed

@@ -41,23 +41,13 @@ More concretely, we provide a binned empirical estimator of top-1 calibration er
 _KWARGS_DESCRIPTION = """
 Calculates how good are predictions given some references, using certain scores
 Args:
-    predictions: list of predictions to score. Each predictions
-        should be a string with tokens separated by spaces.
-    references: list of reference for each prediction. Each
-        reference should be a string with tokens separated by spaces.
-    y_true : array-like
-        Ground truth labels.
-    p_hat : array-like
-        Array of confidence estimates.
     n_bins : int, default=15
         Number of bins of :math:`[\\frac{1}{n_{\\text{classes}}},1]` for the confidence estimates.
-    n_classes : int default=None
-        Number of classes. Estimated from `y` and `y_pred` if not given.
     p : int, default=1
         Power of the calibration error, :math:`1 \\leq p \\leq \\infty`.
 Returns
     Expected calibration error (ECE), float.
@@ -97,24 +87,37 @@ def create_bins(n_bins=10, scheme="equal-range", bin_range=None, P=None):
         # split sorted probabilities into groups of approx equal size
         groups = np.array_split(np.sort(P), n_bins)
-        bin_upper_edges = list()
         # rightmost entry per equal size group
-        for cur_group in range(n_bins - 1):
-            bin_upper_edges += [max(groups[cur_group])]
-        bin_upper_edges += [1.01]  # [np.inf]  # always +1 for right edges
         bins = np.array(bin_upper_edges)
-        # OverflowError: cannot convert float infinity to integer
     return bins
 def discretize_into_bins(P, bins):
-    oneDbins = np.digitize(P, bins) - 1  # since bins contains extra righmost & leftmost bins
     # Fix to scipy.binned_dd_statistic:
     # Tie-breaking to the left for rightmost bin
     # Using `digitize`, values that fall on an edge are put in the right bin.
     # For the rightmost bin, we want values equal to the right
     # edge to be counted in the last bin, and not as an outlier.
@@ -130,6 +133,7 @@ def discretize_into_bins(P, bins):
         on_edge = np.where(
             (P[:, k] >= bins[-1]) & (np.around(P[:, k], decimal) == np.around(bins[-1], decimal))
         )[0]
         # Shift these points one bin to the left.
         oneDbins[on_edge, k] -= 1
@@ -138,16 +142,19 @@ def discretize_into_bins(P, bins):
 def manual_binned_statistic(P, y_correct, bins, statistic="mean"):
     bin_assignments = discretize_into_bins(np.expand_dims(P, 0), bins)[0]
     result = np.empty([len(bins)], float)
     result.fill(np.nan)  # cannot assume each bin will have observations
     flatcount = np.bincount(bin_assignments, None)
     a = flatcount.nonzero()
     if statistic == "mean":
         flatsum = np.bincount(bin_assignments, y_correct)
         result[a] = flatsum[a] / flatcount[a]
-    return result, bins, bin_assignments + 1  # fix for what happens in discretize_into_bins
 def bin_calibrated_accuracy(bins, proxy="upper-edge"):
@@ -168,16 +175,19 @@ def CE_estimate(y_correct, P, bins=None, p=1, proxy="upper-edge", detail=False):
     Summary: weighted average over the accuracy/confidence difference of discrete bins of prediction probability
     """
-    n_bins = len(bins) - 1
     bin_range = [min(bins), max(bins)]
     # average bin probability #55 for bin 50-60, mean per bin; or right/upper bin edges
-    calibrated_acc = bin_calibrated_accuracy(bins, proxy="upper-edge")
     empirical_acc, bin_edges, bin_assignment = manual_binned_statistic(P, y_correct, bins)
     bin_numbers, weights_ece = np.unique(bin_assignment, return_counts=True)
     anindices = bin_numbers - 1  # reduce bin counts; left edge; indexes right by default
     # Expected calibration error
     if p < np.inf:  # L^p-CE
         CE = np.average(
@@ -292,7 +302,7 @@ class ECE(evaluate.EvaluationModule):
         }
-def test_ECE():
     N = 10  # N evaluation instances {(x_i,y_i)}_{i=1}^N
     K = 5  # K class problem
@@ -308,7 +318,7 @@ def test_ECE():
     references, predictions = list(zip(*[random_mc_instance() for i in range(N)]))
     references = np.array(references, dtype=np.int64)
     predictions = np.array(predictions, dtype=np.float32)
-    res = ECE()._compute(predictions, references)
     print(f"ECE: {res['ECE']}")
     res = ECE()._compute(predictions, references, detail=True)
@@ -324,6 +334,40 @@ def test_deterministic():
     print(f"ECE: {res['ECE']}\n {res}")
 if __name__ == "__main__":
     test_deterministic()
     test_ECE()

 _KWARGS_DESCRIPTION = """
 Calculates how good are predictions given some references, using certain scores
 Args:
+    predictions: 2D Array of confidence estimates.
+    references: 1D Array of Ground truth indices.
     n_bins : int, default=15
         Number of bins of :math:`[\\frac{1}{n_{\\text{classes}}},1]` for the confidence estimates.
     p : int, default=1
         Power of the calibration error, :math:`1 \\leq p \\leq \\infty`.
 Returns
     Expected calibration error (ECE), float.
         # split sorted probabilities into groups of approx equal size
         groups = np.array_split(np.sort(P), n_bins)
+        # is this really required?
+        bin_upper_edges = []
         # rightmost entry per equal size group
+        for cur_group in range(n_bins):
+            bin_upper_edges += [max(groups[cur_group])]  # if upper edges is what we compare against
+        bin_upper_edges += [1]  # always +1 for right edges
+        bin_upper_edges = sorted(list(set(bin_upper_edges)))  # important for numerical conditions!
+        # might change number of bins :O
         bins = np.array(bin_upper_edges)
     return bins
 def discretize_into_bins(P, bins):
+    contains_rightmost = bool(bins[-1] > 1) #outlier bins
+    contains_leftmost = bool(bins[0] == 0) #beyond [before] bin_range[0]
+    # bins_with_left_edge = np.insert(bins, 0, 0, axis=0)
+    oneDbins = np.digitize(
+        P, bins, right=contains_rightmost
+    )  # since bins contains extra righmost (& leftmost bins)
+    if contains_leftmost:
+        oneDbins -= 1
+    # oneDbins = np.digitize(P, bins) - 1  # since bins contains extra righmost (& leftmost bins)
     # Fix to scipy.binned_dd_statistic:
     # Tie-breaking to the left for rightmost bin
     # Using `digitize`, values that fall on an edge are put in the right bin.
     # For the rightmost bin, we want values equal to the right
     # edge to be counted in the last bin, and not as an outlier.
         on_edge = np.where(
             (P[:, k] >= bins[-1]) & (np.around(P[:, k], decimal) == np.around(bins[-1], decimal))
         )[0]
         # Shift these points one bin to the left.
         oneDbins[on_edge, k] -= 1
 def manual_binned_statistic(P, y_correct, bins, statistic="mean"):
     bin_assignments = discretize_into_bins(np.expand_dims(P, 0), bins)[0]
+    # indexed as in julia!
     result = np.empty([len(bins)], float)
     result.fill(np.nan)  # cannot assume each bin will have observations
     flatcount = np.bincount(bin_assignments, None)
+    # cannot have a negative index
     a = flatcount.nonzero()
     if statistic == "mean":
         flatsum = np.bincount(bin_assignments, y_correct)
         result[a] = flatsum[a] / flatcount[a]
+    return result, bins, bin_assignments + 1  # upper right edge as proxy
 def bin_calibrated_accuracy(bins, proxy="upper-edge"):
     Summary: weighted average over the accuracy/confidence difference of discrete bins of prediction probability
     """
+    n_bins = len(bins) - 1 #true number of bins
     bin_range = [min(bins), max(bins)]
     # average bin probability #55 for bin 50-60, mean per bin; or right/upper bin edges
+    calibrated_acc = bin_calibrated_accuracy(bins, proxy=proxy)
     empirical_acc, bin_edges, bin_assignment = manual_binned_statistic(P, y_correct, bins)
     bin_numbers, weights_ece = np.unique(bin_assignment, return_counts=True)
     anindices = bin_numbers - 1  # reduce bin counts; left edge; indexes right by default
+    import pdb; pdb.set_trace()  # breakpoint 83c9148b //
     # Expected calibration error
     if p < np.inf:  # L^p-CE
         CE = np.average(
         }
+def test_ECE(**kwargs):
     N = 10  # N evaluation instances {(x_i,y_i)}_{i=1}^N
     K = 5  # K class problem
     references, predictions = list(zip(*[random_mc_instance() for i in range(N)]))
     references = np.array(references, dtype=np.int64)
     predictions = np.array(predictions, dtype=np.float32)
+    res = ECE()._compute(predictions, references, **kwargs)
     print(f"ECE: {res['ECE']}")
     res = ECE()._compute(predictions, references, detail=True)
     print(f"ECE: {res['ECE']}\n {res}")
def test_equalmass_binning():
    """Smoke-test the equal-mass binning scheme on a tiny hand-written example.

    Builds equal-mass bin edges for nine hand-picked probabilities (with
    repeated values), prints them, and then runs ``test_ECE`` with the same
    binning settings. The function runs unattended: it no longer drops into
    the debugger at the end.
    """
    probs = np.array([0.63, 0.2, 0.2, 0, 0.95, 0.05, 0.72, 0.1, 0.2])
    kwargs = dict(
        n_bins=5,
        scheme="equal-mass",
        bin_range=None,
        proxy="upper-edge",
        # proxy="center",
        p=1,
        detail=True,
    )
    bins = create_bins(
        n_bins=kwargs["n_bins"], scheme=kwargs["scheme"], bin_range=kwargs["bin_range"], P=probs
    )
    # Show the edges so the result of the binning step is actually inspected.
    print(f"equal-mass bin edges: {bins}")

    test_ECE(**kwargs)

    # Not enabled yet (original note: "need to provide lens"):
    #   res = ECE()._compute(
    #       references=[0, 1, 2],
    #       predictions=[[0.63, 0.2, 0.2], [0, 0.95, 0.05], [0.72, 0.1, 0.2]],
    #       detail=True,
    #   )
    #   print(f"ECE: {res['ECE']}\n {res}")
 if __name__ == "__main__":
+    # Script entry point: run the equal-mass binning check first, then the other tests.
+    test_equalmass_binning()
     test_deterministic()
     test_ECE()

resnet110_c10_logits.p ADDED Viewed

Binary file (685 kB). View file

test_resnet-cifar_logits.py ADDED Viewed

	@@ -0,0 +1,164 @@

+"""
+This testing script loads actual probabilistic predictions from a ResNet fine-tuned on CIFAR.
+There are a number of logits-groundtruth pickles available @ https://github.com/markus93/NN_calibration/tree/master/logits
+[Seems to have moved from Git-LFS to sharepoint]
+https://tartuulikool-my.sharepoint.com/:f:/g/personal/markus93_ut_ee/EmW0xbhcic5Ou0lRbTrySOUBF2ccSsN7lo6lvSfuG1djew?e=l0TErb
+See https://github.com/markus93/NN_calibration/blob/master/logits/Readme.txt to decode the [model_dataset] filenames
+As a bonus, one could consider temperature scaling and measuring after calibration.
+"""
+import sys
+import numpy as np
+import scipy.stats as stats
+from scipy.special import softmax
+import pickle
+from sklearn.model_selection import train_test_split
+from matplotlib import pyplot as plt
+from ece import create_bins, discretize_into_bins, ECE
+# Open file with pickled variables
def unpickle_probs(file, verbose=0, normalize=True, val_size=5000):
    """Load validation/test scores and labels from a pickled logits file.

    The pickle must contain a 2-tuple ``(y1, y2)`` in one of two layouts:

    * ``y1`` is a tuple: ``y1 == (val_scores, val_labels)`` and
      ``y2 == (test_scores, test_labels)``.
    * otherwise ``y1`` holds all scores and ``y2`` all labels; they are split
      with ``train_test_split`` (fixed ``random_state=15``) so that
      ``val_size`` samples end up in the validation part.

    Args:
        file: path of the pickle file.
        verbose: if truthy, print the shapes of the four returned arrays.
        normalize: if true, apply a softmax over the last axis of the scores
            (i.e. the file is assumed to store logits -- confirm per file).
        val_size: number of validation samples kept when the data has to be
            split here; the default reproduces the previously hard-coded 5000.

    Returns:
        ``((scores_val, labels_val), (scores_test, labels_test))`` with the
        labels flattened to 1-D arrays.
    """
    # NOTE(security): pickle.load can execute arbitrary code -- only open
    # files from a trusted source.
    with open(file, "rb") as f:
        y1, y2 = pickle.load(f)

    if isinstance(y1, tuple):
        y_probs_val, y_val = y1
        y_probs_test, y_test = y2
    else:
        all_labels = np.asarray(y2)
        # Splits the data in the case of pretrained models.
        y_probs_val, y_probs_test, y_val, y_test = train_test_split(
            y1,
            all_labels.reshape(-1, 1),
            test_size=len(all_labels) - val_size,
            random_state=15,
        )

    # Labels may be stored as plain lists or column vectors; normalise to
    # arrays so the final ravel() works for every layout.
    y_val = np.asarray(y_val)
    y_test = np.asarray(y_test)

    if normalize:
        y_probs_val = softmax(y_probs_val, -1)
        y_probs_test = softmax(y_probs_test, -1)

    if verbose:
        print("y_probs_val:", np.shape(y_probs_val))  # validation scores
        print("y_true_val:", y_val.shape)  # validation labels
        print("y_probs_test:", np.shape(y_probs_test))  # test scores
        print("y_true_test:", y_test.shape)  # test labels

    return ((y_probs_val, y_val.ravel()), (y_probs_test, y_test.ravel()))
def unpickle_structured_probs(valpath=None, testpath=None):
    """Load structured validation/test scores and labels from two pickles.

    Each pickle is unpacked as a 4-tuple of which the first element is used
    as the scores and the third as the labels; the other two are ignored.
    The scores are passed through ``np.log`` because they were stored
    exponentiated.

    Args:
        valpath: path of the validation pickle. When ``None`` the author's
            local default location is used.
        testpath: path of the test pickle. When ``None`` the author's local
            default location is used.

    Returns:
        ``((X_val, y_val), (X_test, y_test))``. Previously the arguments were
        ignored and nothing was returned.
    """
    # Fall back to the original hard-coded locations only when no path is given.
    if valpath is None:
        valpath = "/home/jordy/code/gordon/arkham/arkham/StructuredCalibration/models/jordyvl/bert-base-cased_conll2003-sm-first-ner_validation_UTY.pickle"
    if testpath is None:
        testpath = "/home/jordy/code/gordon/arkham/arkham/StructuredCalibration/models/jordyvl/bert-base-cased_conll2003-sm-first-ner_test_UTY.pickle"

    # NOTE(security): pickle.load can execute arbitrary code -- only open
    # files from a trusted source.
    with open(valpath, "rb") as f:
        X_val, _, y_val, _ = pickle.load(f)
    with open(testpath, "rb") as f:
        X_test, _, y_test, _ = pickle.load(f)

    X_val = np.log(X_val)  # originally exponentiated [different purposes]
    X_test = np.log(X_test)  # originally exponentiated [different purposes]

    # structured logits
    return ((X_val, y_val), (X_test, y_test))
+"""
+ALTERNATE equal mass binning
+"""
+# Define data types.
+# NOTE(review): `NewType` is imported here but not used in the visible code.
+from typing import List, Tuple, NewType, TypeVar
+Data = List[Tuple[float, float]]  # List of (predicted_probability, true_label).
+Bins = List[float]  # List of bin boundaries, excluding 0.0, but including 1.0.
+BinnedData = List[Data]  # binned_data[i] contains the data in bin i.
+# Generic element type used in the annotations of `split`.
+T = TypeVar('T')
+# NOTE(review): `eps` is not referenced in the visible code -- confirm it is still needed.
+eps = 1e-6
def split(sequence: List[T], parts: int) -> List[List[T]]:
    """Partition *sequence* into exactly *parts* contiguous, near-equal chunks.

    Element order is preserved. Chunk sizes differ by at most one; the first
    ``len(sequence) % parts`` chunks receive the extra element. Unlike fixed
    ceil-sized slicing, this never yields fewer chunks than requested
    (e.g. 6 items into 4 parts gives sizes 2, 2, 1, 1 rather than 2, 2, 2).

    Args:
        sequence: sliceable sequence to partition.
        parts: number of chunks to produce.

    Returns:
        A list of ``parts`` non-empty slices of ``sequence``.

    Raises:
        ValueError: if ``parts`` is smaller than 1 or larger than
            ``len(sequence)``.
    """
    total = len(sequence)
    # Explicit checks instead of `assert`, which is stripped under `python -O`.
    if parts < 1:
        raise ValueError("at least one part is required")
    if parts > total:
        raise ValueError("more bins than probabilities")

    base_size, remainder = divmod(total, parts)
    chunks = []
    start = 0
    for index in range(parts):
        stop = start + base_size + (1 if index < remainder else 0)
        chunks.append(sequence[start:stop])
        start = stop
    return chunks
def get_equal_bins(probs: List[float], n_bins: int=10) -> Bins:
    """Compute bin edges so that each bin holds roughly the same number of points.

    The sorted probabilities are cut into chunks with ``split``; each interior
    edge is the midpoint between the last value of one chunk and the first
    value of the next. ``1.0`` is always added as the final edge. Duplicate
    edges are collapsed, so fewer edges than requested may be returned, and
    the result is sorted ascending.
    """
    chunks = split(sorted(probs), n_bins)
    edges = [
        (lower[-1] + upper[0]) / 2.0
        for lower, upper in zip(chunks, chunks[1:])
    ]
    edges.append(1.0)
    # Deduplicate: repeated probabilities can produce identical midpoints.
    return sorted(set(edges))
def histedges_equalN(x, nbin):
    """Return ``nbin + 1`` histogram edges by interpolating over the sorted values of *x*.

    Evenly spaced positions between 0 and ``len(x)`` are mapped onto the
    sorted samples with linear interpolation.

    Usage notes carried over from the original source:
        bin_upper_edges = histedges_equalN(P, n_bins)
        n, bins, patches = plt.hist(x, histedges_equalN(x, 10))
    """
    n_points = len(x)
    sample_ranks = np.arange(n_points)
    edge_positions = np.linspace(0, n_points, nbin + 1)
    return np.interp(edge_positions, sample_ranks, np.sort(x))
def test_equalmass_binning(P, Y):
    """Compare equal-mass and equal-range binning on real predictions.

    Builds both sets of bin edges from the top-1 confidences, digitizes the
    confidences against them, plots the equal-mass assignment, and prints the
    ECE obtained with each scheme.

    Args:
        P: array of predicted probabilities. A 2-D array is reduced to its
            per-row maximum; any other rank is used as-is as the confidence
            vector (previously this left ``p_max`` unbound and crashed).
        Y: ground-truth labels, passed through to ``ECE()._compute``.
    """
    # probs = np.array([0.63, 0.2, 0.2, 0, 0.95, 0.05, 0.72, 0.1, 0.2])
    kwargs = dict(
        n_bins=10,
        scheme="equal-mass",
        bin_range=None,
        proxy="upper-edge",
        # proxy="center",
        p=1,
        detail=True,
    )

    # Top-1 confidence per sample; always bound, whatever the rank of P.
    p_max = np.max(P, -1) if P.ndim == 2 else P

    eqr_bins = create_bins(
        n_bins=kwargs["n_bins"], scheme="equal-range", bin_range=kwargs["bin_range"], P=p_max
    )
    eqm_bins = create_bins(
        n_bins=kwargs["n_bins"], scheme=kwargs["scheme"], bin_range=kwargs["bin_range"], P=p_max
    )
    # alternate_eqm_bins = get_equal_bins(p_max, kwargs["n_bins"])

    # Per-sample bin indices under each scheme / tie-breaking rule.
    eqr_hist = np.digitize(p_max, eqr_bins, right=True)
    eqm_hist = np.digitize(p_max, eqm_bins, right=True)
    eqml_hist = np.digitize(p_max, eqm_bins, right=False)
    # eqm_bins = [0] + eqm_bins
    other_hist = discretize_into_bins(np.expand_dims(p_max, 0), eqm_bins)

    # NOTE(review): this compares per-sample bin *indices*, not bin counts,
    # and the result is unused -- confirm whether a comparison of
    # np.bincount(...) histograms was intended.
    hist_difference = stats.power_divergence(eqr_hist, eqm_hist, lambda_="pearson")  # chisquare

    # plt.hist(eqr_hist, color="green", label="equal-range")
    plt.hist(eqm_hist, color="blue", label="equal-mass")
    plt.legend()
    # plt.show()

    res = ECE()._compute(P, Y, **kwargs)
    print(f"eqm ECE: {res['ECE']}")

    kwargs["scheme"] = "equal-range"
    res = ECE()._compute(P, Y, **kwargs)
    print(f"eqr ECE: {res['ECE']}")

    # res = ECE()._compute(predictions, references, detail=True)
    # print(f"ECE: {res['ECE']}")
+if __name__ == "__main__":
+    # Pickle to evaluate: first command-line argument, else the ResNet-110 CIFAR-10 logits file.
+    FILE_PATH = sys.argv[1] if len(sys.argv) > 1 else "resnet110_c10_logits.p"
+    # Positional flags map to verbose=False, normalize=True in unpickle_probs.
+    (p_val, y_val), (p_test, y_test) = unpickle_probs(FILE_PATH, False, True)
+    # Only the validation split is evaluated; the test split is loaded but unused here.
+    test_equalmass_binning(p_val, y_val)
+    # do on val