@@ -1,5 +1,15 @@
 {
  "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%load_ext autoreload\n",
+    "%autoreload 2"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
@@ -50,7 +60,7 @@
    "outputs": [],
    "source": [
     "n_samples = 2000\n",
-    "n_classes = 2"
+    "n_classes = 3"
    ]
   },
   {
@@ -68,7 +78,7 @@
     "    random_state=42,\n",
     ")\n",
     "X_train, X_test, y_train, y_test = train_test_split(\n",
-    "    X, y, test_size=0.2, random_state=42\n",
+    "    X, y, test_size=0.5, random_state=42\n",
     ")"
    ]
   },
@@ -85,7 +95,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "model = MLPClassifier(hidden_layer_sizes=(50, 50, 50))\n",
+    "model = MLPClassifier(hidden_layer_sizes=(20, 20, 10))\n",
     "model.fit(X_train, y_train)"
    ]
   },
@@ -181,7 +191,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "ece = ECE(bins=10)"
+    "ece = ECE(bins=12)"
    ]
   },
   {
@@ -191,9 +201,9 @@
    "outputs": [],
    "source": [
     "# Evaluate uncalibrated predictions\n",
-    "uncalibrated_confidences = model.predict_proba(X_test)\n",
+    "y_pred = model.predict_proba(X_test)\n",
     "\n",
-    "pre_calibration_ece = ece.compute(uncalibrated_confidences, y_test)\n",
+    "pre_calibration_ece = ece.compute(y_pred, y_test)\n",
     "\n",
     "f\"ECE before calibration: {pre_calibration_ece}\""
    ]
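Aside: the ECE class used above is not shown in this diff, but the quantity it estimates is the standard binned expected calibration error. A minimal sketch of that computation, assuming the usual uniform-binning definition (illustrative only, not this library's implementation):

import numpy as np

def expected_calibration_error(confidences, y_true, n_bins=12):
    # Confidence and correctness of the top predicted class per sample.
    top_conf = confidences.max(axis=1)
    correct = (confidences.argmax(axis=1) == y_true).astype(float)
    edges = np.linspace(0.0, 1.0, n_bins + 1)
    ece = 0.0
    for lo, hi in zip(edges[:-1], edges[1:]):
        in_bin = (top_conf > lo) & (top_conf <= hi)
        if in_bin.any():
            # Weight each bin's |accuracy - mean confidence| gap by its share of samples.
            ece += in_bin.mean() * abs(correct[in_bin].mean() - top_conf[in_bin].mean())
    return ece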
@@ -212,10 +222,48 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "eval_stats = EvalStats(y_test, uncalibrated_confidences)\n",
-    "class_labels = [i for i in range(n_classes)]\n",
-    "\n",
-    "eval_stats.plot_reliability_curves(class_labels)"
+    "eval_stats = EvalStats(y_test, y_pred)\n",
+    "class_labels = range(n_classes)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "fig = eval_stats.plot_reliability_curves(\n",
+    "    [\"top_class\", 0], display_weights=True, strategy=\"uniform\", n_bins=8\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "The density of predictions is distributed highly inhomogeneously on the unit interval: some bins have\n",
+    "few members, and the reliability estimate there has high variance. This can be mitigated by employing\n",
+    "the \"quantile\" binning strategy, also called adaptive binning."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "fig = eval_stats.plot_reliability_curves(\n",
+    "    [0, \"top_class\"], display_weights=True, n_bins=8, strategy=\"quantile\"\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Now all bins have the same weight but different widths. The pointwise reliability estimates\n",
+    "have lower variance, but there are wide gaps between them, requiring more interpolation.\n",
+    "Both binning strategies have their advantages and disadvantages."
    ]
   },
   {
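The tradeoff described in the two markdown cells added above can be made concrete by comparing the bin edges the two strategies produce. A small illustrative sketch, using a skewed synthetic distribution as a stand-in for the notebook's confidences (the numbers here are assumptions, not outputs of this notebook):

import numpy as np

rng = np.random.default_rng(42)
conf = rng.beta(8, 2, size=2000)  # mass concentrated near 1.0, like typical confidences

n_bins = 8
uniform_edges = np.linspace(0.0, 1.0, n_bins + 1)                      # equal width
quantile_edges = np.quantile(conf, np.linspace(0.0, 1.0, n_bins + 1))  # equal mass

print(np.histogram(conf, bins=uniform_edges)[0])   # very uneven counts per bin
print(np.histogram(conf, bins=quantile_edges)[0])  # ~250 samples in every bin

Equal-width bins keep their positions fixed but can be nearly empty; equal-mass bins stabilize the per-bin estimate at the cost of very wide bins in sparse regions.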
@@ -455,38 +503,11 @@
    "source": [
     "ece.compute(confidences, ground_truth)"
    ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "Once again, to verify that miscalibration will indeed increase with more samples, let's sample *5x* as many samples as\n",
-    "before and measure $ECE$ again:"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "uncalibrated_samples = shifted_sampler.get_sample_arrays(1000)\n",
-    "ground_truth, confidences = uncalibrated_samples\n",
-    "\n",
-    "ece.compute(confidences, ground_truth)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "Great! Calibration error goes up as we sample more instances."
-   ]
   }
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "Python 3",
+   "display_name": "Python 3 (ipykernel)",
    "language": "python",
    "name": "python3"
   },
@@ -500,9 +521,9 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.8.5"
+   "version": "3.8.13"
   }
  },
  "nbformat": 4,
  "nbformat_minor": 1
-}
+}