Skip to content

Commit

Permalink
fix compare and make stacking default method (#1438)
Browse files Browse the repository at this point in the history
* fix compare and make stacking default method

* update changelog
  • Loading branch information
aloctavodia committed Nov 9, 2020
1 parent 9c21881 commit 44ceb08
Show file tree
Hide file tree
Showing 3 changed files with 20 additions and 16 deletions.
4 changes: 3 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
* Add observed argument to (un)plot observed data in `plot_ppc` ([1422](https://github.com/arviz-devs/arviz/pull/1422))
* Add support for named dims and coordinates with multivariate observations ([1429](https://github.com/arviz-devs/arviz/pull/1429))
* Add skipna argument to `plot_posterior` ([1432](https://github.com/arviz-devs/arviz/pull/1432))
* Make stacking the default method to compute weights in `compare` ([1438](https://github.com/arviz-devs/arviz/pull/1438))


### Maintenance and fixes
Expand All @@ -20,8 +21,9 @@
* Have `from_pystan` store attrs as strings to allow netCDF storage ([1417](https://github.com/arviz-devs/arviz/pull/1417))
* Remove ticks and spines in `plot_violin` ([1426](https://github.com/arviz-devs/arviz/pull/1426))
* Use circular KDE function and fix tick labels in circular `plot_trace` ([1428](https://github.com/arviz-devs/arviz/pull/1428))
* Fix `pair_plot` for mixed discrete and continuous variables ([1434](https://github.com/arviz-devs/arviz/pull/1434))
* Fix in-sample deviance in `plot_compare` ([1435](https://github.com/arviz-devs/arviz/pull/1435))
* Fix computation of weights in compare ([1438](https://github.com/arviz-devs/arviz/pull/1438))

### Deprecation

Expand Down
28 changes: 15 additions & 13 deletions arviz/stats/stats.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@


def compare(
dataset_dict, ic=None, method="BB-pseudo-BMA", b_samples=1000, alpha=1, seed=None, scale=None
dataset_dict, ic=None, method="stacking", b_samples=1000, alpha=1, seed=None, scale=None
):
r"""Compare models based on PSIS-LOO `loo` or WAIC `waic` cross-validation.
Expand All @@ -62,8 +62,8 @@ def compare(
method: str
Method used to estimate the weights for each model. Available options are:
- 'stacking' : stacking of predictive distributions.
- 'BB-pseudo-BMA' : (default) pseudo-Bayesian Model averaging using Akaike-type
- 'stacking' : (default) stacking of predictive distributions.
- 'BB-pseudo-BMA' : pseudo-Bayesian Model averaging using Akaike-type
weighting. The weights are stabilized using the Bayesian bootstrap.
- 'pseudo-BMA': pseudo-Bayesian Model averaging using Akaike-type
weighting, without Bootstrap stabilization (not recommended).
Expand Down Expand Up @@ -141,6 +141,10 @@ def compare(
waic : Compute the widely applicable information criterion.
"""
warnings.warn(
"The default method used to estimate the weights for each model,"
"has changed from BB-pseudo-BMA to stacking"
)
names = list(dataset_dict.keys())
scale = rcParams["stats.ic_scale"] if scale is None else scale.lower()
if scale == "log":
Expand Down Expand Up @@ -210,7 +214,7 @@ def compare(
if method.lower() == "stacking":
rows, cols, ic_i_val = _ic_matrix(ics, ic_i)
exp_ic_i = np.exp(ic_i_val / scale_value)
last_col = cols - 1
km1 = cols - 1

def w_fuller(weights):
return np.concatenate((weights, [max(1.0 - np.sum(weights), 0.0)]))
Expand All @@ -224,18 +228,16 @@ def log_score(weights):

def gradient(weights):
w_full = w_fuller(weights)
grad = np.zeros(last_col)
for k in range(last_col - 1):
grad = np.zeros(km1)
for k in range(km1):
for i in range(rows):
grad[k] += (exp_ic_i[i, k] - exp_ic_i[i, last_col]) / np.dot(
exp_ic_i[i], w_full
)
grad[k] += (exp_ic_i[i, k] - exp_ic_i[i, km1]) / np.dot(exp_ic_i[i], w_full)
return -grad

theta = np.full(last_col, 1.0 / cols)
bounds = [(0.0, 1.0) for _ in range(last_col)]
theta = np.full(km1, 1.0 / cols)
bounds = [(0.0, 1.0) for _ in range(km1)]
constraints = [
{"type": "ineq", "fun": lambda x: 1.0 - np.sum(x)},
{"type": "ineq", "fun": lambda x: -np.sum(x) + 1.0},
{"type": "ineq", "fun": np.sum},
]

Expand All @@ -255,7 +257,7 @@ def gradient(weights):
z_bs = np.zeros_like(weights)
for i in range(b_samples):
z_b = np.dot(b_weighting[i], ic_i_val)
u_weights = np.exp((z_b - np.min(z_b)) / scale_value)
u_weights = np.exp((z_b - np.max(z_b)) / scale_value)
z_bs[i] = z_b # pylint: disable=unsupported-assignment-operation
weights[i] = u_weights / np.sum(u_weights)

Expand Down
4 changes: 2 additions & 2 deletions arviz/tests/base_tests/test_stats.py
Original file line number Diff line number Diff line change
Expand Up @@ -163,7 +163,7 @@ def test_compare_unknown_ic_and_method(centered_eight, non_centered_eight):
def test_compare_different(centered_eight, non_centered_eight, ic, method, scale):
model_dict = {"centered": centered_eight, "non_centered": non_centered_eight}
weight = compare(model_dict, ic=ic, method=method, scale=scale)["weight"]
assert weight["non_centered"] >= weight["centered"]
assert weight["non_centered"] > weight["centered"]
assert_allclose(np.sum(weight), 1.0)


Expand All @@ -174,7 +174,7 @@ def test_compare_different_multidim(multidim_models, ic, method):
weight = compare(model_dict, ic=ic, method=method)["weight"]

# this should hold because the same seed is always used
assert weight["model_1"] >= weight["model_2"]
assert weight["model_1"] > weight["model_2"]
assert_allclose(np.sum(weight), 1.0)


Expand Down

0 comments on commit 44ceb08

Please sign in to comment.