arviz-devs · aloctavodia · Sep 23, 2020 · Sep 11, 2020 · Sep 11, 2020 · Sep 11, 2020
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -11,6 +11,7 @@
 * Extended methods from `xr.Dataset` to `InferenceData` ([1254](https://github.com/arviz-devs/arviz/pull/1254))
 * Add `extend` and `add_groups` to `InferenceData` ([1300](https://github.com/arviz-devs/arviz/pull/1300) and [1386](https://github.com/arviz-devs/arviz/pull/1386))
 * Added `__iter__` method (`.items`) for InferenceData ([1356](https://github.com/arviz-devs/arviz/pull/1356))
+* Add support for discrete variables in `plot_bpv` ([1379](https://github.com/arviz-devs/arviz/pull/1379))
 
 
 ### Maintenance and fixes

diff --git a/arviz/plots/backends/bokeh/bpvplot.py b/arviz/plots/backends/bokeh/bpvplot.py
@@ -3,6 +3,7 @@
 from bokeh.models import BoxAnnotation
 from bokeh.models.annotations import Title
 from scipy import stats
+from scipy.interpolate import CubicSpline
 
 from ....stats.density_utils import kde
 from ...kdeplot import plot_kde
@@ -29,6 +30,7 @@ def plot_bpv(
     bpv,
     plot_mean,
     reference,
+    mse,
     n_ref,
     hdi_prob,
     color,
@@ -85,13 +87,22 @@ def plot_bpv(
         pp_var_name, _, pp_vals = pp_plotters[i]
 
         obs_vals = obs_vals.flatten()
-        pp_vals = pp_vals.reshape(total_pp_samples, -1)
+        if pp_vals.ndim > 2:
+            pp_vals = pp_vals.reshape(total_pp_samples, -1)
+
+        if obs_vals.dtype.kind == "i" or pp_vals.dtype.kind == "i":
+            x = np.linspace(0, 1, len(obs_vals))
+            csi = CubicSpline(x, obs_vals)
+            obs_vals = csi(np.linspace(0.001, 0.999, len(obs_vals)))
+
+            x = np.linspace(0, 1, pp_vals.shape[1])
+            csi = CubicSpline(x, pp_vals, axis=1)
+            pp_vals = csi(np.linspace(0.001, 0.999, pp_vals.shape[1]))
 
         if kind == "p_value":
             tstat_pit = np.mean(pp_vals <= obs_vals, axis=-1)
             x_s, tstat_pit_dens = kde(tstat_pit)
             ax_i.line(x_s, tstat_pit_dens, line_width=linewidth, line_color=color)
-            # ax_i.set_yticks([])
             if reference is not None:
                 dist = stats.beta(obs_vals.size / 2, obs_vals.size / 2)
                 if reference == "analytical":
@@ -104,8 +115,8 @@ def plot_bpv(
                     x_ss, u_dens = sample_reference_distribution(
                         dist,
                         (
-                            n_ref,
                             tstat_pit_dens.size,
+                            n_ref,
                         ),
                     )
                     ax_i.multi_line(
@@ -115,12 +126,12 @@ def plot_bpv(
         elif kind == "u_value":
             tstat_pit = np.mean(pp_vals <= obs_vals, axis=0)
             x_s, tstat_pit_dens = kde(tstat_pit)
-            ax_i.line(x_s, tstat_pit_dens, line_color=color)
+            ax_i.line(x_s, tstat_pit_dens, color=color)
             if reference is not None:
                 if reference == "analytical":
                     n_obs = obs_vals.size
-                    hdi = stats.beta(n_obs / 2, n_obs / 2).ppf((1 - hdi_prob) / 2)
-                    hdi_odds = (hdi / (1 - hdi), (1 - hdi) / hdi)
+                    hdi_ = stats.beta(n_obs / 2, n_obs / 2).ppf((1 - hdi_prob) / 2)
+                    hdi_odds = (hdi_ / (1 - hdi_), (1 - hdi_) / hdi_)
                     ax_i.add_layout(
                         BoxAnnotation(
                             bottom=hdi_odds[1],
@@ -136,6 +147,9 @@ def plot_bpv(
                     x_ss, u_dens = sample_reference_distribution(dist, (tstat_pit_dens.size, n_ref))
                     for x_ss_i, u_dens_i in zip(x_ss.T, u_dens.T):
                         ax_i.line(x_ss_i, u_dens_i, line_width=linewidth, **plot_ref_kwargs)
+            if mse:
+                ax_i.line(0, 0, legend_label=f"mse={np.mean((1 - tstat_pit_dens)**2) * 100:.2f}")
+
             ax_i.line(0, 0)
         else:
             if t_stat in ["mean", "median", "std"]:

diff --git a/arviz/plots/backends/matplotlib/bpvplot.py b/arviz/plots/backends/matplotlib/bpvplot.py
@@ -2,6 +2,7 @@
 import matplotlib.pyplot as plt
 import numpy as np
 from scipy import stats
+from scipy.interpolate import CubicSpline
 
 from ....stats.density_utils import kde
 from ...kdeplot import plot_kde
@@ -27,6 +28,7 @@ def plot_bpv(
     bpv,
     plot_mean,
     reference,
+    mse,
     n_ref,
     hdi_prob,
     color,
@@ -60,6 +62,9 @@ def plot_bpv(
     if kind == "p_value" and reference == "analytical":
         plot_ref_kwargs.setdefault("color", "k")
         plot_ref_kwargs.setdefault("linestyle", "--")
+    elif kind == "u_value" and reference == "analytical":
+        plot_ref_kwargs.setdefault("color", "k")
+        plot_ref_kwargs.setdefault("alpha", 0.2)
     else:
         plot_ref_kwargs.setdefault("alpha", 0.1)
         plot_ref_kwargs.setdefault("color", color)
@@ -81,7 +86,17 @@ def plot_bpv(
         pp_var_name, _, pp_vals = pp_plotters[i]
 
         obs_vals = obs_vals.flatten()
-        pp_vals = pp_vals.reshape(total_pp_samples, -1)
+        if pp_vals.ndim > 2:
+            pp_vals = pp_vals.reshape(total_pp_samples, -1)
+
+        if obs_vals.dtype.kind == "i" or pp_vals.dtype.kind == "i":
+            x = np.linspace(0, 1, len(obs_vals))
+            csi = CubicSpline(x, obs_vals)
+            obs_vals = csi(np.linspace(0.001, 0.999, len(obs_vals)))
+
+            x = np.linspace(0, 1, pp_vals.shape[1])
+            csi = CubicSpline(x, pp_vals, axis=1)
+            pp_vals = csi(np.linspace(0.001, 0.999, pp_vals.shape[1]))
 
         if kind == "p_value":
             tstat_pit = np.mean(pp_vals <= obs_vals, axis=-1)
@@ -95,32 +110,36 @@ def plot_bpv(
                     upb = 1 - lwb
                     x = np.linspace(lwb, upb, 500)
                     dens_ref = dist.pdf(x)
-                    ax_i.plot(x, dens_ref, **plot_ref_kwargs)
+                    ax_i.plot(x, dens_ref, zorder=1, **plot_ref_kwargs)
                 elif reference == "samples":
                     x_ss, u_dens = sample_reference_distribution(
                         dist,
                         (
-                            n_ref,
                             tstat_pit_dens.size,
+                            n_ref,
                         ),
                     )
                     ax_i.plot(x_ss, u_dens, linewidth=linewidth, **plot_ref_kwargs)
 
         elif kind == "u_value":
             tstat_pit = np.mean(pp_vals <= obs_vals, axis=0)
             x_s, tstat_pit_dens = kde(tstat_pit)
+            ax_i.plot(x_s, tstat_pit_dens, color=color)
             if reference is not None:
                 if reference == "analytical":
                     n_obs = obs_vals.size
-                    hdi = stats.beta(n_obs / 2, n_obs / 2).ppf((1 - hdi_prob) / 2)
-                    hdi_odds = (hdi / (1 - hdi), (1 - hdi) / hdi)
+                    hdi_ = stats.beta(n_obs / 2, n_obs / 2).ppf((1 - hdi_prob) / 2)
+                    hdi_odds = (hdi_ / (1 - hdi_), (1 - hdi_) / hdi_)
                     ax_i.axhspan(*hdi_odds, **plot_ref_kwargs)
-                    ax_i.axhline(1, color="w")
+                    ax_i.axhline(1, color="w", zorder=1)
                 elif reference == "samples":
                     dist = stats.uniform(0, 1)
                     x_ss, u_dens = sample_reference_distribution(dist, (tstat_pit_dens.size, n_ref))
                     ax_i.plot(x_ss, u_dens, linewidth=linewidth, **plot_ref_kwargs)
-            ax_i.plot(x_s, tstat_pit_dens, color=color)
+            if mse:
+                ax_i.plot(0, 0, label=f"mse={np.mean((1 - tstat_pit_dens)**2) * 100:.2f}")
+                ax_i.legend()
+
             ax_i.set_ylim(0, None)
             ax_i.set_xlim(0, 1)
         else:
@@ -147,7 +166,7 @@ def plot_bpv(
             ax_i.set_yticks([])
             if bpv:
                 p_value = np.mean(pp_vals <= obs_vals)
-                ax_i.plot(0, 0, label=f"bpv={p_value:.2f}", alpha=0)
+                ax_i.plot(obs_vals, 0, label=f"bpv={p_value:.2f}", alpha=0)
                 ax_i.legend()
 
             if plot_mean:

diff --git a/arviz/plots/bpvplot.py b/arviz/plots/bpvplot.py
@@ -12,7 +12,8 @@ def plot_bpv(
     t_stat="median",
     bpv=True,
     plot_mean=True,
-    reference="samples",
+    reference="analytical",
+    mse=False,
     n_ref=100,
     hdi_prob=0.94,
     color="C0",
@@ -34,6 +35,7 @@ def plot_bpv(
     """
     Plot Bayesian p-value for observed data and Posterior/Prior predictive.
 
+
     Parameters
     ----------
     data : az.InferenceData object
@@ -62,6 +64,9 @@ def plot_bpv(
         How to compute the distributions used as reference for u_values or p_values. Allowed values
         are "analytical" (default) and "samples". Use `None` to do not plot any reference.
-        Defaults to "samples".
+        Defaults to "analytical".
+    mse : bool, optional
+        Show scaled mean squared error between uniform distribution and marginal p_value
+        distribution. Defaults to False.
     n_ref : int, optional
         Number of reference distributions to sample when `reference=samples`. Defaults to 100.
     hdi_prob: float, optional
@@ -245,6 +250,7 @@ def plot_bpv(
         bpv=bpv,
         t_stat=t_stat,
         reference=reference,
+        mse=mse,
         n_ref=n_ref,
         hdi_prob=hdi_prob,
         plot_mean=plot_mean,

diff --git a/arviz/tests/base_tests/test_plots_bokeh.py b/arviz/tests/base_tests/test_plots_bokeh.py
@@ -1034,3 +1034,15 @@ def test_plot_dist_comparison_warn(models):
 def test_plot_bpv(models, kwargs):
     axes = plot_bpv(models.model_1, backend="bokeh", show=False, **kwargs)
     assert axes.shape
+
+
+def test_plot_bpv_discrete():
+    fake_obs = {"a": np.random.poisson(2.5, 100)}
+    fake_pp = {"a": np.random.poisson(2.5, (10, 100))}
+    fake_model = from_dict(posterior_predictive=fake_pp, observed_data=fake_obs)
+    axes = plot_bpv(
+        fake_model,
+        backend="bokeh",
+        show=False,
+    )
+    assert axes.shape
diff --git a/arviz/tests/base_tests/test_plots_matplotlib.py b/arviz/tests/base_tests/test_plots_matplotlib.py
@@ -1403,3 +1403,11 @@ def test_plot_dist_comparison_different_vars():
 def test_plot_bpv(models, kwargs):
     axes = plot_bpv(models.model_1, **kwargs)
     assert not isinstance(axes, np.ndarray)
+
+
+def test_plot_bpv_discrete():
+    fake_obs = {"a": np.random.poisson(2.5, 100)}
+    fake_pp = {"a": np.random.poisson(2.5, (10, 100))}
+    fake_model = from_dict(posterior_predictive=fake_pp, observed_data=fake_obs)
+    axes = plot_bpv(fake_model)
+    assert not isinstance(axes, np.ndarray)