add test model to build posterior prob from samples (gwastro#3616)

* initial commit * fixes + logging * fixes * cc * add validation script for test posterior * bounds have to be done manually * plot mod * Update analytic.py
ahnitz · Feb 15, 2021 · 8f7e949 · 8f7e949
1 parent 1be55fe
commit 8f7e949
Show file tree

Hide file tree

Showing 4 changed files with 118 additions and 1 deletion.
diff --git a/bin/inference/pycbc_validate_test_posterior b/bin/inference/pycbc_validate_test_posterior
@@ -0,0 +1,85 @@
+#!/usr/bin/env python
+""" Validate and generate diagnostic plots for an inference file using the
+test posterior model.
+
+Reference samples are drawn from the model's kde, cut to the prior bounds,
+and compared to the sampler's output parameter by parameter with a
+two-sample KS test. The exit status is non-zero if any p-value falls below
+``--p-value-threshold``.
+"""
+import sys
+import numpy
+import argparse
+from matplotlib import use; use('Agg')
+import pylab
+from pycbc.inference.option_utils import prior_from_config
+from pycbc.inference import models, io
+from scipy.stats import gaussian_kde, ks_2samp
+from pycbc.io import FieldArray
+# fixed seed so the reference draw and the subsampling are reproducible
+numpy.random.seed(0)
+
+parser = argparse.ArgumentParser()
+parser.add_argument('--input-file', help='inference posterior file')
+parser.add_argument('--output-file', help='diagnostic plot')
+parser.add_argument('--p-value-threshold', help='minimum ks test p-value',
+                    type=float)
+parser.add_argument('--ind-samples', help='use only this number of samples',
+                    default=1000, type=int)
+args = parser.parse_args()
+
+# number of reference samples to draw from the kde
+size = int(1e6)
+d1 = io.loadfile(args.input_file, 'r')
+
+#We directly recreate the model and prior from the stored
+#config to ensure the same configuration
+config = d1.read_config_file()
+
+prior = prior_from_config(config)
+model = models.read_from_config(config)
+
+# Draw reference samples directly from the kde
+draw = model.kde.resample(size=size)
+data = {v: draw[i, :] for i, v in enumerate(model.variable_params)}
+ref = FieldArray.from_kwargs(**data)
+
+# apply the prior bounds to ensure kde leakage is not a concern
+# NOTE(review): only the first parameter of each distribution is cut;
+# confirm multi-parameter distributions do not need the same treatment
+for dist in prior.distributions:
+    param = dist._params[0]
+    bound = dist._bounds[param]
+    ref = ref[(bound.min < ref[param]) & (ref[param] < bound.max)]
+
+nparam = len(model.variable_params)
+# squeeze=False always gives a 2D array of axes, so a model with a single
+# parameter can be looped over in the same way as a multi-parameter one
+fig, axs = pylab.subplots(1, nparam, figsize=[6*nparam, 4], dpi=100,
+                          squeeze=False)
+
+result = d1.read_samples(model.variable_params)
+failed = False
+for param, ax in zip(model.variable_params, axs[0]):
+    # --ind-samples is a cap: drawing without replacement raises an error if
+    # more samples are requested than the file provides
+    nind = min(args.ind_samples, len(result[param]))
+    rpart = numpy.random.choice(result[param], replace=False, size=nind)
+    kv, pvalue = ks_2samp(ref[param], rpart)
+    print("{}, p-value={:.3f}".format(param, pvalue))
+
+    pylab.sca(ax)
+    pylab.hist(ref[param], density=True, bins=30, label='reference')
+    pylab.hist(result[param], density=True, bins=30, alpha=0.5, label='sampler')
+    pylab.title('KS p-value = {:.4f}'.format(pvalue))
+    pylab.xlabel(param)
+    pylab.legend()
+    ax.get_yaxis().set_visible(False)
+
+    # only enforce the threshold when one was given on the command line
+    if (args.p_value_threshold is not None
+            and pvalue < args.p_value_threshold):
+        failed = True
+
+pylab.savefig(args.output_file)
+sys.exit(failed)
diff --git a/examples/inference/analytic-normal2d/posterior-normal2d.png b/examples/inference/analytic-normal2d/posterior-normal2d.png
diff --git a/pycbc/inference/models/__init__.py b/pycbc/inference/models/__init__.py
@@ -21,7 +21,7 @@
 
 
 from .analytic import (TestEggbox, TestNormal, TestRosenbrock, TestVolcano,
-                       TestPrior)
+                       TestPrior, TestPosterior)
 from .gaussian_noise import GaussianNoise
 from .marginalized_gaussian_noise import MarginalizedPhaseGaussianNoise
 from .marginalized_gaussian_noise import MarginalizedPolarization
@@ -182,6 +182,7 @@ def read_from_config(cp, **kwargs):
     TestNormal,
     TestRosenbrock,
     TestVolcano,
+    TestPosterior,
     TestPrior,
     GaussianNoise,
     MarginalizedPhaseGaussianNoise,

diff --git a/pycbc/inference/models/analytic.py b/pycbc/inference/models/analytic.py
@@ -18,7 +18,9 @@
 log likelihood.
 """
 
+import logging
 import numpy
+import numpy.random
 from scipy import stats
 
 from .base import BaseModel
@@ -212,3 +214,64 @@ def _loglikelihood(self):
         """Returns zero.
         """
         return 0.
+
+
+class TestPosterior(BaseModel):
+    r"""Build a test posterior from a set of samples using a kde
+
+    A gaussian kde is built from a random subset of the samples stored in an
+    existing inference file; the log likelihood is the log pdf of that kde.
+
+    Parameters
+    ----------
+    variable_params : (tuple of) string(s)
+        A tuple of parameter names that will be varied.
+    posterior_file : str
+        Path to a compatible pycbc inference output file which posterior
+        samples can be read from.
+    nsamples : int
+        Number of samples to draw from posterior file to build KDE. Cannot
+        be larger than the number of samples stored in the file.
+    **kwargs :
+        All other keyword arguments are passed to ``BaseModel``.
+
+    Raises
+    ------
+    ValueError
+        If ``nsamples`` exceeds the number of samples in ``posterior_file``.
+    """
+    name = "test_posterior"
+
+    def __init__(self, variable_params, posterior_file, nsamples, **kwargs):
+        super(TestPosterior, self).__init__(variable_params, **kwargs)
+
+        from pycbc.inference.io import loadfile  # avoid cyclic import
+        logging.info('loading test posterior model')
+        # open read-only, and close the file once the samples are in memory
+        with loadfile(posterior_file, 'r') as inf_file:
+            logging.info('reading samples')
+            samples = inf_file.read_samples(self.variable_params)
+            # one row per parameter, in the order _loglikelihood uses
+            samples = numpy.array([samples[v] for v in self.variable_params])
+
+        # choose only the requested amount of samples
+        nsamples = int(nsamples)
+        navailable = samples.shape[-1]
+        if nsamples > navailable:
+            raise ValueError(
+                'nsamples ({}) is larger than the {} samples available in '
+                '{}'.format(nsamples, navailable, posterior_file))
+        idx = numpy.random.choice(navailable, size=nsamples, replace=False)
+        samples = samples[:, idx]
+
+        logging.info('making kde with %s samples', samples.shape[-1])
+        self.kde = stats.gaussian_kde(samples)
+        logging.info('done initializing test posterior model')
+
+    def _loglikelihood(self):
+        """Returns the log pdf of the test posterior kde
+        """
+        point = numpy.array([self.current_params[param]
+                             for param in self.variable_params])
+        logpost = self.kde.logpdf(point)
+        return float(logpost[0])