Add jitter+full_adapt initialization (#3893)

* Add jitter+full_adapt initialization * Add tests and benchmarks * Actually save file
pymc-devs · Apr 28, 2020 · a4aed97 · a4aed97
1 parent ae54ba2
commit a4aed97
Show file tree

Hide file tree

Showing 4 changed files with 23 additions and 8 deletions.
diff --git a/RELEASE-NOTES.md b/RELEASE-NOTES.md
@@ -11,7 +11,7 @@
 - GP covariance functions can now be exponentiated by a scalar. See PR [#3852](https://github.com/pymc-devs/pymc3/pull/3852)
 - `sample_posterior_predictive` can now feed on `xarray.Dataset` - e.g. from `InferenceData.posterior`. (see [#3846](https://github.com/pymc-devs/pymc3/pull/3846))
 - `SamplerReport` (`MultiTrace.report`) now has properties `n_tune`, `n_draws`, `t_sampling` for increased convenience (see [#3827](https://github.com/pymc-devs/pymc3/pull/3827))
-- `pm.sample` now has support for adapting dense mass matrix using `QuadPotentialFullAdapt` (see [#3596](https://github.com/pymc-devs/pymc3/pull/3596), [#3705](https://github.com/pymc-devs/pymc3/pull/3705) and [#3858](https://github.com/pymc-devs/pymc3/pull/3858))
+- `pm.sample` now has support for adapting dense mass matrix using `QuadPotentialFullAdapt` (see [#3596](https://github.com/pymc-devs/pymc3/pull/3596), [#3705](https://github.com/pymc-devs/pymc3/pull/3705), [#3858](https://github.com/pymc-devs/pymc3/pull/3858), and [#3893](https://github.com/pymc-devs/pymc3/pull/3893)). Use `init="adapt_full"` or `init="jitter+adapt_full"` to use.
 - `Moyal` distribution added (see [#3870](https://github.com/pymc-devs/pymc3/pull/3870)).
 - `pm.LKJCholeskyCov` now automatically computes and returns the unpacked Cholesky decomposition, the correlations and the standard deviations of the covariance matrix (see [#3881](https://github.com/pymc-devs/pymc3/pull/3881)).
 

diff --git a/benchmarks/benchmarks/benchmarks.py b/benchmarks/benchmarks/benchmarks.py
@@ -146,7 +146,7 @@ class NUTSInitSuite:
     """Tests initializations for NUTS sampler on models
     """
     timeout = 360.0
-    params = ('adapt_diag', 'jitter+adapt_diag', 'advi+adapt_diag_grad')
+    params = ('adapt_diag', 'jitter+adapt_diag', 'jitter+adapt_full', 'adapt_full')
     number = 1
     repeat = 1
     draws = 10000
@@ -245,7 +245,7 @@ def freefall(y, t, p):
             46.48,
             48.18
         ]).reshape(-1, 1)
-        
+
         ode_model = pm.ode.DifferentialEquation(func=freefall, times=times, n_states=1, n_theta=2, t0=0)
         with pm.Model() as model:
             # Specify prior distributions for some of our model parameters
@@ -255,12 +255,12 @@ def freefall(y, t, p):
             ode_solution = ode_model(y0=[0], theta=[gamma, 9.8])
             # The ode_solution has a shape of (n_times, n_states)
             Y = pm.Normal("Y", mu=ode_solution, sd=sigma, observed=y)
-            
+
             t0 = time.time()
             trace = pm.sample(500, tune=1000, chains=2, cores=2, random_seed=0)
             tot = time.time() - t0
         ess = pm.ess(trace)
         return np.mean([ess.sigma, ess.gamma]) / tot
 
-    
+
 DifferentialEquationSuite.track_1var_2par_ode_ess.unit = 'Effective samples per second'
diff --git a/pymc3/sampling.py b/pymc3/sampling.py
@@ -1853,8 +1853,8 @@ def init_nuts(
         * adapt_diag: Start with a identity mass matrix and then adapt a diagonal based on the
           variance of the tuning samples. All chains use the test value (usually the prior mean)
           as starting point.
-        * jitter+adapt_diag: Same as ``adapt_diag``, but use uniform jitter in [-1, 1] as starting
-          point in each chain.
+        * jitter+adapt_diag: Same as ``adapt_diag``, but use test value plus a uniform jitter in
+          [-1, 1] as starting point in each chain.
         * advi+adapt_diag: Run ADVI and then adapt the resulting diagonal mass matrix based on the
           sample variance of the tuning samples.
         * advi+adapt_diag_grad: Run ADVI and then adapt the resulting diagonal mass matrix based
@@ -1863,7 +1863,10 @@ def init_nuts(
         * advi: Run ADVI to estimate posterior mean and diagonal mass matrix.
         * advi_map: Initialize ADVI with MAP and use MAP as starting point.
         * map: Use the MAP as starting point. This is discouraged.
-        * adapt_full: Adapt a dense mass matrix using the sample covariances
+        * adapt_full: Adapt a dense mass matrix using the sample covariances. All chains use the
+          test value (usually the prior mean) as starting point.
+        * jitter+adapt_full: Same as ``adapt_full`, but use test value plus a uniform jitter in
+          [-1, 1] as starting point in each chain.
     chains: int
         Number of jobs to start.
     n_init: int
@@ -2001,6 +2004,16 @@ def init_nuts(
         mean = np.mean([model.dict_to_array(vals) for vals in start], axis=0)
         cov = np.eye(model.ndim)
         potential = quadpotential.QuadPotentialFullAdapt(model.ndim, mean, cov, 10)
+    elif init == 'jitter+adapt_full':
+        start = []
+        for _ in range(chains):
+            mean = {var: val.copy() for var, val in model.test_point.items()}
+            for val in mean.values():
+                val[...] += 2 * np.random.rand(*val.shape) - 1
+            start.append(mean)
+        mean = np.mean([model.dict_to_array(vals) for vals in start], axis=0)
+        cov = np.eye(model.ndim)
+        potential = quadpotential.QuadPotentialFullAdapt(model.ndim, mean, cov, 10)
     else:
         raise ValueError("Unknown initializer: {}.".format(init))
 

diff --git a/pymc3/tests/test_sampling.py b/pymc3/tests/test_sampling.py
@@ -675,6 +675,8 @@ def test_sample_posterior_predictive_w(self):
         "advi+adapt_diag_grad",
         "map",
         "advi_map",
+        "adapt_full",
+        "jitter+adapt_full",
     ],
 )
 def test_exec_nuts_init(method):