From e0d261dc41385f8d0f71948a809227a3c2f8d527 Mon Sep 17 00:00:00 2001 From: Michael Osthege Date: Sun, 6 Jun 2021 14:15:36 +0200 Subject: [PATCH] Return InferenceData by default Also removes some unnecessary XFAIL marks. Closes #4372, #4740 --- RELEASE-NOTES.md | 1 + benchmarks/benchmarks/benchmarks.py | 16 +- .../Advanced_usage_of_Aesara_in_PyMC3.rst | 8 +- docs/source/Gaussian_Processes.rst | 2 +- docs/source/about.rst | 4 +- pymc3/data.py | 4 +- pymc3/distributions/discrete.py | 14 +- pymc3/distributions/distribution.py | 2 +- pymc3/model.py | 4 +- pymc3/sampling.py | 22 +-- pymc3/step_methods/mlda.py | 4 +- pymc3/tests/test_data_container.py | 21 +-- pymc3/tests/test_distributions_random.py | 25 ++-- pymc3/tests/test_examples.py | 12 +- pymc3/tests/test_hmc.py | 11 +- pymc3/tests/test_idata_conversion.py | 16 +- pymc3/tests/test_mixture.py | 4 +- pymc3/tests/test_ndarray_backend.py | 4 +- pymc3/tests/test_ode.py | 39 +++-- pymc3/tests/test_parallel_sampling.py | 4 +- pymc3/tests/test_sampling.py | 82 +++++----- pymc3/tests/test_shared.py | 6 +- pymc3/tests/test_step.py | 141 ++++++++++-------- pymc3/tests/test_variational_inference.py | 10 +- 24 files changed, 225 insertions(+), 231 deletions(-) diff --git a/RELEASE-NOTES.md b/RELEASE-NOTES.md index f1daf593aa3..0e220ea2ac1 100644 --- a/RELEASE-NOTES.md +++ b/RELEASE-NOTES.md @@ -6,6 +6,7 @@ - ArviZ `plots` and `stats` *wrappers* were removed. The functions are now just available by their original names (see [#4549](https://github.com/pymc-devs/pymc3/pull/4471) and `3.11.2` release notes). - The GLM submodule has been removed, please use [Bambi](https://bambinos.github.io/bambi/) instead. - The `Distribution` keyword argument `testval` has been deprecated in favor of `initval`. +- `pm.sample` now returns results as `InferenceData` instead of `MultiTrace` by default (see [#4744](https://github.com/pymc-devs/pymc3/pull/4744)). - ... 
### New Features diff --git a/benchmarks/benchmarks/benchmarks.py b/benchmarks/benchmarks/benchmarks.py index a518c6c0301..283b236ff58 100644 --- a/benchmarks/benchmarks/benchmarks.py +++ b/benchmarks/benchmarks/benchmarks.py @@ -181,7 +181,7 @@ def track_glm_hierarchical_ess(self, init): init=init, chains=self.chains, progressbar=False, random_seed=123 ) t0 = time.time() - trace = pm.sample( + idata = pm.sample( draws=self.draws, step=step, cores=4, @@ -192,7 +192,7 @@ def track_glm_hierarchical_ess(self, init): compute_convergence_checks=False, ) tot = time.time() - t0 - ess = float(az.ess(trace, var_names=["mu_a"])["mu_a"].values) + ess = float(az.ess(idata, var_names=["mu_a"])["mu_a"].values) return ess / tot def track_marginal_mixture_model_ess(self, init): @@ -203,7 +203,7 @@ def track_marginal_mixture_model_ess(self, init): ) start = [{k: v for k, v in start.items()} for _ in range(self.chains)] t0 = time.time() - trace = pm.sample( + idata = pm.sample( draws=self.draws, step=step, cores=4, @@ -214,7 +214,7 @@ def track_marginal_mixture_model_ess(self, init): compute_convergence_checks=False, ) tot = time.time() - t0 - ess = az.ess(trace, var_names=["mu"])["mu"].values.min() # worst case + ess = az.ess(idata, var_names=["mu"])["mu"].values.min() # worst case return ess / tot @@ -235,7 +235,7 @@ def track_glm_hierarchical_ess(self, step): if step is not None: step = step() t0 = time.time() - trace = pm.sample( + idata = pm.sample( draws=self.draws, step=step, cores=4, @@ -245,7 +245,7 @@ def track_glm_hierarchical_ess(self, step): compute_convergence_checks=False, ) tot = time.time() - t0 - ess = float(az.ess(trace, var_names=["mu_a"])["mu_a"].values) + ess = float(az.ess(idata, var_names=["mu_a"])["mu_a"].values) return ess / tot @@ -302,9 +302,9 @@ def freefall(y, t, p): Y = pm.Normal("Y", mu=ode_solution, sd=sigma, observed=y) t0 = time.time() - trace = pm.sample(500, tune=1000, chains=2, cores=2, random_seed=0) + idata = pm.sample(500, tune=1000, chains=2, cores=2, random_seed=0) tot = time.time() - t0 - ess = az.ess(trace) + ess = az.ess(idata) return np.mean([ess.sigma, ess.gamma]) / tot diff --git a/docs/source/Advanced_usage_of_Aesara_in_PyMC3.rst b/docs/source/Advanced_usage_of_Aesara_in_PyMC3.rst index 60187ede867..9aa235426b9 100644 --- a/docs/source/Advanced_usage_of_Aesara_in_PyMC3.rst +++ b/docs/source/Advanced_usage_of_Aesara_in_PyMC3.rst @@ -40,12 +40,12 @@ be time consuming if the number of datasets is large):: pm.Normal('y', mu=mu, sigma=1, observed=data) # Generate one trace for each dataset - traces = [] + idatas = [] for data_vals in observed_data: # Switch out the observed dataset data.set_value(data_vals) with model: - traces.append(pm.sample()) + idatas.append(pm.sample()) We can also sometimes use shared variables to work around limitations in the current PyMC3 api. 
A common task in Machine Learning is to predict @@ -63,7 +63,7 @@ variable for our observations:: pm.Bernoulli('obs', p=logistic, observed=y) # fit the model - trace = pm.sample() + idata = pm.sample() # Switch out the observations and use `sample_posterior_predictive` to predict x_shared.set_value([-1, 0, 1.]) @@ -220,4 +220,4 @@ We can now define our model using this new `Op`:: mu = pm.Deterministic('mu', at_mu_from_theta(theta)) pm.Normal('y', mu=mu, sigma=0.1, observed=[0.2, 0.21, 0.3]) - trace = pm.sample() + idata = pm.sample() diff --git a/docs/source/Gaussian_Processes.rst b/docs/source/Gaussian_Processes.rst index d357cea0e31..d4dbabc29be 100644 --- a/docs/source/Gaussian_Processes.rst +++ b/docs/source/Gaussian_Processes.rst @@ -231,7 +231,7 @@ other implementations. The first block fits the GP prior. We denote f = gp.marginal_likelihood("f", X, y, noise) - trace = pm.sample(1000) + idata = pm.sample(1000) To construct the conditional distribution of :code:`gp1` or :code:`gp2`, we diff --git a/docs/source/about.rst b/docs/source/about.rst index 0a06f3c2fd4..7a8b6922e8a 100644 --- a/docs/source/about.rst +++ b/docs/source/about.rst @@ -237,9 +237,9 @@ Save this file, then from a python shell (or another file in the same directory) with bioassay_model: # Draw samples - trace = pm.sample(1000, tune=2000, cores=2) + idata = pm.sample(1000, tune=2000, cores=2) # Plot two parameters - az.plot_forest(trace, var_names=['alpha', 'beta'], r_hat=True) + az.plot_forest(idata, var_names=['alpha', 'beta'], r_hat=True) This example will generate 1000 posterior samples on each of two cores using the NUTS algorithm, preceded by 2000 tuning samples (these are good default numbers for most models). diff --git a/pymc3/data.py b/pymc3/data.py index 06dfb2766bc..6a582c1b06b 100644 --- a/pymc3/data.py +++ b/pymc3/data.py @@ -498,12 +498,12 @@ class Data: ... pm.Normal('y', mu=mu, sigma=1, observed=data) >>> # Generate one trace for each dataset - >>> traces = [] + >>> idatas = [] >>> for data_vals in observed_data: ... with model: ... # Switch out the observed dataset ... model.set_data('data', data_vals) - ... traces.append(pm.sample()) + ... idatas.append(pm.sample()) To set the value of the data container variable, check out :func:`pymc3.model.set_data()`. 
diff --git a/pymc3/distributions/discrete.py b/pymc3/distributions/discrete.py index ae3fba4639b..b6a831f3b2e 100644 --- a/pymc3/distributions/discrete.py +++ b/pymc3/distributions/discrete.py @@ -1691,14 +1691,15 @@ class OrderedLogistic(Categorical): cutpoints = pm.Normal("cutpoints", mu=[-1,1], sigma=10, shape=2, transform=pm.distributions.transforms.ordered) y_ = pm.OrderedLogistic("y", cutpoints=cutpoints, eta=x, observed=y) - tr = pm.sample(1000) + idata = pm.sample(1000) # Plot the results plt.hist(cluster1, 30, alpha=0.5); plt.hist(cluster2, 30, alpha=0.5); plt.hist(cluster3, 30, alpha=0.5); - plt.hist(tr["cutpoints"][:,0], 80, alpha=0.2, color='k'); - plt.hist(tr["cutpoints"][:,1], 80, alpha=0.2, color='k'); + posterior = idata.posterior.stack(sample=("chain", "draw")) + plt.hist(posterior["cutpoints"][0], 80, alpha=0.2, color='k'); + plt.hist(posterior["cutpoints"][1], 80, alpha=0.2, color='k'); """ @@ -1782,14 +1783,15 @@ class OrderedProbit(Categorical): cutpoints = pm.Normal("cutpoints", mu=[-1,1], sigma=10, shape=2, transform=pm.distributions.transforms.ordered) y_ = pm.OrderedProbit("y", cutpoints=cutpoints, eta=x, observed=y) - tr = pm.sample(1000) + idata = pm.sample(1000) # Plot the results plt.hist(cluster1, 30, alpha=0.5); plt.hist(cluster2, 30, alpha=0.5); plt.hist(cluster3, 30, alpha=0.5); - plt.hist(tr["cutpoints"][:,0], 80, alpha=0.2, color='k'); - plt.hist(tr["cutpoints"][:,1], 80, alpha=0.2, color='k'); + posterior = idata.posterior.stack(sample=("chain", "draw")) + plt.hist(posterior["cutpoints"][0], 80, alpha=0.2, color='k'); + plt.hist(posterior["cutpoints"][1], 80, alpha=0.2, color='k'); """ diff --git a/pymc3/distributions/distribution.py b/pymc3/distributions/distribution.py index 26530af9a01..3f3f3d1e2d2 100644 --- a/pymc3/distributions/distribution.py +++ b/pymc3/distributions/distribution.py @@ -495,7 +495,7 @@ def __init__( normal_dist.logp, observed=np.random.randn(100), ) - trace = pm.sample(100) + idata = pm.sample(100) .. code-block:: python diff --git a/pymc3/model.py b/pymc3/model.py index 0139c430b7e..81c6f4f437f 100644 --- a/pymc3/model.py +++ b/pymc3/model.py @@ -1696,7 +1696,7 @@ def set_data(new_data, model=None): ... y = pm.Data('y', [1., 2., 3.]) ... beta = pm.Normal('beta', 0, 1) ... obs = pm.Normal('obs', x * beta, 1, observed=y) - ... trace = pm.sample(1000, tune=1000) + ... idata = pm.sample(1000, tune=1000) Set the value of `x` to predict on new data. @@ -1704,7 +1704,7 @@ def set_data(new_data, model=None): >>> with model: ... pm.set_data({'x': [5., 6., 9.]}) - ... y_test = pm.sample_posterior_predictive(trace) + ... y_test = pm.sample_posterior_predictive(idata) >>> y_test['obs'].mean(axis=0) array([4.6088569 , 5.54128318, 8.32953844]) """ diff --git a/pymc3/sampling.py b/pymc3/sampling.py index 39ef3ca1e43..a24ca8d259e 100644 --- a/pymc3/sampling.py +++ b/pymc3/sampling.py @@ -27,7 +27,6 @@ import aesara.gradient as tg import numpy as np -import packaging import xarray from aesara.compile.mode import Mode @@ -355,7 +354,7 @@ def sample( Maximum number of repeated attempts (per chain) at creating an initial matrix with uniform jitter that yields a finite probability. This applies to ``jitter+adapt_diag`` and ``jitter+adapt_full`` init methods. - return_inferencedata : bool, default=False + return_inferencedata : bool, default=True Whether to return the trace as an :class:`arviz:arviz.InferenceData` (True) object or a `MultiTrace` (False) Defaults to `False`, but we'll switch to `True` in an upcoming release. 
idata_kwargs : dict, optional @@ -430,9 +429,9 @@ def sample( In [2]: with pm.Model() as model: # context management ...: p = pm.Beta("p", alpha=alpha, beta=beta) ...: y = pm.Binomial("y", n=n, p=p, observed=h) - ...: trace = pm.sample() + ...: idata = pm.sample() - In [3]: az.summary(trace, kind="stats") + In [3]: az.summary(idata, kind="stats") Out[3]: mean sd hdi_3% hdi_97% @@ -471,6 +470,9 @@ def sample( if not isinstance(random_seed, abc.Iterable): raise TypeError("Invalid value for `random_seed`. Must be tuple, list or int") + if return_inferencedata is None: + return_inferencedata = True + if not discard_tuned_samples and not return_inferencedata: warnings.warn( "Tuning samples will be included in the returned `MultiTrace` object, which can lead to" @@ -480,18 +482,6 @@ def sample( stacklevel=2, ) - if return_inferencedata is None: - v = packaging.version.parse(pm.__version__) - if v.release[0] > 3 or v.release[1] >= 10: # type: ignore - warnings.warn( - "In v4.0, pm.sample will return an `arviz.InferenceData` object instead of a `MultiTrace` by default. " - "You can pass return_inferencedata=True or return_inferencedata=False to be safe and silence this warning.", - FutureWarning, - stacklevel=2, - ) - # set the default - return_inferencedata = False - if start is not None: for start_vals in start: _check_start_shape(model, start_vals) diff --git a/pymc3/step_methods/mlda.py b/pymc3/step_methods/mlda.py index 600e8beb4ba..3dfdd14ba94 100644 --- a/pymc3/step_methods/mlda.py +++ b/pymc3/step_methods/mlda.py @@ -334,11 +334,11 @@ class MLDA(ArrayStepShared): ... y = pm.Normal("y", mu=x, sigma=1, observed=datum) ... step_method = pm.MLDA(coarse_models=[coarse_model], ... subsampling_rates=5) - ... trace = pm.sample(500, chains=2, + ... idata = pm.sample(500, chains=2, ... tune=100, step=step_method, ... random_seed=123) ... - ... az.summary(trace, kind="stats") + ... az.summary(idata, kind="stats") mean sd hdi_3% hdi_97% x 0.99 0.987 -0.734 2.992 diff --git a/pymc3/tests/test_data_container.py b/pymc3/tests/test_data_container.py index 93acae3057c..f4974b363d3 100644 --- a/pymc3/tests/test_data_container.py +++ b/pymc3/tests/test_data_container.py @@ -49,12 +49,12 @@ def test_sample(self): pm.Normal("obs", b * x_shared, np.sqrt(1e-2), observed=y) prior_trace0 = pm.sample_prior_predictive(1000) - trace = pm.sample(1000, init=None, tune=1000, chains=1) - pp_trace0 = pm.sample_posterior_predictive(trace, 1000) + idata = pm.sample(1000, init=None, tune=1000, chains=1) + pp_trace0 = pm.sample_posterior_predictive(idata, 1000) x_shared.set_value(x_pred) prior_trace1 = pm.sample_prior_predictive(1000) - pp_trace1 = pm.sample_posterior_predictive(trace, samples=1000) + pp_trace1 = pm.sample_posterior_predictive(idata, samples=1000) assert prior_trace0["b"].shape == (1000,) assert prior_trace0["obs"].shape == (1000, 100) @@ -101,7 +101,6 @@ def test_sample_after_set_data(self): init=None, tune=1000, chains=1, - return_inferencedata=False, compute_convergence_checks=False, ) # Predict on new data. 
@@ -109,15 +108,14 @@ def test_sample_after_set_data(self): new_y = [5.0, 6.0, 9.0] with model: pm.set_data(new_data={"x": new_x, "y": new_y}) - new_trace = pm.sample( + new_idata = pm.sample( 1000, init=None, tune=1000, chains=1, - return_inferencedata=False, compute_convergence_checks=False, ) - pp_trace = pm.sample_posterior_predictive(new_trace, 1000) + pp_trace = pm.sample_posterior_predictive(new_idata, 1000) assert pp_trace["obs"].shape == (1000, 3) np.testing.assert_allclose(new_y, pp_trace["obs"].mean(axis=0), atol=1e-1) @@ -134,12 +132,11 @@ def test_shared_data_as_index(self): pm.Normal("obs", alpha[index], np.sqrt(1e-2), observed=y) prior_trace = pm.sample_prior_predictive(1000, var_names=["alpha"]) - trace = pm.sample( + idata = pm.sample( 1000, init=None, tune=1000, chains=1, - return_inferencedata=False, compute_convergence_checks=False, ) @@ -148,10 +145,10 @@ def test_shared_data_as_index(self): new_y = [5.0, 6.0, 9.0] with model: pm.set_data(new_data={"index": new_index, "y": new_y}) - pp_trace = pm.sample_posterior_predictive(trace, 1000, var_names=["alpha", "obs"]) + pp_trace = pm.sample_posterior_predictive(idata, 1000, var_names=["alpha", "obs"]) assert prior_trace["alpha"].shape == (1000, 3) - assert trace["alpha"].shape == (1000, 3) + assert idata.posterior["alpha"].shape == (1, 1000, 3) assert pp_trace["alpha"].shape == (1000, 3) assert pp_trace["obs"].shape == (1000, 3) @@ -233,7 +230,6 @@ def test_set_data_to_non_data_container_variables(self): init=None, tune=1000, chains=1, - return_inferencedata=False, compute_convergence_checks=False, ) with pytest.raises(TypeError) as error: @@ -253,7 +249,6 @@ def test_model_to_graphviz_for_model_with_data_container(self): init=None, tune=1000, chains=1, - return_inferencedata=False, compute_convergence_checks=False, ) diff --git a/pymc3/tests/test_distributions_random.py b/pymc3/tests/test_distributions_random.py index f9844ce5986..a523dda75de 100644 --- a/pymc3/tests/test_distributions_random.py +++ b/pymc3/tests/test_distributions_random.py @@ -1658,11 +1658,11 @@ def test_density_dist_with_random_sampleable(self, shape): shape=shape, random=normal_dist.random, ) - trace = pm.sample(100, cores=1) + idata = pm.sample(100, cores=1) samples = 500 size = 100 - ppc = pm.sample_posterior_predictive(trace, samples=samples, model=model, size=size) + ppc = pm.sample_posterior_predictive(idata, samples=samples, model=model, size=size) assert ppc["density_dist"].shape == (samples, size) + obs.distribution.shape @pytest.mark.parametrize("shape", [(), (3,), (3, 2)], ids=str) @@ -1678,11 +1678,11 @@ def test_density_dist_with_random_sampleable_failure(self, shape): random=normal_dist.random, wrap_random_with_dist_shape=False, ) - trace = pm.sample(100, cores=1) + idata = pm.sample(100, cores=1) samples = 500 with pytest.raises(RuntimeError): - pm.sample_posterior_predictive(trace, samples=samples, model=model, size=100) + pm.sample_posterior_predictive(idata, samples=samples, model=model, size=100) @pytest.mark.parametrize("shape", [(), (3,), (3, 2)], ids=str) def test_density_dist_with_random_sampleable_hidden_error(self, shape): @@ -1698,10 +1698,10 @@ def test_density_dist_with_random_sampleable_hidden_error(self, shape): wrap_random_with_dist_shape=False, check_shape_in_random=False, ) - trace = pm.sample(100, cores=1) + idata = pm.sample(100, cores=1) samples = 500 - ppc = pm.sample_posterior_predictive(trace, samples=samples, model=model) + ppc = pm.sample_posterior_predictive(idata, samples=samples, model=model) assert 
len(ppc["density_dist"]) == samples assert ((samples,) + obs.distribution.shape) != ppc["density_dist"].shape @@ -1717,11 +1717,11 @@ def test_density_dist_with_random_sampleable_handcrafted_success(self): random=rvs, wrap_random_with_dist_shape=False, ) - trace = pm.sample(100, cores=1) + idata = pm.sample(100, cores=1) samples = 500 size = 100 - ppc = pm.sample_posterior_predictive(trace, samples=samples, model=model, size=size) + ppc = pm.sample_posterior_predictive(idata, samples=samples, model=model, size=size) assert ppc["density_dist"].shape == (samples, size) + obs.distribution.shape @pytest.mark.xfail @@ -1737,21 +1737,18 @@ def test_density_dist_with_random_sampleable_handcrafted_success_fast(self): random=rvs, wrap_random_with_dist_shape=False, ) - trace = pm.sample(100, cores=1) - - samples = 500 - size = 100 + pm.sample(100, cores=1) def test_density_dist_without_random_not_sampleable(self): with pm.Model() as model: mu = pm.Normal("mu", 0, 1) normal_dist = pm.Normal.dist(mu, 1) pm.DensityDist("density_dist", normal_dist.logp, observed=np.random.randn(100)) - trace = pm.sample(100, cores=1) + idata = pm.sample(100, cores=1) samples = 500 with pytest.raises(ValueError): - pm.sample_posterior_predictive(trace, samples=samples, model=model, size=100) + pm.sample_posterior_predictive(idata, samples=samples, model=model, size=100) @pytest.mark.xfail(reason="This distribution has not been refactored for v4") diff --git a/pymc3/tests/test_examples.py b/pymc3/tests/test_examples.py index 3eb7d3992f1..ab4d6f5349d 100644 --- a/pymc3/tests/test_examples.py +++ b/pymc3/tests/test_examples.py @@ -193,10 +193,6 @@ def build_disaster_model(masked=False): return model -@pytest.mark.xfail( - reason="Arviz summary fails" - # condition=(aesara.config.floatX == "float32"), reason="Fails on float32" -) class TestDisasterModel(SeededTest): @pytest.mark.xfail(condition=(aesara.config.floatX == "float32"), reason="Fails on float32") # Time series of recorded coal mining disasters in the UK from 1851 to 1962 @@ -207,8 +203,8 @@ def test_disaster_model(self): start = {"early_mean": 2, "late_mean": 3.0} # Use slice sampler for means (other variables auto-selected) step = pm.Slice([model["early_mean_log__"], model["late_mean_log__"]]) - tr = pm.sample(500, tune=50, start=start, step=step, chains=2) - az.summary(tr) + idata = pm.sample(500, tune=50, start=start, step=step, chains=2) + az.summary(idata) @pytest.mark.xfail(condition=(aesara.config.floatX == "float32"), reason="Fails on float32") def test_disaster_model_missing(self): @@ -218,8 +214,8 @@ def test_disaster_model_missing(self): start = {"early_mean": 2.0, "late_mean": 3.0} # Use slice sampler for means (other variables auto-selected) step = pm.Slice([model["early_mean_log__"], model["late_mean_log__"]]) - tr = pm.sample(500, tune=50, start=start, step=step, chains=2) - az.summary(tr) + idata = pm.sample(500, tune=50, start=start, step=step, chains=2) + az.summary(idata) class TestLatentOccupancy(SeededTest): diff --git a/pymc3/tests/test_hmc.py b/pymc3/tests/test_hmc.py index 82c59291e14..bd588393b15 100644 --- a/pymc3/tests/test_hmc.py +++ b/pymc3/tests/test_hmc.py @@ -56,11 +56,14 @@ def _hamiltonian_step(self, *args, **kwargs): def test_nuts_tuning(): - model = pymc3.Model() - with model: + with pymc3.Model(): pymc3.Normal("mu", mu=0, sigma=1) step = pymc3.NUTS() - trace = pymc3.sample(10, step=step, tune=5, progressbar=False, chains=1) + idata = pymc3.sample( + 10, step=step, tune=5, discard_tuned_samples=False, progressbar=False, chains=1 
+ ) assert not step.tune - assert np.all(trace["step_size"][5:] == trace["step_size"][5]) + ss_tuned = idata.warmup_sample_stats["step_size"][0, -1] + ss_posterior = idata.sample_stats["step_size"][0, :] + np.testing.assert_array_equal(ss_posterior, ss_tuned) diff --git a/pymc3/tests/test_idata_conversion.py b/pymc3/tests/test_idata_conversion.py index 63d09dd8857..8ba0be2e210 100644 --- a/pymc3/tests/test_idata_conversion.py +++ b/pymc3/tests/test_idata_conversion.py @@ -56,7 +56,7 @@ def data(self, eight_schools_params, draws, chains): sd=eight_schools_params["sigma"], observed=eight_schools_params["y"], ) - trace = pm.sample(draws, chains=chains) + trace = pm.sample(draws, chains=chains, return_inferencedata=False) return self.Data(model, trace) @@ -224,7 +224,6 @@ def test_posterior_predictive_warning(self, data, eight_schools_params, caplog): assert len(records) == 1 assert records[0].levelname == "WARNING" - @pytest.mark.xfail(reason="Dims option is not supported yet") @pytest.mark.parametrize("use_context", [True, False]) def test_autodetect_coords_from_model(self, use_context): df_data = pd.DataFrame(columns=["date"]).set_index("date") @@ -268,7 +267,6 @@ def test_autodetect_coords_from_model(self, use_context): np.testing.assert_array_equal(idata.observed_data.coords["date"], coords["date"]) np.testing.assert_array_equal(idata.observed_data.coords["city"], coords["city"]) - @pytest.mark.xfail(reason="Dims option is not supported yet") def test_ovewrite_model_coords_dims(self): """Check coords and dims from model object can be partially overwrited.""" dim1 = ["a", "b"] @@ -280,7 +278,7 @@ def test_ovewrite_model_coords_dims(self): x = pm.Data("x", x_data, dims=("dim1", "dim2")) beta = pm.Normal("beta", 0, 1, dims="dim1") _ = pm.Normal("obs", x * beta, 1, observed=y, dims=("dim1", "dim2")) - trace = pm.sample(100, tune=100) + trace = pm.sample(100, tune=100, return_inferencedata=False) idata1 = to_inference_data(trace) idata2 = to_inference_data(trace, coords={"dim1": new_dim1}, dims={"beta": ["dim2"]}) @@ -449,7 +447,7 @@ def test_constant_data(self, use_context): y = pm.Data("y", [1.0, 2.0, 3.0]) beta = pm.Normal("beta", 0, 1) obs = pm.Normal("obs", x * beta, 1, observed=y) # pylint: disable=unused-variable - trace = pm.sample(100, tune=100) + trace = pm.sample(100, tune=100, return_inferencedata=False) if use_context: inference_data = to_inference_data(trace=trace) @@ -465,7 +463,7 @@ def test_predictions_constant_data(self): y = pm.Data("y", [1.0, 2.0, 3.0]) beta = pm.Normal("beta", 0, 1) obs = pm.Normal("obs", x * beta, 1, observed=y) # pylint: disable=unused-variable - trace = pm.sample(100, tune=100) + trace = pm.sample(100, tune=100, return_inferencedata=False) inference_data = to_inference_data(trace) test_dict = {"posterior": ["beta"], "observed_data": ["obs"], "constant_data": ["x"]} @@ -499,9 +497,9 @@ def test_no_trace(self): y = pm.Data("y", [1.0, 2.0, 3.0]) beta = pm.Normal("beta", 0, 1) obs = pm.Normal("obs", x * beta, 1, observed=y) # pylint: disable=unused-variable - trace = pm.sample(100, tune=100) + idata = pm.sample(100, tune=100) prior = pm.sample_prior_predictive() - posterior_predictive = pm.sample_posterior_predictive(trace) + posterior_predictive = pm.sample_posterior_predictive(idata) # Only prior inference_data = to_inference_data(prior=prior, model=model) @@ -548,7 +546,6 @@ def test_priors_separation(self, use_context): fails = check_multiple_attrs(test_dict, inference_data) assert not fails - @pytest.mark.xfail(reason="Dims option is not supported 
yet") def test_multivariate_observations(self): coords = {"direction": ["x", "y", "z"], "experiment": np.arange(20)} data = np.random.multinomial(20, [0.2, 0.3, 0.5], size=20) @@ -616,6 +613,7 @@ def test_save_warmup_issue_1208_after_3_9(self): cores=1, step=pm.Metropolis(), discard_tuned_samples=False, + return_inferencedata=False, ) assert isinstance(trace, pm.backends.base.MultiTrace) assert len(trace) == 300 diff --git a/pymc3/tests/test_mixture.py b/pymc3/tests/test_mixture.py index eb87f132508..08658043b6f 100644 --- a/pymc3/tests/test_mixture.py +++ b/pymc3/tests/test_mixture.py @@ -375,11 +375,11 @@ def build_toy_dataset(N, K): pm.Mixture("x_obs", pi, comp_dist, observed=X) with model: - trace = pm.sample(30, tune=10, chains=1) + idata = pm.sample(30, tune=10, chains=1) n_samples = 20 with model: - ppc = pm.sample_posterior_predictive(trace, n_samples) + ppc = pm.sample_posterior_predictive(idata, n_samples) prior = pm.sample_prior_predictive(samples=n_samples) assert ppc["x_obs"].shape == (n_samples,) + X.shape assert prior["x_obs"].shape == (n_samples,) + X.shape diff --git a/pymc3/tests/test_ndarray_backend.py b/pymc3/tests/test_ndarray_backend.py index df71e077644..71bb87f6db6 100644 --- a/pymc3/tests/test_ndarray_backend.py +++ b/pymc3/tests/test_ndarray_backend.py @@ -219,7 +219,7 @@ def model(rng_seeder=None): @classmethod def setup_class(cls): with TestSaveLoad.model(): - cls.trace = pm.sample() + cls.trace = pm.sample(return_inferencedata=False) def test_save_new_model(self, tmpdir_factory): directory = str(tmpdir_factory.mktemp("data")) @@ -228,7 +228,7 @@ def test_save_new_model(self, tmpdir_factory): assert save_dir == directory with pm.Model() as model: w = pm.Normal("w", 0, 1) - new_trace = pm.sample() + new_trace = pm.sample(return_inferencedata=False) with pytest.raises(OSError): _ = pm.save_trace(new_trace, directory) diff --git a/pymc3/tests/test_ode.py b/pymc3/tests/test_ode.py index d910e7b6b49..95937a36746 100644 --- a/pymc3/tests/test_ode.py +++ b/pymc3/tests/test_ode.py @@ -265,7 +265,7 @@ def ode_func(y, t, p): assert op_1 != op_other return - @pytest.mark.xfail(reason="HalfCauchy was not yet refactored") + @pytest.mark.xfail(reason="https://github.com/pymc-devs/aesara/issues/390") def test_scalar_ode_1_param(self): """Test running model for a scalar ODE with 1 parameter""" @@ -288,13 +288,13 @@ def system(y, t, p): sigma = pm.HalfCauchy("sigma", 1) forward = ode_model(theta=[alpha], y0=[y0]) y = pm.Lognormal("y", mu=pm.math.log(forward), sd=sigma, observed=yobs) - trace = pm.sample(100, tune=0, chains=1) + idata = pm.sample(100, tune=0, chains=1) - assert trace["alpha"].size > 0 - assert trace["y0"].size > 0 - assert trace["sigma"].size > 0 + assert idata.posterior["alpha"].shape == (1, 100) + assert idata.posterior["y0"].shape == (1, 100) + assert idata.posterior["sigma"].shape == (1, 100) - @pytest.mark.xfail(reason="HalfCauchy was not yet refactored") + @pytest.mark.xfail(reason="https://github.com/pymc-devs/aesara/issues/390") def test_scalar_ode_2_param(self): """Test running model for a scalar ODE with 2 parameters""" @@ -319,14 +319,14 @@ def system(y, t, p): forward = ode_model(theta=[alpha, beta], y0=[y0]) y = pm.Lognormal("y", mu=pm.math.log(forward), sd=sigma, observed=yobs) - trace = pm.sample(100, tune=0, chains=1) + idata = pm.sample(100, tune=0, chains=1) - assert trace["alpha"].size > 0 - assert trace["beta"].size > 0 - assert trace["y0"].size > 0 - assert trace["sigma"].size > 0 + assert idata.posterior["alpha"].shape == (1, 100) + assert 
idata.posterior["beta"].shape == (1, 100) + assert idata.posterior["y0"].shape == (1, 100) + assert idata.posterior["sigma"].shape == (1, 100) - @pytest.mark.xfail(reason="HalfCauchy was not yet refactored") + @pytest.mark.xfail(reason="https://github.com/pymc-devs/aesara/issues/390") def test_vector_ode_1_param(self): """Test running model for a vector ODE with 1 parameter""" @@ -361,12 +361,11 @@ def system(y, t, p): forward = ode_model(theta=[R], y0=[0.99, 0.01]) y = pm.Lognormal("y", mu=pm.math.log(forward), sd=sigma, observed=yobs) - trace = pm.sample(100, tune=0, chains=1) + idata = pm.sample(100, tune=0, chains=1) - assert trace["R"].size > 0 - assert trace["sigma"].size > 0 + assert idata.posterior["R"].shape == (1, 100) + assert idata.posterior["sigma"].shape == (1, 100, 2) - @pytest.mark.xfail(reason="HalfCauchy was not yet refactored") def test_vector_ode_2_param(self): """Test running model for a vector ODE with 2 parameters""" @@ -402,8 +401,8 @@ def system(y, t, p): forward = ode_model(theta=[beta, gamma], y0=[0.99, 0.01]) y = pm.Lognormal("y", mu=pm.math.log(forward), sd=sigma, observed=yobs) - trace = pm.sample(100, tune=0, chains=1) + idata = pm.sample(100, tune=0, chains=1) - assert trace["beta"].size > 0 - assert trace["gamma"].size > 0 - assert trace["sigma"].size > 0 + assert idata.posterior["beta"].shape == (1, 100) + assert idata.posterior["gamma"].shape == (1, 100) + assert idata.posterior["sigma"].shape == (1, 100, 2) diff --git a/pymc3/tests/test_parallel_sampling.py b/pymc3/tests/test_parallel_sampling.py index c8627e4c554..d8cc36620ce 100644 --- a/pymc3/tests/test_parallel_sampling.py +++ b/pymc3/tests/test_parallel_sampling.py @@ -173,7 +173,7 @@ def func(x): return -2 * (x ** 2).sum() obs = pm.DensityDist("density_dist", func, observed=np.random.randn(100)) - trace = pm.sample(draws=10, tune=10, step=pm.Metropolis(), cores=2, mp_ctx="spawn") + pm.sample(draws=10, tune=10, step=pm.Metropolis(), cores=2, mp_ctx="spawn") def test_spawn_densitydist_bound_method(): @@ -183,7 +183,7 @@ def test_spawn_densitydist_bound_method(): obs = pm.DensityDist("density_dist", normal_dist.logp, observed=np.random.randn(100)) msg = "logp for DensityDist is a bound method, leading to RecursionError while serializing" with pytest.raises(ValueError, match=msg): - trace = pm.sample(draws=10, tune=10, step=pm.Metropolis(), cores=2, mp_ctx="spawn") + pm.sample(draws=10, tune=10, step=pm.Metropolis(), cores=2, mp_ctx="spawn") def test_spawn_densitydist_syswarning(monkeypatch): diff --git a/pymc3/tests/test_sampling.py b/pymc3/tests/test_sampling.py index f1405bc12ef..79ec2f0ed63 100644 --- a/pymc3/tests/test_sampling.py +++ b/pymc3/tests/test_sampling.py @@ -60,13 +60,13 @@ def test_parallel_sample_does_not_reuse_seed(self): for _ in range(2): np.random.seed(1) # seeds in other processes don't effect main process with self.model: - trace = pm.sample(100, tune=0, cores=cores) + idata = pm.sample(100, tune=0, cores=cores) # numpy thread mentioned race condition. 
might as well check none are equal for first, second in combinations(range(cores), 2): - first_chain = trace.get_values("x", chains=first) - second_chain = trace.get_values("x", chains=second) - assert not (first_chain == second_chain).all() - draws.append(trace.get_values("x")) + first_chain = idata.posterior["x"].sel(chain=first).values + second_chain = idata.posterior["x"].sel(chain=second).values + assert not np.allclose(first_chain, second_chain) + draws.append(idata.posterior["x"].values) random_numbers.append(np.random.random()) # Make sure future random processes aren't effected by this @@ -125,7 +125,7 @@ def test_iter_sample(self): def test_parallel_start(self): with self.model: - tr = pm.sample( + idata = pm.sample( 0, tune=5, cores=2, @@ -133,16 +133,18 @@ def test_parallel_start(self): start=[{"x": [10, 10]}, {"x": [-10, -10]}], random_seed=self.random_seed, ) - assert tr.get_values("x", chains=0)[0][0] > 0 - assert tr.get_values("x", chains=1)[0][0] < 0 + assert idata.warmup_posterior["x"].sel(chain=0, draw=0).values[0] > 0 + assert idata.warmup_posterior["x"].sel(chain=1, draw=0).values[0] < 0 def test_sample_tune_len(self): with self.model: - trace = pm.sample(draws=100, tune=50, cores=1) + trace = pm.sample(draws=100, tune=50, cores=1, return_inferencedata=False) assert len(trace) == 100 - trace = pm.sample(draws=100, tune=50, cores=1, discard_tuned_samples=False) + trace = pm.sample( + draws=100, tune=50, cores=1, return_inferencedata=False, discard_tuned_samples=False + ) assert len(trace) == 150 - trace = pm.sample(draws=100, tune=50, cores=4) + trace = pm.sample(draws=100, tune=50, cores=4, return_inferencedata=False) assert len(trace) == 100 def test_reset_tuning(self): @@ -183,12 +185,12 @@ def test_trace_report_bart(self): mu = pm.BART("mu", X, Y, m=20) sigma = pm.HalfNormal("sigma", 1) y = pm.Normal("y", mu, sigma, observed=Y) - trace = pm.sample(500, tune=100, random_seed=3415) + trace = pm.sample(500, tune=100, random_seed=3415, return_inferencedata=False) var_imp = trace.report.variable_importance assert var_imp[0] > var_imp[1:].sum() npt.assert_almost_equal(var_imp.sum(), 1) - def test_return_inferencedata(self, monkeypatch): + def test_return_inferencedata(self): with self.model: kwargs = dict(draws=100, tune=50, cores=1, chains=2, step=pm.Metropolis()) @@ -222,16 +224,16 @@ def test_return_inferencedata(self, monkeypatch): assert result.posterior.sizes["chain"] == 2 assert len(result._groups_warmup) == 0 - # check warning for version 3.10 onwards - monkeypatch.setattr("pymc3.__version__", "3.10") - with pytest.warns(FutureWarning, match="pass return_inferencedata"): - result = pm.sample(**kwargs) - @pytest.mark.parametrize("cores", [1, 2]) def test_sampler_stat_tune(self, cores): with self.model: tune_stat = pm.sample( - tune=5, draws=7, cores=cores, discard_tuned_samples=False, step=pm.Metropolis() + tune=5, + draws=7, + cores=cores, + discard_tuned_samples=False, + return_inferencedata=False, + step=pm.Metropolis(), ).get_sampler_stats("tune", chains=1) assert list(tune_stat).count(True) == 5 assert list(tune_stat).count(False) == 7 @@ -287,13 +289,14 @@ def callback(trace, draw): cores=1, random_seed=self.random_seed, callback=callback, + return_inferencedata=False, ) assert len(trace) == trace_cancel_length def test_sequential_backend(self): with self.model: backend = NDArray() - trace = pm.sample(10, cores=1, chains=2, trace=backend) + pm.sample(10, cores=1, chains=2, trace=backend) @pytest.mark.xfail(reason="Lognormal not refactored for v4") @@ -330,8 
+333,9 @@ def test_partial_trace_sample(): with pm.Model() as model: a = pm.Normal("a", mu=0, sigma=1) b = pm.Normal("b", mu=0, sigma=1) - trace = pm.sample(trace=[a]) - # TODO: Assert something to make this a real test + idata = pm.sample(trace=[a]) + assert "a" in idata.posterior + assert "b" not in idata.posterior def test_chain_idx(): @@ -342,11 +346,11 @@ def test_chain_idx(): # note draws-tune must be >100 AND we need an observed RV for this to properly # trigger convergence checks, which is one particular case in which this failed # before - trace = pm.sample(draws=150, tune=10, chain_idx=1) + idata = pm.sample(draws=150, tune=10, chain_idx=1) - ppc = pm.sample_posterior_predictive(trace) + ppc = pm.sample_posterior_predictive(idata) # TODO FIXME: Assert something. - ppc = pm.sample_posterior_predictive(trace, keep_size=True) + ppc = pm.sample_posterior_predictive(idata, keep_size=True) @pytest.mark.parametrize( @@ -553,14 +557,14 @@ def test_exceptions(self, caplog): with pm.Model() as model: mu = pm.Normal("mu", 0.0, 1.0) a = pm.Normal("a", mu=mu, sigma=1, observed=np.array([0.5, 0.2])) - trace = pm.sample(idata_kwargs={"log_likelihood": False}) + idata = pm.sample(idata_kwargs={"log_likelihood": False}) with model: with pytest.raises(IncorrectArgumentsError): - ppc = pm.sample_posterior_predictive(trace, samples=10, keep_size=True) + ppc = pm.sample_posterior_predictive(idata, samples=10, keep_size=True) with pytest.raises(IncorrectArgumentsError): - ppc = pm.sample_posterior_predictive(trace, size=4, keep_size=True) + ppc = pm.sample_posterior_predictive(idata, size=4, keep_size=True) # test wrong type argument bad_trace = {"mu": stats.norm.rvs(size=1000)} @@ -571,19 +575,19 @@ def test_vector_observed(self): with pm.Model() as model: mu = pm.Normal("mu", mu=0, sigma=1) a = pm.Normal("a", mu=mu, sigma=1, observed=np.array([0.0, 1.0])) - trace = pm.sample(idata_kwargs={"log_likelihood": False}) + idata = pm.sample(idata_kwargs={"log_likelihood": False}) with model: # test list input # ppc0 = pm.sample_posterior_predictive([model.initial_point], samples=10) # TODO: Assert something about the output - # ppc = pm.sample_posterior_predictive(trace, samples=12, var_names=[]) + # ppc = pm.sample_posterior_predictive(idata, samples=12, var_names=[]) # assert len(ppc) == 0 - ppc = pm.sample_posterior_predictive(trace, samples=12, var_names=["a"]) + ppc = pm.sample_posterior_predictive(idata, samples=12, var_names=["a"]) assert "a" in ppc assert ppc["a"].shape == (12, 2) - ppc = pm.sample_posterior_predictive(trace, samples=10, var_names=["a"], size=4) + ppc = pm.sample_posterior_predictive(idata, samples=10, var_names=["a"], size=4) assert "a" in ppc assert ppc["a"].shape == (10, 4, 2) @@ -591,13 +595,13 @@ def test_sum_normal(self): with pm.Model() as model: a = pm.Normal("a", sigma=0.2) b = pm.Normal("b", mu=a) - trace = pm.sample() + idata = pm.sample() with model: # test list input ppc0 = pm.sample_posterior_predictive([model.initial_point], samples=10) assert ppc0 == {} - ppc = pm.sample_posterior_predictive(trace, samples=1000, var_names=["b"]) + ppc = pm.sample_posterior_predictive(idata, samples=1000, var_names=["b"]) assert len(ppc) == 1 assert ppc["b"].shape == (1000,) scale = np.sqrt(1 + 0.2 ** 2) @@ -610,13 +614,13 @@ def test_model_not_drawable_prior(self): with model: mu = pm.HalfFlat("sigma") pm.Poisson("foo", mu=mu, observed=data) - trace = pm.sample(tune=1000) + idata = pm.sample(tune=1000) with model: with pytest.raises(NotImplementedError) as excinfo: 
pm.sample_prior_predictive(50) assert "Cannot sample" in str(excinfo.value) - samples = pm.sample_posterior_predictive(trace, 40) + samples = pm.sample_posterior_predictive(idata, 40) assert samples["foo"].shape == (40, 200) def test_model_shared_variable(self): @@ -909,7 +913,7 @@ def test_init_jitter(initval, jitter_max_retries, expectation): def point_list_arg_bug_fixture() -> Tuple[pm.Model, pm.backends.base.MultiTrace]: with pm.Model() as pmodel: n = pm.Normal("n") - trace = pm.sample() + trace = pm.sample(return_inferencedata=False) with pmodel: d = pm.Deterministic("d", n * 4) @@ -1100,6 +1104,6 @@ def test_sample_deterministic(): with pm.Model() as model: x = pm.HalfNormal("x", 1) y = pm.Deterministic("y", x + 100) - trace = pm.sample(chains=1, draws=50, compute_convergence_checks=False) + idata = pm.sample(chains=1, draws=50, compute_convergence_checks=False) - np.testing.assert_allclose(trace["y"], trace["x"] + 100) + np.testing.assert_allclose(idata.posterior["y"], idata.posterior["x"] + 100) diff --git a/pymc3/tests/test_shared.py b/pymc3/tests/test_shared.py index 609f88cc910..9f36fec6e56 100644 --- a/pymc3/tests/test_shared.py +++ b/pymc3/tests/test_shared.py @@ -41,12 +41,12 @@ def test_sample(self): pm.Normal("obs", b * x_shared, np.sqrt(1e-2), observed=y) prior_trace0 = pm.sample_prior_predictive(1000) - trace = pm.sample(1000, init=None, tune=1000, chains=1) - pp_trace0 = pm.sample_posterior_predictive(trace, 1000) + idata = pm.sample(1000, init=None, tune=1000, chains=1) + pp_trace0 = pm.sample_posterior_predictive(idata, 1000) x_shared.set_value(x_pred) prior_trace1 = pm.sample_prior_predictive(1000) - pp_trace1 = pm.sample_posterior_predictive(trace, 1000) + pp_trace1 = pm.sample_posterior_predictive(idata, 1000) assert prior_trace0["b"].shape == (1000,) assert prior_trace0["obs"].shape == (1000, 100) diff --git a/pymc3/tests/test_step.py b/pymc3/tests/test_step.py index 1daf0e1c574..8868beb210a 100644 --- a/pymc3/tests/test_step.py +++ b/pymc3/tests/test_step.py @@ -531,11 +531,11 @@ def check_trace(self, step_method): y = Normal("y", mu=x, sigma=1, observed=1) if step_method.__name__ == "NUTS": step = step_method(scaling=model.initial_point) - trace = sample( + idata = sample( 0, tune=n_steps, discard_tuned_samples=False, step=step, random_seed=1, chains=1 ) else: - trace = sample( + idata = sample( 0, tune=n_steps, discard_tuned_samples=False, @@ -545,14 +545,18 @@ def check_trace(self, step_method): ) assert_array_almost_equal( - trace["x"], + idata.warmup_posterior["x"], self.master_samples[step_method], decimal=select_by_precision(float64=6, float32=4), ) - def check_stat(self, check, trace, name): + def check_stat(self, check, idata, name): + if hasattr(idata, "warmup_posterior"): + group = idata.warmup_posterior + else: + group = idata.posterior for (var, stat, value, bound) in check: - s = stat(trace[var][2000:], axis=0) + s = stat(group[var].sel(chain=0, draw=slice(2000, None)), axis=0) close_to(s, value, bound) def test_step_continuous(self): @@ -582,7 +586,7 @@ def test_step_continuous(self): ), ) for step in steps: - trace = sample( + idata = sample( 0, tune=8000, chains=1, @@ -592,7 +596,7 @@ def test_step_continuous(self): model=model, random_seed=1, ) - self.check_stat(check, trace, step.__class__.__name__) + self.check_stat(check, idata, step.__class__.__name__) def test_step_discrete(self): if aesara.config.floatX == "float32": @@ -603,10 +607,10 @@ def test_step_discrete(self): with model: steps = (Metropolis(S=C, 
proposal_dist=MultivariateNormalProposal),) for step in steps: - trace = sample( + idata = sample( 20000, tune=0, step=step, start=start, model=model, random_seed=1, chains=1 ) - self.check_stat(check, trace, step.__class__.__name__) + self.check_stat(check, idata, step.__class__.__name__) def test_step_categorical(self): start, model, (mu, C) = simple_categorical() @@ -618,8 +622,8 @@ def test_step_categorical(self): CategoricalGibbsMetropolis(model.x, proposal="proportional"), ) for step in steps: - trace = sample(8000, tune=0, step=step, start=start, model=model, random_seed=1) - self.check_stat(check, trace, step.__class__.__name__) + idata = sample(8000, tune=0, step=step, start=start, model=model, random_seed=1) + self.check_stat(check, idata, step.__class__.__name__) @pytest.mark.xfail(reason="EllipticalSlice not refactored for v4") def test_step_elliptical_slice(self): @@ -629,10 +633,10 @@ def test_step_elliptical_slice(self): with model: steps = (EllipticalSlice(prior_cov=K), EllipticalSlice(prior_chol=L)) for step in steps: - trace = sample( + idata = sample( 5000, tune=0, step=step, start=start, model=model, random_seed=1, chains=1 ) - self.check_stat(check, trace, step.__class__.__name__) + self.check_stat(check, idata, step.__class__.__name__) class TestMetropolisProposal: @@ -792,24 +796,20 @@ def test_nonparallelized_chains_are_random(self): x = Normal("x", 0, 1) for stepper in TestPopulationSamplers.steppers: step = stepper() - trace = sample(chains=4, cores=1, draws=20, tune=0, step=DEMetropolis()) - samples = np.array(trace.get_values("x", combine=False))[:, 5] + idata = sample(chains=4, cores=1, draws=20, tune=0, step=DEMetropolis()) + samples = idata.posterior["x"].values[:, 5] - assert len(set(samples)) == 4, "Parallelized {} " "chains are identical.".format( - stepper - ) + assert len(set(samples)) == 4, f"Parallelized {stepper} chains are identical." def test_parallelized_chains_are_random(self): with Model() as model: x = Normal("x", 0, 1) for stepper in TestPopulationSamplers.steppers: step = stepper() - trace = sample(chains=4, cores=4, draws=20, tune=0, step=DEMetropolis()) - samples = np.array(trace.get_values("x", combine=False))[:, 5] + idata = sample(chains=4, cores=4, draws=20, tune=0, step=DEMetropolis()) + samples = idata.posterior["x"].values[:, 5] - assert len(set(samples)) == 4, "Parallelized {} " "chains are identical.".format( - stepper - ) + assert len(set(samples)) == 4, f"Parallelized {stepper} chains are identical." 
class TestMetropolis: @@ -818,7 +818,7 @@ def test_tuning_reset(self): with Model() as pmodel: D = 3 Normal("n", 0, 2, size=(D,)) - trace = sample( + idata = sample( tune=600, draws=500, step=Metropolis(tune=True, scaling=0.1), @@ -826,17 +826,19 @@ def test_tuning_reset(self): chains=3, discard_tuned_samples=False, ) - for c in range(trace.nchains): + for c in idata.posterior.chain: # check that the tuned settings changed and were reset - assert trace.get_sampler_stats("scaling", chains=c)[0] == 0.1 - assert trace.get_sampler_stats("scaling", chains=c)[-1] != 0.1 + assert idata.warmup_sample_stats["scaling"].sel(chain=c).values[0] == 0.1 + tuned = idata.warmup_sample_stats["scaling"].sel(chain=c).values[-1] + assert tuned != 0.1 + np.testing.assert_array_equal(idata.sample_stats["scaling"].sel(chain=c).values, tuned) class TestDEMetropolisZ: def test_tuning_lambda_sequential(self): with Model() as pmodel: Normal("n", 0, 2, size=(3,)) - trace = sample( + idata = sample( tune=1000, draws=500, step=DEMetropolisZ(tune="lambda", lamb=0.92), @@ -844,16 +846,17 @@ def test_tuning_lambda_sequential(self): chains=3, discard_tuned_samples=False, ) - for c in range(trace.nchains): + for c in idata.posterior.chain: # check that the tuned settings changed and were reset - assert trace.get_sampler_stats("lambda", chains=c)[0] == 0.92 - assert trace.get_sampler_stats("lambda", chains=c)[-1] != 0.92 - assert set(trace.get_sampler_stats("tune", chains=c)) == {True, False} + assert idata.warmup_sample_stats["lambda"].sel(chain=c).values[0] == 0.92 + tuned = idata.warmup_sample_stats["lambda"].sel(chain=c).values[-1] + assert tuned != 0.92 + np.testing.assert_array_equal(idata.sample_stats["lambda"].sel(chain=c).values, tuned) def test_tuning_epsilon_parallel(self): with Model() as pmodel: Normal("n", 0, 2, size=(3,)) - trace = sample( + idata = sample( tune=1000, draws=500, step=DEMetropolisZ(tune="scaling", scaling=0.002), @@ -861,16 +864,17 @@ def test_tuning_epsilon_parallel(self): chains=2, discard_tuned_samples=False, ) - for c in range(trace.nchains): + for c in idata.posterior.chain: # check that the tuned settings changed and were reset - assert trace.get_sampler_stats("scaling", chains=c)[0] == 0.002 - assert trace.get_sampler_stats("scaling", chains=c)[-1] != 0.002 - assert set(trace.get_sampler_stats("tune", chains=c)) == {True, False} + assert idata.warmup_sample_stats["scaling"].sel(chain=c).values[0] == 0.002 + tuned = idata.warmup_sample_stats["scaling"].sel(chain=c).values[-1] + assert tuned != 0.002 + np.testing.assert_array_equal(idata.sample_stats["scaling"].sel(chain=c).values, tuned) def test_tuning_none(self): with Model() as pmodel: Normal("n", 0, 2, size=(3,)) - trace = sample( + idata = sample( tune=1000, draws=500, step=DEMetropolisZ(tune=None), @@ -878,18 +882,17 @@ def test_tuning_none(self): chains=2, discard_tuned_samples=False, ) - for c in range(trace.nchains): + for c in idata.posterior.chain: # check that all tunable parameters remained constant - assert len(set(trace.get_sampler_stats("lambda", chains=c))) == 1 - assert len(set(trace.get_sampler_stats("scaling", chains=c))) == 1 - assert set(trace.get_sampler_stats("tune", chains=c)) == {True, False} + assert len(set(idata.warmup_sample_stats["lambda"].sel(chain=c).values)) == 1 + assert len(set(idata.warmup_sample_stats["scaling"].sel(chain=c).values)) == 1 def test_tuning_reset(self): """Re-use of the step method instance with cores=1 must not leak tuning information between chains.""" with Model() as pmodel: D = 3 
Normal("n", 0, 2, size=(D,)) - trace = sample( + idata = sample( tune=1000, draws=500, step=DEMetropolisZ(tune="scaling", scaling=0.002), @@ -897,14 +900,16 @@ def test_tuning_reset(self): chains=3, discard_tuned_samples=False, ) - for c in range(trace.nchains): + for c in idata.posterior.chain: # check that the tuned settings changed and were reset - assert trace.get_sampler_stats("scaling", chains=c)[0] == 0.002 - assert trace.get_sampler_stats("scaling", chains=c)[-1] != 0.002 + warmup = idata.warmup_sample_stats["scaling"].sel(chain=c).values + assert warmup[0] == 0.002 + assert warmup[-1] != 0.002 # check that the variance of the first 50 iterations is much lower than the last 100 + samples = idata.warmup_posterior["n"].sel(chain=c).values for d in range(D): - var_start = np.var(trace.get_values("n", chains=c)[:50, d]) - var_end = np.var(trace.get_values("n", chains=c)[-100:, d]) + var_start = np.var(samples[:50, d]) + var_end = np.var(samples[-100:, d]) assert var_start < 0.1 * var_end def test_tune_drop_fraction(self): @@ -914,10 +919,11 @@ def test_tune_drop_fraction(self): with Model() as pmodel: Normal("n", 0, 2, size=(3,)) step = DEMetropolisZ(tune_drop_fraction=tune_drop_fraction) - trace = sample( + idata = sample( tune=tune, draws=draws, step=step, cores=1, chains=1, discard_tuned_samples=False ) - assert len(trace) == tune + draws + assert len(idata.warmup_posterior.draw) == tune + assert len(idata.posterior.draw) == draws assert len(step._history) == (tune - tune * tune_drop_fraction) + draws @pytest.mark.parametrize( @@ -984,7 +990,7 @@ def test_linalg(self, caplog): b = at.slinalg.solve(floatX(np.eye(2)), a) Normal("c", mu=b, size=2, initval=floatX(np.r_[0.0, 0.0])) caplog.clear() - trace = sample(20, init=None, tune=5, chains=2) + trace = sample(20, init=None, tune=5, chains=2, return_inferencedata=False) warns = [msg.msg for msg in caplog.records] assert np.any(trace["diverging"]) assert ( @@ -1002,7 +1008,7 @@ def test_linalg(self, caplog): def test_sampler_stats(self): with Model() as model: Normal("x", mu=0, sigma=1) - trace = sample(draws=10, tune=1, chains=1) + trace = sample(draws=10, tune=1, chains=1, return_inferencedata=False) # Assert stats exist and have the correct shape. expected_stat_names = { @@ -1132,11 +1138,9 @@ def test_nonparallelized_chains_are_random(self): Normal("x", 0, 1) for stepper in TestMLDA.steppers: step = stepper(coarse_models=[coarse_model]) - trace = sample(chains=2, cores=1, draws=20, tune=0, step=step, random_seed=1) - samples = np.array(trace.get_values("x", combine=False))[:, 5] - assert ( - len(set(samples)) == 2 - ), "Non parallelized {} " "chains are identical.".format(stepper) + idata = sample(chains=2, cores=1, draws=20, tune=0, step=step, random_seed=1) + samples = idata.posterior["x"].values[:, 5] + assert len(set(samples)) == 2, f"Non parallelized {stepper} chains are identical." 
def test_parallelized_chains_are_random(self): """Test that parallel chain are @@ -1149,11 +1153,9 @@ def test_parallelized_chains_are_random(self): Normal("x", 0, 1) for stepper in TestMLDA.steppers: step = stepper(coarse_models=[coarse_model]) - trace = sample(chains=2, cores=2, draws=20, tune=0, step=step, random_seed=1) - samples = np.array(trace.get_values("x", combine=False))[:, 5] - assert len(set(samples)) == 2, "Parallelized {} " "chains are identical.".format( - stepper - ) + idata = sample(chains=2, cores=2, draws=20, tune=0, step=step, random_seed=1) + samples = idata.posterior["x"].values[:, 5] + assert len(set(samples)) == 2, f"Parallelized {stepper} chains are identical." def test_acceptance_rate_against_coarseness(self): """Test that the acceptance rate increases @@ -1175,8 +1177,8 @@ def test_acceptance_rate_against_coarseness(self): Normal("x", 5.0, 1.0) for coarse_model in possible_coarse_models: step = MLDA(coarse_models=[coarse_model], subsampling_rates=3) - trace = sample(chains=1, draws=500, tune=100, step=step, random_seed=1) - acc.append(trace.get_sampler_stats("accepted").mean()) + idata = sample(chains=1, draws=500, tune=100, step=step, random_seed=1) + acc.append(idata.sample_stats["accepted"].mean()) assert acc[0] > acc[1] > acc[2], ( "Acceptance rate is not " "strictly increasing when" @@ -1252,6 +1254,7 @@ def test_tuning_and_scaling_on(self): chains=1, discard_tuned_samples=False, random_seed=1234, + return_inferencedata=False, ) trace_1 = sample( @@ -1266,6 +1269,7 @@ def test_tuning_and_scaling_on(self): chains=1, discard_tuned_samples=False, random_seed=1234, + return_inferencedata=False, ) trace_2 = sample( @@ -1275,6 +1279,7 @@ def test_tuning_and_scaling_on(self): chains=1, discard_tuned_samples=False, random_seed=1234, + return_inferencedata=False, ) assert trace_0.get_sampler_stats("tune", chains=0)[0] @@ -1322,6 +1327,7 @@ def test_tuning_and_scaling_off(self): chains=1, discard_tuned_samples=False, random_seed=12345, + return_inferencedata=False, ) ts_1 = 100 @@ -1339,6 +1345,7 @@ def test_tuning_and_scaling_off(self): chains=1, discard_tuned_samples=False, random_seed=12345, + return_inferencedata=False, ) assert not trace_0.get_sampler_stats("tune", chains=0)[0] @@ -1374,6 +1381,7 @@ def test_tuning_and_scaling_off(self): chains=1, discard_tuned_samples=False, random_seed=12345, + return_inferencedata=False, ) assert not trace_2.get_sampler_stats("tune", chains=0)[0] @@ -1395,8 +1403,9 @@ def test_trace_length(self): with Model(): Normal("n", 0, 2, size=(3,)) step = MLDA(coarse_models=[coarse_model]) - trace = sample(tune=tune, draws=draws, step=step, chains=1, discard_tuned_samples=False) - assert len(trace) == tune + draws + idata = sample(tune=tune, draws=draws, step=step, chains=1, discard_tuned_samples=False) + assert len(idata.warmup_posterior.draw) == tune + assert len(idata.posterior.draw) == draws @pytest.mark.parametrize( "variable,has_grad,outcome", diff --git a/pymc3/tests/test_variational_inference.py b/pymc3/tests/test_variational_inference.py index a4a470dfe08..cf6a85850db 100644 --- a/pymc3/tests/test_variational_inference.py +++ b/pymc3/tests/test_variational_inference.py @@ -177,7 +177,7 @@ def three_var_approx_single_group_mf(three_var_model): @pytest.fixture def test_sample_simple(three_var_approx, request): backend, name = request.param - trace = three_var_approx.sample(100, name=name) + trace = three_var_approx.sample(100, name=name, return_inferencedata=False) assert set(trace.varnames) == {"one", "one_log__", "three", 
"two"} assert len(trace) == 100 assert trace[0]["one"].shape == (10, 2) @@ -236,7 +236,7 @@ def test_sample_aevb(three_var_aevb_approx, aevb_initial): 1, more_replacements={aevb_initial: np.zeros_like(aevb_initial.get_value())[:1]} ) aevb_initial.set_value(np.random.rand(7, 7).astype("float32")) - trace = three_var_aevb_approx.sample(500) + trace = three_var_aevb_approx.sample(500, return_inferencedata=False) assert set(trace.varnames) == {"one", "one_log__", "two", "three"} assert len(trace) == 500 assert trace[0]["one"].shape == (7, 2) @@ -244,7 +244,7 @@ def test_sample_aevb(three_var_aevb_approx, aevb_initial): assert trace[0]["three"].shape == (10, 1, 2) aevb_initial.set_value(np.random.rand(13, 7).astype("float32")) - trace = three_var_aevb_approx.sample(500) + trace = three_var_aevb_approx.sample(500, return_inferencedata=False) assert set(trace.varnames) == {"one", "one_log__", "two", "three"} assert len(trace) == 500 assert trace[0]["one"].shape == (13, 2) @@ -984,10 +984,10 @@ def test_var_replacement(): def test_empirical_from_trace(another_simple_model): with another_simple_model: step = pm.Metropolis() - trace = pm.sample(100, step=step, chains=1, tune=0) + trace = pm.sample(100, step=step, chains=1, tune=0, return_inferencedata=False) emp = Empirical(trace) assert emp.histogram.shape[0].eval() == 100 - trace = pm.sample(100, step=step, chains=4, tune=0) + trace = pm.sample(100, step=step, chains=4, tune=0, return_inferencedata=False) emp = Empirical(trace) assert emp.histogram.shape[0].eval() == 400