diff --git a/asv_bench/benchmarks/combine.py b/asv_bench/benchmarks/combine.py
new file mode 100644
index 00000000000..a4f8db2786b
--- /dev/null
+++ b/asv_bench/benchmarks/combine.py
@@ -0,0 +1,38 @@
+import numpy as np
+
+import xarray as xr
+
+
+class Combine:
+    """Benchmark concatenating and merging large datasets"""
+
+    def setup(self):
+        """Create 4 datasets with two different variables"""
+
+        t_size, x_size, y_size = 50, 450, 400
+        t = np.arange(t_size)
+        data = np.random.randn(t_size, x_size, y_size)
+
+        self.dsA0 = xr.Dataset(
+            {"A": xr.DataArray(data, coords={"T": t}, dims=("T", "X", "Y"))}
+        )
+        self.dsA1 = xr.Dataset(
+            {"A": xr.DataArray(data, coords={"T": t + t_size}, dims=("T", "X", "Y"))}
+        )
+        self.dsB0 = xr.Dataset(
+            {"B": xr.DataArray(data, coords={"T": t}, dims=("T", "X", "Y"))}
+        )
+        self.dsB1 = xr.Dataset(
+            {"B": xr.DataArray(data, coords={"T": t + t_size}, dims=("T", "X", "Y"))}
+        )
+
+    def time_combine_nested(self):
+        datasets = [[self.dsA0, self.dsA1], [self.dsB0, self.dsB1]]
+
+        xr.combine_nested(datasets, concat_dim=[None, "T"])
+
+    def time_combine_by_coords(self):
+        """Also has to load and arrange t coordinate"""
+        datasets = [self.dsA0, self.dsA1, self.dsB0, self.dsB1]
+
+        xr.combine_by_coords(datasets)
diff --git a/asv_bench/benchmarks/dataarray_missing.py b/asv_bench/benchmarks/dataarray_missing.py
new file mode 100644
index 00000000000..f89fe7f8eb9
--- /dev/null
+++ b/asv_bench/benchmarks/dataarray_missing.py
@@ -0,0 +1,80 @@
+import pandas as pd
+
+import xarray as xr
+
+from . import parameterized, randn, requires_dask
+
+
+def make_bench_data(shape, frac_nan, chunks):
+    vals = randn(shape, frac_nan)
+    coords = {"time": pd.date_range("2000-01-01", freq="D", periods=shape[0])}
+    da = xr.DataArray(vals, dims=("time", "x", "y"), coords=coords)
+
+    if chunks is not None:
+        da = da.chunk(chunks)
+
+    return da
+
+
+def requires_bottleneck():
+    try:
+        import bottleneck  # noqa: F401
+    except ImportError:
+        raise NotImplementedError()
+
+
+class DataArrayMissingInterpolateNA:
+    def setup(self, shape, chunks, limit):
+        if chunks is not None:
+            requires_dask()
+        self.da = make_bench_data(shape, 0.1, chunks)
+
+    @parameterized(
+        ["shape", "chunks", "limit"],
+        (
+            [(365, 75, 75)],
+            [None, {"x": 25, "y": 25}],
+            [None, 3],
+        ),
+    )
+    def time_interpolate_na(self, shape, chunks, limit):
+        actual = self.da.interpolate_na(dim="time", method="linear", limit=limit)
+
+        if chunks is not None:
+            actual = actual.compute()
+
+
+class DataArrayMissingBottleneck:
+    def setup(self, shape, chunks, limit):
+        requires_bottleneck()
+        if chunks is not None:
+            requires_dask()
+        self.da = make_bench_data(shape, 0.1, chunks)
+
+    @parameterized(
+        ["shape", "chunks", "limit"],
+        (
+            [(365, 75, 75)],
+            [None, {"x": 25, "y": 25}],
+            [None, 3],
+        ),
+    )
+    def time_ffill(self, shape, chunks, limit):
+        actual = self.da.ffill(dim="time", limit=limit)
+
+        if chunks is not None:
+            actual = actual.compute()
+
+    @parameterized(
+        ["shape", "chunks", "limit"],
+        (
+            [(365, 75, 75)],
+            [None, {"x": 25, "y": 25}],
+            [None, 3],
+        ),
+    )
+    def time_bfill(self, shape, chunks, limit):
+        actual = self.da.bfill(dim="time", limit=limit)
+
+        if chunks is not None:
+            actual = actual.compute()
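The benchmark modules in this diff import shared helpers (`randn`, `randint`, `parameterized`, `requires_dask`, `_skip_slow`) from the benchmarks package `__init__`, which is not part of this patch. For reference, a minimal sketch of what those helpers are assumed to look like (asv skips a benchmark whose setup raises NotImplementedError; the exact signatures and the environment-variable name are assumptions):

    import os

    import numpy as np


    def parameterized(names, params):
        # Attach asv parameter metadata to a benchmark method.
        def decorator(func):
            func.param_names = names
            func.params = params
            return func

        return decorator


    def randn(shape, frac_nan=None, seed=0):
        # Seeded normal values, optionally with a fraction of NaNs inserted.
        rng = np.random.RandomState(seed)
        x = rng.standard_normal(shape)
        if frac_nan is not None:
            inds = rng.choice(x.size, int(x.size * frac_nan), replace=False)
            x.flat[inds] = np.nan
        return x


    def randint(low, high=None, size=None, seed=0):
        return np.random.RandomState(seed).randint(low, high, size)


    def requires_dask():
        # Raising NotImplementedError in setup tells asv to skip the benchmark.
        try:
            import dask  # noqa: F401
        except ImportError:
            raise NotImplementedError()


    def _skip_slow():
        # Assumed: skip demanding benchmarks unless explicitly requested.
        if os.environ.get("ASV_SKIP_SLOW", "1") == "1":
            raise NotImplementedError("skipping slow benchmark")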
diff --git a/asv_bench/benchmarks/dataset_io.py b/asv_bench/benchmarks/dataset_io.py
new file mode 100644
index 00000000000..6c2e15c54e9
--- /dev/null
+++ b/asv_bench/benchmarks/dataset_io.py
@@ -0,0 +1,478 @@
+import os
+
+import numpy as np
+import pandas as pd
+
+import xarray as xr
+
+from . import _skip_slow, randint, randn, requires_dask
+
+try:
+    import dask
+    import dask.multiprocessing
+except ImportError:
+    pass
+
+
+os.environ["HDF5_USE_FILE_LOCKING"] = "FALSE"
+
+
+class IOSingleNetCDF:
+    """
+    A few examples that benchmark reading/writing a single netCDF file with
+    xarray
+    """
+
+    timeout = 300.0
+    repeat = 1
+    number = 5
+
+    def make_ds(self):
+        # TODO: Lazily skipped in CI as it is very demanding and slow.
+        # Improve times and remove errors.
+        _skip_slow()
+
+        # single Dataset
+        self.ds = xr.Dataset()
+        self.nt = 1000
+        self.nx = 90
+        self.ny = 45
+
+        self.block_chunks = {
+            "time": self.nt / 4,
+            "lon": self.nx / 3,
+            "lat": self.ny / 3,
+        }
+
+        self.time_chunks = {"time": int(self.nt / 36)}
+
+        times = pd.date_range("1970-01-01", periods=self.nt, freq="D")
+        lons = xr.DataArray(
+            np.linspace(0, 360, self.nx),
+            dims=("lon",),
+            attrs={"units": "degrees east", "long_name": "longitude"},
+        )
+        lats = xr.DataArray(
+            np.linspace(-90, 90, self.ny),
+            dims=("lat",),
+            attrs={"units": "degrees north", "long_name": "latitude"},
+        )
+        self.ds["foo"] = xr.DataArray(
+            randn((self.nt, self.nx, self.ny), frac_nan=0.2),
+            coords={"lon": lons, "lat": lats, "time": times},
+            dims=("time", "lon", "lat"),
+            name="foo",
+            attrs={"units": "foo units", "description": "a description"},
+        )
+        self.ds["bar"] = xr.DataArray(
+            randn((self.nt, self.nx, self.ny), frac_nan=0.2),
+            coords={"lon": lons, "lat": lats, "time": times},
+            dims=("time", "lon", "lat"),
+            name="bar",
+            attrs={"units": "bar units", "description": "a description"},
+        )
+        self.ds["baz"] = xr.DataArray(
+            randn((self.nx, self.ny), frac_nan=0.2).astype(np.float32),
+            coords={"lon": lons, "lat": lats},
+            dims=("lon", "lat"),
+            name="baz",
+            attrs={"units": "baz units", "description": "a description"},
+        )
+
+        self.ds.attrs = {"history": "created for xarray benchmarking"}
+
+        self.oinds = {
+            "time": randint(0, self.nt, 120),
+            "lon": randint(0, self.nx, 20),
+            "lat": randint(0, self.ny, 10),
+        }
+        self.vinds = {
+            "time": xr.DataArray(randint(0, self.nt, 120), dims="x"),
+            "lon": xr.DataArray(randint(0, self.nx, 120), dims="x"),
+            "lat": slice(3, 20),
+        }
+
+
+class IOWriteSingleNetCDF3(IOSingleNetCDF):
+    def setup(self):
+        self.format = "NETCDF3_64BIT"
+        self.make_ds()
+
+    def time_write_dataset_netcdf4(self):
+        self.ds.to_netcdf("test_netcdf4_write.nc", engine="netcdf4", format=self.format)
+
+    def time_write_dataset_scipy(self):
+        self.ds.to_netcdf("test_scipy_write.nc", engine="scipy", format=self.format)
+
+
+class IOReadSingleNetCDF4(IOSingleNetCDF):
+    def setup(self):
+
+        self.make_ds()
+
+        self.filepath = "test_single_file.nc4.nc"
+        self.format = "NETCDF4"
+        self.ds.to_netcdf(self.filepath, format=self.format)
+
+    def time_load_dataset_netcdf4(self):
+        xr.open_dataset(self.filepath, engine="netcdf4").load()
+
+    def time_orthogonal_indexing(self):
+        ds = xr.open_dataset(self.filepath, engine="netcdf4")
+        ds = ds.isel(**self.oinds).load()
+
+    def time_vectorized_indexing(self):
+        ds = xr.open_dataset(self.filepath, engine="netcdf4")
+        ds = ds.isel(**self.vinds).load()
+
+
+class IOReadSingleNetCDF3(IOReadSingleNetCDF4):
+    def setup(self):
+
+        self.make_ds()
+
+        self.filepath = "test_single_file.nc3.nc"
+        self.format = "NETCDF3_64BIT"
+        self.ds.to_netcdf(self.filepath, format=self.format)
+
+    def time_load_dataset_scipy(self):
+        xr.open_dataset(self.filepath, engine="scipy").load()
+
+    def time_orthogonal_indexing(self):
+        ds = xr.open_dataset(self.filepath, engine="scipy")
+        ds = ds.isel(**self.oinds).load()
+
+    def time_vectorized_indexing(self):
+        ds = xr.open_dataset(self.filepath, engine="scipy")
+        ds = ds.isel(**self.vinds).load()
+
+
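+# The Dask variants below repeat the single-file reads with ``chunks=`` so the
+# reads go through dask arrays; the "multiprocessing" cases additionally run
+# the load under dask's process-based scheduler instead of the threaded default.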
+class IOReadSingleNetCDF4Dask(IOSingleNetCDF):
+    def setup(self):
+
+        requires_dask()
+
+        self.make_ds()
+
+        self.filepath = "test_single_file.nc4.nc"
+        self.format = "NETCDF4"
+        self.ds.to_netcdf(self.filepath, format=self.format)
+
+    def time_load_dataset_netcdf4_with_block_chunks(self):
+        xr.open_dataset(
+            self.filepath, engine="netcdf4", chunks=self.block_chunks
+        ).load()
+
+    def time_load_dataset_netcdf4_with_block_chunks_oindexing(self):
+        ds = xr.open_dataset(self.filepath, engine="netcdf4", chunks=self.block_chunks)
+        ds = ds.isel(**self.oinds).load()
+
+    def time_load_dataset_netcdf4_with_block_chunks_vindexing(self):
+        ds = xr.open_dataset(self.filepath, engine="netcdf4", chunks=self.block_chunks)
+        ds = ds.isel(**self.vinds).load()
+
+    def time_load_dataset_netcdf4_with_block_chunks_multiprocessing(self):
+        with dask.config.set(scheduler="multiprocessing"):
+            xr.open_dataset(
+                self.filepath, engine="netcdf4", chunks=self.block_chunks
+            ).load()
+
+    def time_load_dataset_netcdf4_with_time_chunks(self):
+        xr.open_dataset(self.filepath, engine="netcdf4", chunks=self.time_chunks).load()
+
+    def time_load_dataset_netcdf4_with_time_chunks_multiprocessing(self):
+        with dask.config.set(scheduler="multiprocessing"):
+            xr.open_dataset(
+                self.filepath, engine="netcdf4", chunks=self.time_chunks
+            ).load()
+
+
+class IOReadSingleNetCDF3Dask(IOReadSingleNetCDF4Dask):
+    def setup(self):
+
+        requires_dask()
+
+        self.make_ds()
+
+        self.filepath = "test_single_file.nc3.nc"
+        self.format = "NETCDF3_64BIT"
+        self.ds.to_netcdf(self.filepath, format=self.format)
+
+    def time_load_dataset_scipy_with_block_chunks(self):
+        with dask.config.set(scheduler="multiprocessing"):
+            xr.open_dataset(
+                self.filepath, engine="scipy", chunks=self.block_chunks
+            ).load()
+
+    def time_load_dataset_scipy_with_block_chunks_oindexing(self):
+        ds = xr.open_dataset(self.filepath, engine="scipy", chunks=self.block_chunks)
+        ds = ds.isel(**self.oinds).load()
+
+    def time_load_dataset_scipy_with_block_chunks_vindexing(self):
+        ds = xr.open_dataset(self.filepath, engine="scipy", chunks=self.block_chunks)
+        ds = ds.isel(**self.vinds).load()
+
+    def time_load_dataset_scipy_with_time_chunks(self):
+        with dask.config.set(scheduler="multiprocessing"):
+            xr.open_dataset(
+                self.filepath, engine="scipy", chunks=self.time_chunks
+            ).load()
+
+
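+# The multi-file variants below split the same dataset into ``nfiles`` pieces
+# along ``time``, write them with ``save_mfdataset`` and read them back with
+# ``open_mfdataset``.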
+class IOMultipleNetCDF:
+    """
+    A few examples that benchmark reading/writing multiple netCDF files with
+    xarray
+    """
+
+    timeout = 300.0
+    repeat = 1
+    number = 5
+
+    def make_ds(self, nfiles=10):
+        # TODO: Lazily skipped in CI as it is very demanding and slow.
+        # Improve times and remove errors.
+        _skip_slow()
+
+        # multiple Dataset
+        self.ds = xr.Dataset()
+        self.nt = 1000
+        self.nx = 90
+        self.ny = 45
+        self.nfiles = nfiles
+
+        self.block_chunks = {
+            "time": self.nt / 4,
+            "lon": self.nx / 3,
+            "lat": self.ny / 3,
+        }
+
+        self.time_chunks = {"time": int(self.nt / 36)}
+
+        self.time_vars = np.split(
+            pd.date_range("1970-01-01", periods=self.nt, freq="D"), self.nfiles
+        )
+
+        self.ds_list = []
+        self.filenames_list = []
+        for i, times in enumerate(self.time_vars):
+            ds = xr.Dataset()
+            nt = len(times)
+            lons = xr.DataArray(
+                np.linspace(0, 360, self.nx),
+                dims=("lon",),
+                attrs={"units": "degrees east", "long_name": "longitude"},
+            )
+            lats = xr.DataArray(
+                np.linspace(-90, 90, self.ny),
+                dims=("lat",),
+                attrs={"units": "degrees north", "long_name": "latitude"},
+            )
+            ds["foo"] = xr.DataArray(
+                randn((nt, self.nx, self.ny), frac_nan=0.2),
+                coords={"lon": lons, "lat": lats, "time": times},
+                dims=("time", "lon", "lat"),
+                name="foo",
+                attrs={"units": "foo units", "description": "a description"},
+            )
+            ds["bar"] = xr.DataArray(
+                randn((nt, self.nx, self.ny), frac_nan=0.2),
+                coords={"lon": lons, "lat": lats, "time": times},
+                dims=("time", "lon", "lat"),
+                name="bar",
+                attrs={"units": "bar units", "description": "a description"},
+            )
+            ds["baz"] = xr.DataArray(
+                randn((self.nx, self.ny), frac_nan=0.2).astype(np.float32),
+                coords={"lon": lons, "lat": lats},
+                dims=("lon", "lat"),
+                name="baz",
+                attrs={"units": "baz units", "description": "a description"},
+            )
+
+            ds.attrs = {"history": "created for xarray benchmarking"}
+
+            self.ds_list.append(ds)
+            self.filenames_list.append("test_netcdf_%i.nc" % i)
+
+
+class IOWriteMultipleNetCDF3(IOMultipleNetCDF):
+    def setup(self):
+        self.make_ds()
+        self.format = "NETCDF3_64BIT"
+
+    def time_write_dataset_netcdf4(self):
+        xr.save_mfdataset(
+            self.ds_list, self.filenames_list, engine="netcdf4", format=self.format
+        )
+
+    def time_write_dataset_scipy(self):
+        xr.save_mfdataset(
+            self.ds_list, self.filenames_list, engine="scipy", format=self.format
+        )
+
+
+class IOReadMultipleNetCDF4(IOMultipleNetCDF):
+    def setup(self):
+
+        requires_dask()
+
+        self.make_ds()
+        self.format = "NETCDF4"
+        xr.save_mfdataset(self.ds_list, self.filenames_list, format=self.format)
+
+    def time_load_dataset_netcdf4(self):
+        xr.open_mfdataset(self.filenames_list, engine="netcdf4").load()
+
+    def time_open_dataset_netcdf4(self):
+        xr.open_mfdataset(self.filenames_list, engine="netcdf4")
+
+
+class IOReadMultipleNetCDF3(IOReadMultipleNetCDF4):
+    def setup(self):
+
+        requires_dask()
+
+        self.make_ds()
+        self.format = "NETCDF3_64BIT"
+        xr.save_mfdataset(self.ds_list, self.filenames_list, format=self.format)
+
+    def time_load_dataset_scipy(self):
+        xr.open_mfdataset(self.filenames_list, engine="scipy").load()
+
+    def time_open_dataset_scipy(self):
+        xr.open_mfdataset(self.filenames_list, engine="scipy")
+
+
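+# Note the open/load split in these benchmarks: ``open_mfdataset`` on its own
+# only reads metadata and builds the lazy dask graph, while ``.load()`` forces
+# the full read, so ``time_open_*`` and ``time_load_*`` measure different costs.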
+class IOReadMultipleNetCDF4Dask(IOMultipleNetCDF):
+    def setup(self):
+
+        requires_dask()
+
+        self.make_ds()
+        self.format = "NETCDF4"
+        xr.save_mfdataset(self.ds_list, self.filenames_list, format=self.format)
+
+    def time_load_dataset_netcdf4_with_block_chunks(self):
+        xr.open_mfdataset(
+            self.filenames_list, engine="netcdf4", chunks=self.block_chunks
+        ).load()
+
+    def time_load_dataset_netcdf4_with_block_chunks_multiprocessing(self):
+        with dask.config.set(scheduler="multiprocessing"):
+            xr.open_mfdataset(
+                self.filenames_list, engine="netcdf4", chunks=self.block_chunks
+            ).load()
+
+    def time_load_dataset_netcdf4_with_time_chunks(self):
+        xr.open_mfdataset(
+            self.filenames_list, engine="netcdf4", chunks=self.time_chunks
+        ).load()
+
+    def time_load_dataset_netcdf4_with_time_chunks_multiprocessing(self):
+        with dask.config.set(scheduler="multiprocessing"):
+            xr.open_mfdataset(
+                self.filenames_list, engine="netcdf4", chunks=self.time_chunks
+            ).load()
+
+    def time_open_dataset_netcdf4_with_block_chunks(self):
+        xr.open_mfdataset(
+            self.filenames_list, engine="netcdf4", chunks=self.block_chunks
+        )
+
+    def time_open_dataset_netcdf4_with_block_chunks_multiprocessing(self):
+        with dask.config.set(scheduler="multiprocessing"):
+            xr.open_mfdataset(
+                self.filenames_list, engine="netcdf4", chunks=self.block_chunks
+            )
+
+    def time_open_dataset_netcdf4_with_time_chunks(self):
+        xr.open_mfdataset(
+            self.filenames_list, engine="netcdf4", chunks=self.time_chunks
+        )
+
+    def time_open_dataset_netcdf4_with_time_chunks_multiprocessing(self):
+        with dask.config.set(scheduler="multiprocessing"):
+            xr.open_mfdataset(
+                self.filenames_list, engine="netcdf4", chunks=self.time_chunks
+            )
+
+
+class IOReadMultipleNetCDF3Dask(IOReadMultipleNetCDF4Dask):
+    def setup(self):
+
+        requires_dask()
+
+        self.make_ds()
+        self.format = "NETCDF3_64BIT"
+        xr.save_mfdataset(self.ds_list, self.filenames_list, format=self.format)
+
+    def time_load_dataset_scipy_with_block_chunks(self):
+        with dask.config.set(scheduler="multiprocessing"):
+            xr.open_mfdataset(
+                self.filenames_list, engine="scipy", chunks=self.block_chunks
+            ).load()
+
+    def time_load_dataset_scipy_with_time_chunks(self):
+        with dask.config.set(scheduler="multiprocessing"):
+            xr.open_mfdataset(
+                self.filenames_list, engine="scipy", chunks=self.time_chunks
+            ).load()
+
+    def time_open_dataset_scipy_with_block_chunks(self):
+        with dask.config.set(scheduler="multiprocessing"):
+            xr.open_mfdataset(
+                self.filenames_list, engine="scipy", chunks=self.block_chunks
+            )
+
+    def time_open_dataset_scipy_with_time_chunks(self):
+        with dask.config.set(scheduler="multiprocessing"):
+            xr.open_mfdataset(
+                self.filenames_list, engine="scipy", chunks=self.time_chunks
+            )
+
+
+def create_delayed_write():
+    import dask.array as da
+
+    # TODO: Lazily skipped in CI as it is very demanding and slow.
+    # Improve times and remove errors.
+    _skip_slow()
+
+    vals = da.random.random(300, chunks=(1,))
+    ds = xr.Dataset({"vals": (["a"], vals)})
+    return ds.to_netcdf("file.nc", engine="netcdf4", compute=False)
+
+
+class IOWriteNetCDFDask:
+    timeout = 60
+    repeat = 1
+    number = 5
+
+    def setup(self):
+        requires_dask()
+        self.write = create_delayed_write()
+
+    def time_write(self):
+        self.write.compute()
+
+
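+# Same delayed write as above, but executed through a ``distributed.Client``;
+# instantiating the client starts a local cluster, and the ``.compute()`` in
+# ``time_write`` then runs the write on its workers.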
+class IOWriteNetCDFDaskDistributed:
+    def setup(self):
+        try:
+            import distributed
+        except ImportError:
+            raise NotImplementedError()
+
+        # TODO: Lazily skipped in CI as it is very demanding and slow.
+        # Improve times and remove errors.
+        _skip_slow()
+
+        self.client = distributed.Client()
+        self.write = create_delayed_write()
+
+    def cleanup(self):
+        self.client.shutdown()
+
+    def time_write(self):
+        self.write.compute()
diff --git a/asv_bench/benchmarks/import_xarray.py b/asv_bench/benchmarks/import_xarray.py
new file mode 100644
index 00000000000..94652e3b82a
--- /dev/null
+++ b/asv_bench/benchmarks/import_xarray.py
@@ -0,0 +1,9 @@
+class ImportXarray:
+    def setup(self, *args, **kwargs):
+        def import_xr():
+            import xarray  # noqa: F401
+
+        self._import_xr = import_xr
+
+    def time_import_xarray(self):
+        self._import_xr()
diff --git a/asv_bench/benchmarks/indexing.py b/asv_bench/benchmarks/indexing.py
new file mode 100644
index 00000000000..15212ec0c61
--- /dev/null
+++ b/asv_bench/benchmarks/indexing.py
@@ -0,0 +1,149 @@
+import os
+
+import numpy as np
+import pandas as pd
+
+import xarray as xr
+
+from . import parameterized, randint, randn, requires_dask
+
+nx = 2000
+ny = 1000
+nt = 500
+
+basic_indexes = {
+    "1slice": {"x": slice(0, 3)},
+    "1slice-1scalar": {"x": 0, "y": slice(None, None, 3)},
+    "2slices-1scalar": {"x": slice(3, -3, 3), "y": 1, "t": slice(None, -3, 3)},
+}
+
+basic_assignment_values = {
+    "1slice": xr.DataArray(randn((3, ny), frac_nan=0.1), dims=["x", "y"]),
+    "1slice-1scalar": xr.DataArray(randn(int(ny / 3) + 1, frac_nan=0.1), dims=["y"]),
+    "2slices-1scalar": xr.DataArray(
+        randn(np.empty(nx)[slice(3, -3, 3)].size, frac_nan=0.1), dims=["x"]
+    ),
+}
+
+outer_indexes = {
+    "1d": {"x": randint(0, nx, 400)},
+    "2d": {"x": randint(0, nx, 500), "y": randint(0, ny, 400)},
+    "2d-1scalar": {"x": randint(0, nx, 100), "y": 1, "t": randint(0, nt, 400)},
+}
+
+outer_assignment_values = {
+    "1d": xr.DataArray(randn((400, ny), frac_nan=0.1), dims=["x", "y"]),
+    "2d": xr.DataArray(randn((500, 400), frac_nan=0.1), dims=["x", "y"]),
+    "2d-1scalar": xr.DataArray(randn(100, frac_nan=0.1), dims=["x"]),
+}
+
+vectorized_indexes = {
+    "1-1d": {"x": xr.DataArray(randint(0, nx, 400), dims="a")},
+    "2-1d": {
+        "x": xr.DataArray(randint(0, nx, 400), dims="a"),
+        "y": xr.DataArray(randint(0, ny, 400), dims="a"),
+    },
+    "3-2d": {
+        "x": xr.DataArray(randint(0, nx, 400).reshape(4, 100), dims=["a", "b"]),
+        "y": xr.DataArray(randint(0, ny, 400).reshape(4, 100), dims=["a", "b"]),
+        "t": xr.DataArray(randint(0, nt, 400).reshape(4, 100), dims=["a", "b"]),
+    },
+}
+
+vectorized_assignment_values = {
+    "1-1d": xr.DataArray(randn((400, ny)), dims=["a", "y"], coords={"a": randn(400)}),
+    "2-1d": xr.DataArray(randn(400), dims=["a"], coords={"a": randn(400)}),
+    "3-2d": xr.DataArray(
+        randn((4, 100)), dims=["a", "b"], coords={"a": randn(4), "b": randn(100)}
+    ),
+}
+
+
+class Base:
+    def setup(self, key):
+        self.ds = xr.Dataset(
+            {
+                "var1": (("x", "y"), randn((nx, ny), frac_nan=0.1)),
+                "var2": (("x", "t"), randn((nx, nt))),
+                "var3": (("t",), randn(nt)),
+            },
+            coords={
+                "x": np.arange(nx),
+                "y": np.linspace(0, 1, ny),
+                "t": pd.date_range("1970-01-01", periods=nt, freq="D"),
+                "x_coords": ("x", np.linspace(1.1, 2.1, nx)),
+            },
+        )
+
+
+class Indexing(Base):
+    @parameterized(["key"], [list(basic_indexes.keys())])
+    def time_indexing_basic(self, key):
+        self.ds.isel(**basic_indexes[key]).load()
+
+    @parameterized(["key"], [list(outer_indexes.keys())])
+    def time_indexing_outer(self, key):
+        self.ds.isel(**outer_indexes[key]).load()
+
+    @parameterized(["key"], [list(vectorized_indexes.keys())])
+    def time_indexing_vectorized(self, key):
+        self.ds.isel(**vectorized_indexes[key]).load()
+
+
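+# The assignment benchmarks below write through positional indexing: the "x"
+# and "y" indexers are applied in dimension order, and the assigned DataArray
+# values are matched to the indexed region by dimension name.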
+class Assignment(Base):
+    @parameterized(["key"], [list(basic_indexes.keys())])
+    def time_assignment_basic(self, key):
+        ind = basic_indexes[key]
+        val = basic_assignment_values[key]
+        self.ds["var1"][ind.get("x", slice(None)), ind.get("y", slice(None))] = val
+
+    @parameterized(["key"], [list(outer_indexes.keys())])
+    def time_assignment_outer(self, key):
+        ind = outer_indexes[key]
+        val = outer_assignment_values[key]
+        self.ds["var1"][ind.get("x", slice(None)), ind.get("y", slice(None))] = val
+
+    @parameterized(["key"], [list(vectorized_indexes.keys())])
+    def time_assignment_vectorized(self, key):
+        ind = vectorized_indexes[key]
+        val = vectorized_assignment_values[key]
+        self.ds["var1"][ind.get("x", slice(None)), ind.get("y", slice(None))] = val
+
+
+class IndexingDask(Indexing):
+    def setup(self, key):
+        requires_dask()
+        super().setup(key)
+        self.ds = self.ds.chunk({"x": 100, "y": 50, "t": 50})
+
+
+class BooleanIndexing:
+    # https://github.com/pydata/xarray/issues/2227
+    def setup(self):
+        self.ds = xr.Dataset(
+            {"a": ("time", np.arange(10_000_000))},
+            coords={"time": np.arange(10_000_000)},
+        )
+        self.time_filter = self.ds.time > 50_000
+
+    def time_indexing(self):
+        self.ds.isel(time=self.time_filter)
+
+
+class HugeAxisSmallSliceIndexing:
+    # https://github.com/pydata/xarray/pull/4560
+    def setup(self):
+        self.filepath = "test_indexing_huge_axis_small_slice.nc"
+        if not os.path.isfile(self.filepath):
+            xr.Dataset(
+                {"a": ("x", np.arange(10_000_000))},
+                coords={"x": np.arange(10_000_000)},
+            ).to_netcdf(self.filepath, format="NETCDF4")
+
+        self.ds = xr.open_dataset(self.filepath)
+
+    def time_indexing(self):
+        self.ds.isel(x=slice(100))
+
+    def cleanup(self):
+        self.ds.close()
diff --git a/asv_bench/benchmarks/interp.py b/asv_bench/benchmarks/interp.py
new file mode 100644
index 00000000000..4b6691bcc0a
--- /dev/null
+++ b/asv_bench/benchmarks/interp.py
@@ -0,0 +1,51 @@
+import numpy as np
+import pandas as pd
+
+import xarray as xr
+
+from . import parameterized, randn, requires_dask
+
+nx = 1500
+ny = 1000
+nt = 500
+
+randn_xy = randn((nx, ny), frac_nan=0.1)
+randn_xt = randn((nx, nt))
+randn_t = randn((nt,))
+
+new_x_short = np.linspace(0.3 * nx, 0.7 * nx, 100)
+new_x_long = np.linspace(0.3 * nx, 0.7 * nx, 500)
+new_y_long = np.linspace(0.1, 0.9, 500)
+
+
+class Interpolation:
+    def setup(self, *args, **kwargs):
+        self.ds = xr.Dataset(
+            {
+                "var1": (("x", "y"), randn_xy),
+                "var2": (("x", "t"), randn_xt),
+                "var3": (("t",), randn_t),
+            },
+            coords={
+                "x": np.arange(nx),
+                "y": np.linspace(0, 1, ny),
+                "t": pd.date_range("1970-01-01", periods=nt, freq="D"),
+                "x_coords": ("x", np.linspace(1.1, 2.1, nx)),
+            },
+        )
+
+    @parameterized(["method", "is_short"], (["linear", "cubic"], [True, False]))
+    def time_interpolation(self, method, is_short):
+        new_x = new_x_short if is_short else new_x_long
+        self.ds.interp(x=new_x, method=method).load()
+
+    @parameterized(["method"], (["linear", "nearest"]))
+    def time_interpolation_2d(self, method):
+        self.ds.interp(x=new_x_long, y=new_y_long, method=method).load()
+
+
+class InterpolationDask(Interpolation):
+    def setup(self, *args, **kwargs):
+        requires_dask()
+        super().setup(**kwargs)
+        self.ds = self.ds.chunk({"t": 50})
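The benchmark below converts a MultiIndexed pandas Series into a DataArray, where each index level becomes a dimension (the `subset` case drops rows, so the missing index combinations come back as NaN). A minimal illustration of the conversion, not part of this diff:

    import pandas as pd
    import xarray as xr

    idx = pd.MultiIndex.from_product([list("ab"), [0, 1]], names=("k", "n"))
    s = pd.Series(range(4), index=idx)
    xr.DataArray.from_series(s)  # 2x2 DataArray with dims ("k", "n")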
diff --git a/asv_bench/benchmarks/pandas.py b/asv_bench/benchmarks/pandas.py
new file mode 100644
index 00000000000..8aaa515d417
--- /dev/null
+++ b/asv_bench/benchmarks/pandas.py
@@ -0,0 +1,26 @@
+import numpy as np
+import pandas as pd
+
+import xarray as xr
+
+from . import parameterized
+
+
+class MultiIndexSeries:
+    def setup(self, dtype, subset):
+        data = np.random.rand(100000).astype(dtype)
+        index = pd.MultiIndex.from_product(
+            [
+                list("abcdefhijk"),
+                list("abcdefhijk"),
+                pd.date_range(start="2000-01-01", periods=1000, freq="B"),
+            ]
+        )
+        series = pd.Series(data, index)
+        if subset:
+            series = series[::3]
+        self.series = series
+
+    @parameterized(["dtype", "subset"], ([int, float], [True, False]))
+    def time_from_series(self, dtype, subset):
+        xr.DataArray.from_series(self.series)
diff --git a/asv_bench/benchmarks/reindexing.py b/asv_bench/benchmarks/reindexing.py
new file mode 100644
index 00000000000..9d0767fc3b3
--- /dev/null
+++ b/asv_bench/benchmarks/reindexing.py
@@ -0,0 +1,52 @@
+import numpy as np
+
+import xarray as xr
+
+from . import requires_dask
+
+ntime = 500
+nx = 50
+ny = 50
+
+
+class Reindex:
+    def setup(self):
+        data = np.random.RandomState(0).randn(ntime, nx, ny)
+        self.ds = xr.Dataset(
+            {"temperature": (("time", "x", "y"), data)},
+            coords={"time": np.arange(ntime), "x": np.arange(nx), "y": np.arange(ny)},
+        )
+
+    def time_1d_coarse(self):
+        self.ds.reindex(time=np.arange(0, ntime, 5)).load()
+
+    def time_1d_fine_all_found(self):
+        self.ds.reindex(time=np.arange(0, ntime, 0.5), method="nearest").load()
+
+    def time_1d_fine_some_missing(self):
+        self.ds.reindex(
+            time=np.arange(0, ntime, 0.5), method="nearest", tolerance=0.1
+        ).load()
+
+    def time_2d_coarse(self):
+        self.ds.reindex(x=np.arange(0, nx, 2), y=np.arange(0, ny, 2)).load()
+
+    def time_2d_fine_all_found(self):
+        self.ds.reindex(
+            x=np.arange(0, nx, 0.5), y=np.arange(0, ny, 0.5), method="nearest"
+        ).load()
+
+    def time_2d_fine_some_missing(self):
+        self.ds.reindex(
+            x=np.arange(0, nx, 0.5),
+            y=np.arange(0, ny, 0.5),
+            method="nearest",
+            tolerance=0.1,
+        ).load()
+
+
+class ReindexDask(Reindex):
+    def setup(self):
+        requires_dask()
+        super().setup()
+        self.ds = self.ds.chunk({"time": 100})
diff --git a/asv_bench/benchmarks/repr.py b/asv_bench/benchmarks/repr.py
new file mode 100644
index 00000000000..4bf2ace352d
--- /dev/null
+++ b/asv_bench/benchmarks/repr.py
@@ -0,0 +1,40 @@
+import numpy as np
+import pandas as pd
+
+import xarray as xr
+
+
+class Repr:
+    def setup(self):
+        a = np.arange(0, 100)
+        data_vars = dict()
+        for i in a:
+            data_vars[f"long_variable_name_{i}"] = xr.DataArray(
+                name=f"long_variable_name_{i}",
+                data=np.arange(0, 20),
+                dims=[f"long_coord_name_{i}_x"],
+                coords={f"long_coord_name_{i}_x": np.arange(0, 20) * 2},
+            )
+        self.ds = xr.Dataset(data_vars)
+        self.ds.attrs = {f"attr_{k}": 2 for k in a}
+
+    def time_repr(self):
+        repr(self.ds)
+
+    def time_repr_html(self):
+        self.ds._repr_html_()
+
+
+class ReprMultiIndex:
+    def setup(self):
+        index = pd.MultiIndex.from_product(
+            [range(1000), range(1000)], names=("level_0", "level_1")
+        )
+        series = pd.Series(range(1000 * 1000), index=index)
+        self.da = xr.DataArray(series)
+
+    def time_repr(self):
+        repr(self.da)
+
+    def time_repr_html(self):
+        self.da._repr_html_()
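`time_rolling_construct` in the file below uses `Rolling.construct`, which materializes the sliding windows as an extra dimension instead of reducing them immediately. Roughly, for a 1-D array (an illustration, not part of this diff):

    import numpy as np
    import xarray as xr

    da = xr.DataArray(np.arange(5.0), dims="x")
    windowed = da.rolling(x=3).construct("window_dim")  # shape (5, 3), NaN-padded
    windowed.sum(dim="window_dim")  # the construct-then-reduce pattern timed below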
diff --git a/asv_bench/benchmarks/rolling.py b/asv_bench/benchmarks/rolling.py
new file mode 100644
index 00000000000..f0e18bf2153
--- /dev/null
+++ b/asv_bench/benchmarks/rolling.py
@@ -0,0 +1,110 @@
+import numpy as np
+import pandas as pd
+
+import xarray as xr
+
+from . import parameterized, randn, requires_dask
+
+nx = 300
+long_nx = 30000
+ny = 200
+nt = 100
+window = 20
+
+randn_xy = randn((nx, ny), frac_nan=0.1)
+randn_xt = randn((nx, nt))
+randn_t = randn((nt,))
+randn_long = randn((long_nx,), frac_nan=0.1)
+
+
+class Rolling:
+    def setup(self, *args, **kwargs):
+        self.ds = xr.Dataset(
+            {
+                "var1": (("x", "y"), randn_xy),
+                "var2": (("x", "t"), randn_xt),
+                "var3": (("t",), randn_t),
+            },
+            coords={
+                "x": np.arange(nx),
+                "y": np.linspace(0, 1, ny),
+                "t": pd.date_range("1970-01-01", periods=nt, freq="D"),
+                "x_coords": ("x", np.linspace(1.1, 2.1, nx)),
+            },
+        )
+        self.da_long = xr.DataArray(
+            randn_long, dims="x", coords={"x": np.arange(long_nx) * 0.1}
+        )
+
+    @parameterized(["func", "center"], (["mean", "count"], [True, False]))
+    def time_rolling(self, func, center):
+        getattr(self.ds.rolling(x=window, center=center), func)().load()
+
+    @parameterized(["func", "pandas"], (["mean", "count"], [True, False]))
+    def time_rolling_long(self, func, pandas):
+        if pandas:
+            se = self.da_long.to_series()
+            getattr(se.rolling(window=window, min_periods=window), func)()
+        else:
+            getattr(self.da_long.rolling(x=window, min_periods=window), func)().load()
+
+    @parameterized(["window_", "min_periods"], ([20, 40], [5, 5]))
+    def time_rolling_np(self, window_, min_periods):
+        self.ds.rolling(x=window_, center=False, min_periods=min_periods).reduce(
+            getattr(np, "nansum")
+        ).load()
+
+    @parameterized(["center", "stride"], ([True, False], [1, 1]))
+    def time_rolling_construct(self, center, stride):
+        self.ds.rolling(x=window, center=center).construct(
+            "window_dim", stride=stride
+        ).sum(dim="window_dim").load()
+
+
+class RollingDask(Rolling):
+    def setup(self, *args, **kwargs):
+        requires_dask()
+        super().setup(**kwargs)
+        self.ds = self.ds.chunk({"x": 100, "y": 50, "t": 50})
+        self.da_long = self.da_long.chunk({"x": 10000})
+
+
+class RollingMemory:
+    def setup(self, *args, **kwargs):
+        self.ds = xr.Dataset(
+            {
+                "var1": (("x", "y"), randn_xy),
+                "var2": (("x", "t"), randn_xt),
+                "var3": (("t",), randn_t),
+            },
+            coords={
+                "x": np.arange(nx),
+                "y": np.linspace(0, 1, ny),
+                "t": pd.date_range("1970-01-01", periods=nt, freq="D"),
+                "x_coords": ("x", np.linspace(1.1, 2.1, nx)),
+            },
+        )
+
+
+class DataArrayRollingMemory(RollingMemory):
+    @parameterized("func", ["sum", "max", "mean"])
+    def peakmem_ndrolling_reduce(self, func):
+        roll = self.ds.var1.rolling(x=10, y=4)
+        getattr(roll, func)()
+
+    @parameterized("func", ["sum", "max", "mean"])
+    def peakmem_1drolling_reduce(self, func):
+        roll = self.ds.var3.rolling(t=100)
+        getattr(roll, func)()
+
+
+class DatasetRollingMemory(RollingMemory):
+    @parameterized("func", ["sum", "max", "mean"])
+    def peakmem_ndrolling_reduce(self, func):
+        roll = self.ds.rolling(x=10, y=4)
+        getattr(roll, func)()
+
+    @parameterized("func", ["sum", "max", "mean"])
+    def peakmem_1drolling_reduce(self, func):
+        roll = self.ds.rolling(t=100)
+        getattr(roll, func)()
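The benchmarks below round-trip a stacked MultiIndex dimension: `stack(flat_dim=[...])` stacks all dimensions into one, and dropping the last element (`da_missing`) forces `unstack` down the slower path that must fill in missing index combinations. Roughly (an illustration, not part of this diff):

    import numpy as np
    import xarray as xr

    da = xr.DataArray(np.arange(6).reshape(2, 3), dims=("a", "b"))
    flat = da.stack(flat_dim=[...])  # MultiIndex dimension of length 6
    flat.unstack("flat_dim")         # restores the original (2, 3) shape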
diff --git a/asv_bench/benchmarks/unstacking.py b/asv_bench/benchmarks/unstacking.py
new file mode 100644
index 00000000000..2c5b7ca7821
--- /dev/null
+++ b/asv_bench/benchmarks/unstacking.py
@@ -0,0 +1,29 @@
+import numpy as np
+
+import xarray as xr
+
+from . import requires_dask
+
+
+class Unstacking:
+    def setup(self):
+        data = np.random.RandomState(0).randn(250, 500)
+        self.da_full = xr.DataArray(data, dims=list("ab")).stack(flat_dim=[...])
+        self.da_missing = self.da_full[:-1]
+        self.df_missing = self.da_missing.to_pandas()
+
+    def time_unstack_fast(self):
+        self.da_full.unstack("flat_dim")
+
+    def time_unstack_slow(self):
+        self.da_missing.unstack("flat_dim")
+
+    def time_unstack_pandas_slow(self):
+        self.df_missing.unstack()
+
+
+class UnstackingDask(Unstacking):
+    def setup(self, *args, **kwargs):
+        requires_dask()
+        super().setup(**kwargs)
+        self.da_full = self.da_full.chunk({"flat_dim": 25})
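Assuming these files sit under `asv_bench/benchmarks/` in an already-configured asv project, individual suites can be exercised with asv's benchmark filter, e.g. `asv run --bench Unstacking`.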