diff --git a/asv_bench/benchmarks/combine.py b/asv_bench/benchmarks/combine.py
new file mode 100644
index 00000000000..a4f8db2786b
--- /dev/null
+++ b/asv_bench/benchmarks/combine.py
@@ -0,0 +1,38 @@
+import numpy as np
+
+import xarray as xr
+
+
+class Combine:
+    """Benchmark concatenating and merging large datasets"""
+
+    def setup(self):
+        """Create 4 datasets with two different variables"""
+
+        t_size, x_size, y_size = 50, 450, 400
+        t = np.arange(t_size)
+        data = np.random.randn(t_size, x_size, y_size)
+
+        self.dsA0 = xr.Dataset(
+            {"A": xr.DataArray(data, coords={"T": t}, dims=("T", "X", "Y"))}
+        )
+        self.dsA1 = xr.Dataset(
+            {"A": xr.DataArray(data, coords={"T": t + t_size}, dims=("T", "X", "Y"))}
+        )
+        self.dsB0 = xr.Dataset(
+            {"B": xr.DataArray(data, coords={"T": t}, dims=("T", "X", "Y"))}
+        )
+        self.dsB1 = xr.Dataset(
+            {"B": xr.DataArray(data, coords={"T": t + t_size}, dims=("T", "X", "Y"))}
+        )
+
+    def time_combine_nested(self):
+        datasets = [[self.dsA0, self.dsA1], [self.dsB0, self.dsB1]]
+
+        xr.combine_nested(datasets, concat_dim=[None, "T"])
+
+    def time_combine_by_coords(self):
+        """Also has to load and arrange t coordinate"""
+        datasets = [self.dsA0, self.dsA1, self.dsB0, self.dsB1]
+
+        xr.combine_by_coords(datasets)
diff --git a/asv_bench/benchmarks/dataarray_missing.py b/asv_bench/benchmarks/dataarray_missing.py
new file mode 100644
index 00000000000..f89fe7f8eb9
--- /dev/null
+++ b/asv_bench/benchmarks/dataarray_missing.py
@@ -0,0 +1,80 @@
+import pandas as pd
+
+import xarray as xr
+
+from . import parameterized, randn, requires_dask
+
+
+def make_bench_data(shape, frac_nan, chunks):
+    vals = randn(shape, frac_nan)
+    coords = {"time": pd.date_range("2000-01-01", freq="D", periods=shape[0])}
+    da = xr.DataArray(vals, dims=("time", "x", "y"), coords=coords)
+
+    if chunks is not None:
+        da = da.chunk(chunks)
+
+    return da
+
+
+def requires_bottleneck():
+    try:
+        import bottleneck  # noqa: F401
+    except ImportError:
+        raise NotImplementedError()
+
+
+class DataArrayMissingInterpolateNA:
+    def setup(self, shape, chunks, limit):
+        if chunks is not None:
+            requires_dask()
+        self.da = make_bench_data(shape, 0.1, chunks)
+
+    @parameterized(
+        ["shape", "chunks", "limit"],
+        (
+            [(365, 75, 75)],
+            [None, {"x": 25, "y": 25}],
+            [None, 3],
+        ),
+    )
+    def time_interpolate_na(self, shape, chunks, limit):
+        actual = self.da.interpolate_na(dim="time", method="linear", limit=limit)
+
+        if chunks is not None:
+            actual = actual.compute()
+
+
+class DataArrayMissingBottleneck:
+    def setup(self, shape, chunks, limit):
+        requires_bottleneck()
+        if chunks is not None:
+            requires_dask()
+        self.da = make_bench_data(shape, 0.1, chunks)
+
+    @parameterized(
+        ["shape", "chunks", "limit"],
+        (
+            [(365, 75, 75)],
+            [None, {"x": 25, "y": 25}],
+            [None, 3],
+        ),
+    )
+    def time_ffill(self, shape, chunks, limit):
+        actual = self.da.ffill(dim="time", limit=limit)
+
+        if chunks is not None:
+            actual = actual.compute()
+
+    @parameterized(
+        ["shape", "chunks", "limit"],
+        (
+            [(365, 75, 75)],
+            [None, {"x": 25, "y": 25}],
+            [None, 3],
+        ),
+    )
+    def time_bfill(self, shape, chunks, limit):
+        actual = self.da.bfill(dim="time", limit=limit)
+
+        if chunks is not None:
+            actual = actual.compute()
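The benchmark modules in this diff import shared helpers (`randn`, `randint`, `parameterized`, `requires_dask`, `_skip_slow`) from the benchmarks package `__init__`, which is not part of this patch. For reference, a minimal sketch of what those helpers are assumed to look like (asv skips a benchmark whose setup raises NotImplementedError; the exact signatures and the environment-variable name are assumptions):

    import os

    import numpy as np


    def parameterized(names, params):
        # Attach asv parameter metadata to a benchmark method.
        def decorator(func):
            func.param_names = names
            func.params = params
            return func

        return decorator


    def randn(shape, frac_nan=None, seed=0):
        # Seeded normal values, optionally with a fraction of NaNs inserted.
        rng = np.random.RandomState(seed)
        x = rng.standard_normal(shape)
        if frac_nan is not None:
            inds = rng.choice(x.size, int(x.size * frac_nan), replace=False)
            x.flat[inds] = np.nan
        return x


    def randint(low, high=None, size=None, seed=0):
        return np.random.RandomState(seed).randint(low, high, size)


    def requires_dask():
        # Raising NotImplementedError in setup tells asv to skip the benchmark.
        try:
            import dask  # noqa: F401
        except ImportError:
            raise NotImplementedError()


    def _skip_slow():
        # Assumed: skip demanding benchmarks unless explicitly requested.
        if os.environ.get("ASV_SKIP_SLOW", "1") == "1":
            raise NotImplementedError("skipping slow benchmark")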
diff --git a/asv_bench/benchmarks/dataset_io.py b/asv_bench/benchmarks/dataset_io.py
new file mode 100644
index 00000000000..6c2e15c54e9
--- /dev/null
+++ b/asv_bench/benchmarks/dataset_io.py
@@ -0,0 +1,478 @@
+import os
+
+import numpy as np
+import pandas as pd
+
+import xarray as xr
+
+from . import _skip_slow, randint, randn, requires_dask
+
+try:
+    import dask
+    import dask.multiprocessing
+except ImportError:
+    pass
+
+
+os.environ["HDF5_USE_FILE_LOCKING"] = "FALSE"
+
+
+class IOSingleNetCDF:
+    """
+    A few examples that benchmark reading/writing a single netCDF file with
+    xarray
+    """
+
+    timeout = 300.0
+    repeat = 1
+    number = 5
+
+    def make_ds(self):
+        # TODO: Lazily skipped in CI as it is very demanding and slow.
+        # Improve times and remove errors.
+        _skip_slow()
+
+        # single Dataset
+        self.ds = xr.Dataset()
+        self.nt = 1000
+        self.nx = 90
+        self.ny = 45
+
+        self.block_chunks = {
+            "time": self.nt / 4,
+            "lon": self.nx / 3,
+            "lat": self.ny / 3,
+        }
+
+        self.time_chunks = {"time": int(self.nt / 36)}
+
+        times = pd.date_range("1970-01-01", periods=self.nt, freq="D")
+        lons = xr.DataArray(
+            np.linspace(0, 360, self.nx),
+            dims=("lon",),
+            attrs={"units": "degrees east", "long_name": "longitude"},
+        )
+        lats = xr.DataArray(
+            np.linspace(-90, 90, self.ny),
+            dims=("lat",),
+            attrs={"units": "degrees north", "long_name": "latitude"},
+        )
+        self.ds["foo"] = xr.DataArray(
+            randn((self.nt, self.nx, self.ny), frac_nan=0.2),
+            coords={"lon": lons, "lat": lats, "time": times},
+            dims=("time", "lon", "lat"),
+            name="foo",
+            attrs={"units": "foo units", "description": "a description"},
+        )
+        self.ds["bar"] = xr.DataArray(
+            randn((self.nt, self.nx, self.ny), frac_nan=0.2),
+            coords={"lon": lons, "lat": lats, "time": times},
+            dims=("time", "lon", "lat"),
+            name="bar",
+            attrs={"units": "bar units", "description": "a description"},
+        )
+        self.ds["baz"] = xr.DataArray(
+            randn((self.nx, self.ny), frac_nan=0.2).astype(np.float32),
+            coords={"lon": lons, "lat": lats},
+            dims=("lon", "lat"),
+            name="baz",
+            attrs={"units": "baz units", "description": "a description"},
+        )
+
+        self.ds.attrs = {"history": "created for xarray benchmarking"}
+
+        self.oinds = {
+            "time": randint(0, self.nt, 120),
+            "lon": randint(0, self.nx, 20),
+            "lat": randint(0, self.ny, 10),
+        }
+        self.vinds = {
+            "time": xr.DataArray(randint(0, self.nt, 120), dims="x"),
+            "lon": xr.DataArray(randint(0, self.nx, 120), dims="x"),
+            "lat": slice(3, 20),
+        }
+
+
+class IOWriteSingleNetCDF3(IOSingleNetCDF):
+    def setup(self):
+        self.format = "NETCDF3_64BIT"
+        self.make_ds()
+
+    def time_write_dataset_netcdf4(self):
+        self.ds.to_netcdf("test_netcdf4_write.nc", engine="netcdf4", format=self.format)
+
+    def time_write_dataset_scipy(self):
+        self.ds.to_netcdf("test_scipy_write.nc", engine="scipy", format=self.format)
+
+
+class IOReadSingleNetCDF4(IOSingleNetCDF):
+    def setup(self):
+
+        self.make_ds()
+
+        self.filepath = "test_single_file.nc4.nc"
+        self.format = "NETCDF4"
+        self.ds.to_netcdf(self.filepath, format=self.format)
+
+    def time_load_dataset_netcdf4(self):
+        xr.open_dataset(self.filepath, engine="netcdf4").load()
+
+    def time_orthogonal_indexing(self):
+        ds = xr.open_dataset(self.filepath, engine="netcdf4")
+        ds = ds.isel(**self.oinds).load()
+
+    def time_vectorized_indexing(self):
+        ds = xr.open_dataset(self.filepath, engine="netcdf4")
+        ds = ds.isel(**self.vinds).load()
+
+
+class IOReadSingleNetCDF3(IOReadSingleNetCDF4):
+    def setup(self):
+
+        self.make_ds()
+
+        self.filepath = "test_single_file.nc3.nc"
+        self.format = "NETCDF3_64BIT"
+        self.ds.to_netcdf(self.filepath, format=self.format)
+
+    def time_load_dataset_scipy(self):
+        xr.open_dataset(self.filepath, engine="scipy").load()
+
+    def time_orthogonal_indexing(self):
+        ds = xr.open_dataset(self.filepath, engine="scipy")
+        ds = ds.isel(**self.oinds).load()
+
+    def time_vectorized_indexing(self):
+        ds = xr.open_dataset(self.filepath, engine="scipy")
+        ds = ds.isel(**self.vinds).load()
+
+
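+# The Dask variants below repeat the single-file reads with ``chunks=`` so the
+# reads go through dask arrays; the "multiprocessing" cases additionally run
+# the load under dask's process-based scheduler instead of the threaded default.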
+class IOReadSingleNetCDF4Dask(IOSingleNetCDF):
+    def setup(self):
+
+        requires_dask()
+
+        self.make_ds()
+
+        self.filepath = "test_single_file.nc4.nc"
+        self.format = "NETCDF4"
+        self.ds.to_netcdf(self.filepath, format=self.format)
+
+    def time_load_dataset_netcdf4_with_block_chunks(self):
+        xr.open_dataset(
+            self.filepath, engine="netcdf4", chunks=self.block_chunks
+        ).load()
+
+    def time_load_dataset_netcdf4_with_block_chunks_oindexing(self):
+        ds = xr.open_dataset(self.filepath, engine="netcdf4", chunks=self.block_chunks)
+        ds = ds.isel(**self.oinds).load()
+
+    def time_load_dataset_netcdf4_with_block_chunks_vindexing(self):
+        ds = xr.open_dataset(self.filepath, engine="netcdf4", chunks=self.block_chunks)
+        ds = ds.isel(**self.vinds).load()
+
+    def time_load_dataset_netcdf4_with_block_chunks_multiprocessing(self):
+        with dask.config.set(scheduler="multiprocessing"):
+            xr.open_dataset(
+                self.filepath, engine="netcdf4", chunks=self.block_chunks
+            ).load()
+
+    def time_load_dataset_netcdf4_with_time_chunks(self):
+        xr.open_dataset(self.filepath, engine="netcdf4", chunks=self.time_chunks).load()
+
+    def time_load_dataset_netcdf4_with_time_chunks_multiprocessing(self):
+        with dask.config.set(scheduler="multiprocessing"):
+            xr.open_dataset(
+                self.filepath, engine="netcdf4", chunks=self.time_chunks
+            ).load()
+
+
+class IOReadSingleNetCDF3Dask(IOReadSingleNetCDF4Dask):
+    def setup(self):
+
+        requires_dask()
+
+        self.make_ds()
+
+        self.filepath = "test_single_file.nc3.nc"
+        self.format = "NETCDF3_64BIT"
+        self.ds.to_netcdf(self.filepath, format=self.format)
+
+    def time_load_dataset_scipy_with_block_chunks(self):
+        with dask.config.set(scheduler="multiprocessing"):
+            xr.open_dataset(
+                self.filepath, engine="scipy", chunks=self.block_chunks
+            ).load()
+
+    def time_load_dataset_scipy_with_block_chunks_oindexing(self):
+        ds = xr.open_dataset(self.filepath, engine="scipy", chunks=self.block_chunks)
+        ds = ds.isel(**self.oinds).load()
+
+    def time_load_dataset_scipy_with_block_chunks_vindexing(self):
+        ds = xr.open_dataset(self.filepath, engine="scipy", chunks=self.block_chunks)
+        ds = ds.isel(**self.vinds).load()
+
+    def time_load_dataset_scipy_with_time_chunks(self):
+        with dask.config.set(scheduler="multiprocessing"):
+            xr.open_dataset(
+                self.filepath, engine="scipy", chunks=self.time_chunks
+            ).load()
+
+
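+# The multi-file variants below split the same dataset into ``nfiles`` pieces
+# along ``time``, write them with ``save_mfdataset`` and read them back with
+# ``open_mfdataset``.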
+class IOMultipleNetCDF:
+    """
+    A few examples that benchmark reading/writing multiple netCDF files with
+    xarray
+    """
+
+    timeout = 300.0
+    repeat = 1
+    number = 5
+
+    def make_ds(self, nfiles=10):
+        # TODO: Lazily skipped in CI as it is very demanding and slow.
+        # Improve times and remove errors.
+        _skip_slow()
+
+        # multiple Dataset
+        self.ds = xr.Dataset()
+        self.nt = 1000
+        self.nx = 90
+        self.ny = 45
+        self.nfiles = nfiles
+
+        self.block_chunks = {
+            "time": self.nt / 4,
+            "lon": self.nx / 3,
+            "lat": self.ny / 3,
+        }
+
+        self.time_chunks = {"time": int(self.nt / 36)}
+
+        self.time_vars = np.split(
+            pd.date_range("1970-01-01", periods=self.nt, freq="D"), self.nfiles
+        )
+
+        self.ds_list = []
+        self.filenames_list = []
+        for i, times in enumerate(self.time_vars):
+            ds = xr.Dataset()
+            nt = len(times)
+            lons = xr.DataArray(
+                np.linspace(0, 360, self.nx),
+                dims=("lon",),
+                attrs={"units": "degrees east", "long_name": "longitude"},
+            )
+            lats = xr.DataArray(
+                np.linspace(-90, 90, self.ny),
+                dims=("lat",),
+                attrs={"units": "degrees north", "long_name": "latitude"},
+            )
+            ds["foo"] = xr.DataArray(
+                randn((nt, self.nx, self.ny), frac_nan=0.2),
+                coords={"lon": lons, "lat": lats, "time": times},
+                dims=("time", "lon", "lat"),
+                name="foo",
+                attrs={"units": "foo units", "description": "a description"},
+            )
+            ds["bar"] = xr.DataArray(
+                randn((nt, self.nx, self.ny), frac_nan=0.2),
+                coords={"lon": lons, "lat": lats, "time": times},
+                dims=("time", "lon", "lat"),
+                name="bar",
+                attrs={"units": "bar units", "description": "a description"},
+            )
+            ds["baz"] = xr.DataArray(
+                randn((self.nx, self.ny), frac_nan=0.2).astype(np.float32),
+                coords={"lon": lons, "lat": lats},
+                dims=("lon", "lat"),
+                name="baz",
+                attrs={"units": "baz units", "description": "a description"},
+            )
+
+            ds.attrs = {"history": "created for xarray benchmarking"}
+
+            self.ds_list.append(ds)
+            self.filenames_list.append("test_netcdf_%i.nc" % i)
+
+
+class IOWriteMultipleNetCDF3(IOMultipleNetCDF):
+    def setup(self):
+        self.make_ds()
+        self.format = "NETCDF3_64BIT"
+
+    def time_write_dataset_netcdf4(self):
+        xr.save_mfdataset(
+            self.ds_list, self.filenames_list, engine="netcdf4", format=self.format
+        )
+
+    def time_write_dataset_scipy(self):
+        xr.save_mfdataset(
+            self.ds_list, self.filenames_list, engine="scipy", format=self.format
+        )
+
+
+class IOReadMultipleNetCDF4(IOMultipleNetCDF):
+    def setup(self):
+
+        requires_dask()
+
+        self.make_ds()
+        self.format = "NETCDF4"
+        xr.save_mfdataset(self.ds_list, self.filenames_list, format=self.format)
+
+    def time_load_dataset_netcdf4(self):
+        xr.open_mfdataset(self.filenames_list, engine="netcdf4").load()
+
+    def time_open_dataset_netcdf4(self):
+        xr.open_mfdataset(self.filenames_list, engine="netcdf4")
+
+
+class IOReadMultipleNetCDF3(IOReadMultipleNetCDF4):
+    def setup(self):
+
+        requires_dask()
+
+        self.make_ds()
+        self.format = "NETCDF3_64BIT"
+        xr.save_mfdataset(self.ds_list, self.filenames_list, format=self.format)
+
+    def time_load_dataset_scipy(self):
+        xr.open_mfdataset(self.filenames_list, engine="scipy").load()
+
+    def time_open_dataset_scipy(self):
+        xr.open_mfdataset(self.filenames_list, engine="scipy")
+
+
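+# Note the open/load split in these benchmarks: ``open_mfdataset`` on its own
+# only reads metadata and builds the lazy dask graph, while ``.load()`` forces
+# the full read, so ``time_open_*`` and ``time_load_*`` measure different costs.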
+class IOReadMultipleNetCDF4Dask(IOMultipleNetCDF):
+    def setup(self):
+
+        requires_dask()
+
+        self.make_ds()
+        self.format = "NETCDF4"
+        xr.save_mfdataset(self.ds_list, self.filenames_list, format=self.format)
+
+    def time_load_dataset_netcdf4_with_block_chunks(self):
+        xr.open_mfdataset(
+            self.filenames_list, engine="netcdf4", chunks=self.block_chunks
+        ).load()
+
+    def time_load_dataset_netcdf4_with_block_chunks_multiprocessing(self):
+        with dask.config.set(scheduler="multiprocessing"):
+            xr.open_mfdataset(
+                self.filenames_list, engine="netcdf4", chunks=self.block_chunks
+            ).load()
+
+    def time_load_dataset_netcdf4_with_time_chunks(self):
+        xr.open_mfdataset(
+            self.filenames_list, engine="netcdf4", chunks=self.time_chunks
+        ).load()
+
+    def time_load_dataset_netcdf4_with_time_chunks_multiprocessing(self):
+        with dask.config.set(scheduler="multiprocessing"):
+            xr.open_mfdataset(
+                self.filenames_list, engine="netcdf4", chunks=self.time_chunks
+            ).load()
+
+    def time_open_dataset_netcdf4_with_block_chunks(self):
+        xr.open_mfdataset(
+            self.filenames_list, engine="netcdf4", chunks=self.block_chunks
+        )
+
+    def time_open_dataset_netcdf4_with_block_chunks_multiprocessing(self):
+        with dask.config.set(scheduler="multiprocessing"):
+            xr.open_mfdataset(
+                self.filenames_list, engine="netcdf4", chunks=self.block_chunks
+            )
+
+    def time_open_dataset_netcdf4_with_time_chunks(self):
+        xr.open_mfdataset(
+            self.filenames_list, engine="netcdf4", chunks=self.time_chunks
+        )
+
+    def time_open_dataset_netcdf4_with_time_chunks_multiprocessing(self):
+        with dask.config.set(scheduler="multiprocessing"):
+            xr.open_mfdataset(
+                self.filenames_list, engine="netcdf4", chunks=self.time_chunks
+            )
+
+
+class IOReadMultipleNetCDF3Dask(IOReadMultipleNetCDF4Dask):
+    def setup(self):
+
+        requires_dask()
+
+        self.make_ds()
+        self.format = "NETCDF3_64BIT"
+        xr.save_mfdataset(self.ds_list, self.filenames_list, format=self.format)
+
+    def time_load_dataset_scipy_with_block_chunks(self):
+        with dask.config.set(scheduler="multiprocessing"):
+            xr.open_mfdataset(
+                self.filenames_list, engine="scipy", chunks=self.block_chunks
+            ).load()
+
+    def time_load_dataset_scipy_with_time_chunks(self):
+        with dask.config.set(scheduler="multiprocessing"):
+            xr.open_mfdataset(
+                self.filenames_list, engine="scipy", chunks=self.time_chunks
+            ).load()
+
+    def time_open_dataset_scipy_with_block_chunks(self):
+        with dask.config.set(scheduler="multiprocessing"):
+            xr.open_mfdataset(
+                self.filenames_list, engine="scipy", chunks=self.block_chunks
+            )
+
+    def time_open_dataset_scipy_with_time_chunks(self):
+        with dask.config.set(scheduler="multiprocessing"):
+            xr.open_mfdataset(
+                self.filenames_list, engine="scipy", chunks=self.time_chunks
+            )
+
+
+def create_delayed_write():
+    import dask.array as da
+
+    # TODO: Lazily skipped in CI as it is very demanding and slow.
+    # Improve times and remove errors.
+    _skip_slow()
+
+    vals = da.random.random(300, chunks=(1,))
+    ds = xr.Dataset({"vals": (["a"], vals)})
+    return ds.to_netcdf("file.nc", engine="netcdf4", compute=False)
+
+
+class IOWriteNetCDFDask:
+    timeout = 60
+    repeat = 1
+    number = 5
+
+    def setup(self):
+        requires_dask()
+        self.write = create_delayed_write()
+
+    def time_write(self):
+        self.write.compute()
+
+
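+# Same delayed write as above, but executed through a ``distributed.Client``;
+# instantiating the client starts a local cluster, and the ``.compute()`` in
+# ``time_write`` then runs the write on its workers.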
+class IOWriteNetCDFDaskDistributed:
+    def setup(self):
+        try:
+            import distributed
+        except ImportError:
+            raise NotImplementedError()
+
+        # TODO: Lazily skipped in CI as it is very demanding and slow.
+        # Improve times and remove errors.
+        _skip_slow()
+
+        self.client = distributed.Client()
+        self.write = create_delayed_write()
+
+    def cleanup(self):
+        self.client.shutdown()
+
+    def time_write(self):
+        self.write.compute()
diff --git a/asv_bench/benchmarks/import_xarray.py b/asv_bench/benchmarks/import_xarray.py
new file mode 100644
index 00000000000..94652e3b82a
--- /dev/null
+++ b/asv_bench/benchmarks/import_xarray.py
@@ -0,0 +1,9 @@
+class ImportXarray:
+    def setup(self, *args, **kwargs):
+        def import_xr():
+            import xarray  # noqa: F401
+
+        self._import_xr = import_xr
+
+    def time_import_xarray(self):
+        self._import_xr()
diff --git a/asv_bench/benchmarks/indexing.py b/asv_bench/benchmarks/indexing.py
new file mode 100644
index 00000000000..15212ec0c61
--- /dev/null
+++ b/asv_bench/benchmarks/indexing.py
@@ -0,0 +1,149 @@
+import os
+
+import numpy as np
+import pandas as pd
+
+import xarray as xr
+
+from . import parameterized, randint, randn, requires_dask
+
+nx = 2000
+ny = 1000
+nt = 500
+
+basic_indexes = {
+    "1slice": {"x": slice(0, 3)},
+    "1slice-1scalar": {"x": 0, "y": slice(None, None, 3)},
+    "2slices-1scalar": {"x": slice(3, -3, 3), "y": 1, "t": slice(None, -3, 3)},
+}
+
+basic_assignment_values = {
+    "1slice": xr.DataArray(randn((3, ny), frac_nan=0.1), dims=["x", "y"]),
+    "1slice-1scalar": xr.DataArray(randn(int(ny / 3) + 1, frac_nan=0.1), dims=["y"]),
+    "2slices-1scalar": xr.DataArray(
+        randn(np.empty(nx)[slice(3, -3, 3)].size, frac_nan=0.1), dims=["x"]
+    ),
+}
+
+outer_indexes = {
+    "1d": {"x": randint(0, nx, 400)},
+    "2d": {"x": randint(0, nx, 500), "y": randint(0, ny, 400)},
+    "2d-1scalar": {"x": randint(0, nx, 100), "y": 1, "t": randint(0, nt, 400)},
+}
+
+outer_assignment_values = {
+    "1d": xr.DataArray(randn((400, ny), frac_nan=0.1), dims=["x", "y"]),
+    "2d": xr.DataArray(randn((500, 400), frac_nan=0.1), dims=["x", "y"]),
+    "2d-1scalar": xr.DataArray(randn(100, frac_nan=0.1), dims=["x"]),
+}
+
+vectorized_indexes = {
+    "1-1d": {"x": xr.DataArray(randint(0, nx, 400), dims="a")},
+    "2-1d": {
+        "x": xr.DataArray(randint(0, nx, 400), dims="a"),
+        "y": xr.DataArray(randint(0, ny, 400), dims="a"),
+    },
+    "3-2d": {
+        "x": xr.DataArray(randint(0, nx, 400).reshape(4, 100), dims=["a", "b"]),
+        "y": xr.DataArray(randint(0, ny, 400).reshape(4, 100), dims=["a", "b"]),
+        "t": xr.DataArray(randint(0, nt, 400).reshape(4, 100), dims=["a", "b"]),
+    },
+}
+
+vectorized_assignment_values = {
+    "1-1d": xr.DataArray(randn((400, ny)), dims=["a", "y"], coords={"a": randn(400)}),
+    "2-1d": xr.DataArray(randn(400), dims=["a"], coords={"a": randn(400)}),
+    "3-2d": xr.DataArray(
+        randn((4, 100)), dims=["a", "b"], coords={"a": randn(4), "b": randn(100)}
+    ),
+}
+
+
+class Base:
+    def setup(self, key):
+        self.ds = xr.Dataset(
+            {
+                "var1": (("x", "y"), randn((nx, ny), frac_nan=0.1)),
+                "var2": (("x", "t"), randn((nx, nt))),
+                "var3": (("t",), randn(nt)),
+            },
+            coords={
+                "x": np.arange(nx),
+                "y": np.linspace(0, 1, ny),
+                "t": pd.date_range("1970-01-01", periods=nt, freq="D"),
+                "x_coords": ("x", np.linspace(1.1, 2.1, nx)),
+            },
+        )
+
+
+class Indexing(Base):
+    @parameterized(["key"], [list(basic_indexes.keys())])
+    def time_indexing_basic(self, key):
+        self.ds.isel(**basic_indexes[key]).load()
+
+    @parameterized(["key"], [list(outer_indexes.keys())])
+    def time_indexing_outer(self, key):
+        self.ds.isel(**outer_indexes[key]).load()
+
+    @parameterized(["key"], [list(vectorized_indexes.keys())])
+    def time_indexing_vectorized(self, key):
+        self.ds.isel(**vectorized_indexes[key]).load()
+
+
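+# The assignment benchmarks below write through positional indexing: the "x"
+# and "y" indexers are applied in dimension order, and the assigned DataArray
+# values are matched to the indexed region by dimension name.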
+class Assignment(Base):
+    @parameterized(["key"], [list(basic_indexes.keys())])
+    def time_assignment_basic(self, key):
+        ind = basic_indexes[key]
+        val = basic_assignment_values[key]
+        self.ds["var1"][ind.get("x", slice(None)), ind.get("y", slice(None))] = val
+
+    @parameterized(["key"], [list(outer_indexes.keys())])
+    def time_assignment_outer(self, key):
+        ind = outer_indexes[key]
+        val = outer_assignment_values[key]
+        self.ds["var1"][ind.get("x", slice(None)), ind.get("y", slice(None))] = val
+
+    @parameterized(["key"], [list(vectorized_indexes.keys())])
+    def time_assignment_vectorized(self, key):
+        ind = vectorized_indexes[key]
+        val = vectorized_assignment_values[key]
+        self.ds["var1"][ind.get("x", slice(None)), ind.get("y", slice(None))] = val
+
+
+class IndexingDask(Indexing):
+    def setup(self, key):
+        requires_dask()
+        super().setup(key)
+        self.ds = self.ds.chunk({"x": 100, "y": 50, "t": 50})
+
+
+class BooleanIndexing:
+    # https://github.com/pydata/xarray/issues/2227
+    def setup(self):
+        self.ds = xr.Dataset(
+            {"a": ("time", np.arange(10_000_000))},
+            coords={"time": np.arange(10_000_000)},
+        )
+        self.time_filter = self.ds.time > 50_000
+
+    def time_indexing(self):
+        self.ds.isel(time=self.time_filter)
+
+
+class HugeAxisSmallSliceIndexing:
+    # https://github.com/pydata/xarray/pull/4560
+    def setup(self):
+        self.filepath = "test_indexing_huge_axis_small_slice.nc"
+        if not os.path.isfile(self.filepath):
+            xr.Dataset(
+                {"a": ("x", np.arange(10_000_000))},
+                coords={"x": np.arange(10_000_000)},
+            ).to_netcdf(self.filepath, format="NETCDF4")
+
+        self.ds = xr.open_dataset(self.filepath)
+
+    def time_indexing(self):
+        self.ds.isel(x=slice(100))
+
+    def cleanup(self):
+        self.ds.close()
diff --git a/asv_bench/benchmarks/interp.py b/asv_bench/benchmarks/interp.py
new file mode 100644
index 00000000000..4b6691bcc0a
--- /dev/null
+++ b/asv_bench/benchmarks/interp.py
@@ -0,0 +1,51 @@
+import numpy as np
+import pandas as pd
+
+import xarray as xr
+
+from . import parameterized, randn, requires_dask
+
+nx = 1500
+ny = 1000
+nt = 500
+
+randn_xy = randn((nx, ny), frac_nan=0.1)
+randn_xt = randn((nx, nt))
+randn_t = randn((nt,))
+
+new_x_short = np.linspace(0.3 * nx, 0.7 * nx, 100)
+new_x_long = np.linspace(0.3 * nx, 0.7 * nx, 500)
+new_y_long = np.linspace(0.1, 0.9, 500)
+
+
+class Interpolation:
+    def setup(self, *args, **kwargs):
+        self.ds = xr.Dataset(
+            {
+                "var1": (("x", "y"), randn_xy),
+                "var2": (("x", "t"), randn_xt),
+                "var3": (("t",), randn_t),
+            },
+            coords={
+                "x": np.arange(nx),
+                "y": np.linspace(0, 1, ny),
+                "t": pd.date_range("1970-01-01", periods=nt, freq="D"),
+                "x_coords": ("x", np.linspace(1.1, 2.1, nx)),
+            },
+        )
+
+    @parameterized(["method", "is_short"], (["linear", "cubic"], [True, False]))
+    def time_interpolation(self, method, is_short):
+        new_x = new_x_short if is_short else new_x_long
+        self.ds.interp(x=new_x, method=method).load()
+
+    @parameterized(["method"], (["linear", "nearest"]))
+    def time_interpolation_2d(self, method):
+        self.ds.interp(x=new_x_long, y=new_y_long, method=method).load()
+
+
+class InterpolationDask(Interpolation):
+    def setup(self, *args, **kwargs):
+        requires_dask()
+        super().setup(**kwargs)
+        self.ds = self.ds.chunk({"t": 50})
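The benchmark below converts a MultiIndexed pandas Series into a DataArray, where each index level becomes a dimension (the `subset` case drops rows, so the missing index combinations come back as NaN). A minimal illustration of the conversion, not part of this diff:

    import pandas as pd
    import xarray as xr

    idx = pd.MultiIndex.from_product([list("ab"), [0, 1]], names=("k", "n"))
    s = pd.Series(range(4), index=idx)
    xr.DataArray.from_series(s)  # 2x2 DataArray with dims ("k", "n")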
diff --git a/asv_bench/benchmarks/pandas.py b/asv_bench/benchmarks/pandas.py
new file mode 100644
index 00000000000..8aaa515d417
--- /dev/null
+++ b/asv_bench/benchmarks/pandas.py
@@ -0,0 +1,26 @@
+import numpy as np
+import pandas as pd
+
+import xarray as xr
+
+from . import parameterized
+
+
+class MultiIndexSeries:
+    def setup(self, dtype, subset):
+        data = np.random.rand(100000).astype(dtype)
+        index = pd.MultiIndex.from_product(
+            [
+                list("abcdefhijk"),
+                list("abcdefhijk"),
+                pd.date_range(start="2000-01-01", periods=1000, freq="B"),
+            ]
+        )
+        series = pd.Series(data, index)
+        if subset:
+            series = series[::3]
+        self.series = series
+
+    @parameterized(["dtype", "subset"], ([int, float], [True, False]))
+    def time_from_series(self, dtype, subset):
+        xr.DataArray.from_series(self.series)
diff --git a/asv_bench/benchmarks/reindexing.py b/asv_bench/benchmarks/reindexing.py
new file mode 100644
index 00000000000..9d0767fc3b3
--- /dev/null
+++ b/asv_bench/benchmarks/reindexing.py
@@ -0,0 +1,52 @@
+import numpy as np
+
+import xarray as xr
+
+from . import requires_dask
+
+ntime = 500
+nx = 50
+ny = 50
+
+
+class Reindex:
+    def setup(self):
+        data = np.random.RandomState(0).randn(ntime, nx, ny)
+        self.ds = xr.Dataset(
+            {"temperature": (("time", "x", "y"), data)},
+            coords={"time": np.arange(ntime), "x": np.arange(nx), "y": np.arange(ny)},
+        )
+
+    def time_1d_coarse(self):
+        self.ds.reindex(time=np.arange(0, ntime, 5)).load()
+
+    def time_1d_fine_all_found(self):
+        self.ds.reindex(time=np.arange(0, ntime, 0.5), method="nearest").load()
+
+    def time_1d_fine_some_missing(self):
+        self.ds.reindex(
+            time=np.arange(0, ntime, 0.5), method="nearest", tolerance=0.1
+        ).load()
+
+    def time_2d_coarse(self):
+        self.ds.reindex(x=np.arange(0, nx, 2), y=np.arange(0, ny, 2)).load()
+
+    def time_2d_fine_all_found(self):
+        self.ds.reindex(
+            x=np.arange(0, nx, 0.5), y=np.arange(0, ny, 0.5), method="nearest"
+        ).load()
+
+    def time_2d_fine_some_missing(self):
+        self.ds.reindex(
+            x=np.arange(0, nx, 0.5),
+            y=np.arange(0, ny, 0.5),
+            method="nearest",
+            tolerance=0.1,
+        ).load()
+
+
+class ReindexDask(Reindex):
+    def setup(self):
+        requires_dask()
+        super().setup()
+        self.ds = self.ds.chunk({"time": 100})
diff --git a/asv_bench/benchmarks/repr.py b/asv_bench/benchmarks/repr.py
new file mode 100644
index 00000000000..4bf2ace352d
--- /dev/null
+++ b/asv_bench/benchmarks/repr.py
@@ -0,0 +1,40 @@
+import numpy as np
+import pandas as pd
+
+import xarray as xr
+
+
+class Repr:
+    def setup(self):
+        a = np.arange(0, 100)
+        data_vars = dict()
+        for i in a:
+            data_vars[f"long_variable_name_{i}"] = xr.DataArray(
+                name=f"long_variable_name_{i}",
+                data=np.arange(0, 20),
+                dims=[f"long_coord_name_{i}_x"],
+                coords={f"long_coord_name_{i}_x": np.arange(0, 20) * 2},
+            )
+        self.ds = xr.Dataset(data_vars)
+        self.ds.attrs = {f"attr_{k}": 2 for k in a}
+
+    def time_repr(self):
+        repr(self.ds)
+
+    def time_repr_html(self):
+        self.ds._repr_html_()
+
+
+class ReprMultiIndex:
+    def setup(self):
+        index = pd.MultiIndex.from_product(
+            [range(1000), range(1000)], names=("level_0", "level_1")
+        )
+        series = pd.Series(range(1000 * 1000), index=index)
+        self.da = xr.DataArray(series)
+
+    def time_repr(self):
+        repr(self.da)
+
+    def time_repr_html(self):
+        self.da._repr_html_()
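`time_rolling_construct` in the file below uses `Rolling.construct`, which materializes the sliding windows as an extra dimension instead of reducing them immediately. Roughly, for a 1-D array (an illustration, not part of this diff):

    import numpy as np
    import xarray as xr

    da = xr.DataArray(np.arange(5.0), dims="x")
    windowed = da.rolling(x=3).construct("window_dim")  # shape (5, 3), NaN-padded
    windowed.sum(dim="window_dim")  # the construct-then-reduce pattern timed below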
diff --git a/asv_bench/benchmarks/rolling.py b/asv_bench/benchmarks/rolling.py
new file mode 100644
index 00000000000..f0e18bf2153
--- /dev/null
+++ b/asv_bench/benchmarks/rolling.py
@@ -0,0 +1,110 @@
+import numpy as np
+import pandas as pd
+
+import xarray as xr
+
+from . import parameterized, randn, requires_dask
+
+nx = 300
+long_nx = 30000
+ny = 200
+nt = 100
+window = 20
+
+randn_xy = randn((nx, ny), frac_nan=0.1)
+randn_xt = randn((nx, nt))
+randn_t = randn((nt,))
+randn_long = randn((long_nx,), frac_nan=0.1)
+
+
+class Rolling:
+    def setup(self, *args, **kwargs):
+        self.ds = xr.Dataset(
+            {
+                "var1": (("x", "y"), randn_xy),
+                "var2": (("x", "t"), randn_xt),
+                "var3": (("t",), randn_t),
+            },
+            coords={
+                "x": np.arange(nx),
+                "y": np.linspace(0, 1, ny),
+                "t": pd.date_range("1970-01-01", periods=nt, freq="D"),
+                "x_coords": ("x", np.linspace(1.1, 2.1, nx)),
+            },
+        )
+        self.da_long = xr.DataArray(
+            randn_long, dims="x", coords={"x": np.arange(long_nx) * 0.1}
+        )
+
+    @parameterized(["func", "center"], (["mean", "count"], [True, False]))
+    def time_rolling(self, func, center):
+        getattr(self.ds.rolling(x=window, center=center), func)().load()
+
+    @parameterized(["func", "pandas"], (["mean", "count"], [True, False]))
+    def time_rolling_long(self, func, pandas):
+        if pandas:
+            se = self.da_long.to_series()
+            getattr(se.rolling(window=window, min_periods=window), func)()
+        else:
+            getattr(self.da_long.rolling(x=window, min_periods=window), func)().load()
+
+    @parameterized(["window_", "min_periods"], ([20, 40], [5, 5]))
+    def time_rolling_np(self, window_, min_periods):
+        self.ds.rolling(x=window_, center=False, min_periods=min_periods).reduce(
+            getattr(np, "nansum")
+        ).load()
+
+    @parameterized(["center", "stride"], ([True, False], [1, 1]))
+    def time_rolling_construct(self, center, stride):
+        self.ds.rolling(x=window, center=center).construct(
+            "window_dim", stride=stride
+        ).sum(dim="window_dim").load()
+
+
+class RollingDask(Rolling):
+    def setup(self, *args, **kwargs):
+        requires_dask()
+        super().setup(**kwargs)
+        self.ds = self.ds.chunk({"x": 100, "y": 50, "t": 50})
+        self.da_long = self.da_long.chunk({"x": 10000})
+
+
+class RollingMemory:
+    def setup(self, *args, **kwargs):
+        self.ds = xr.Dataset(
+            {
+                "var1": (("x", "y"), randn_xy),
+                "var2": (("x", "t"), randn_xt),
+                "var3": (("t",), randn_t),
+            },
+            coords={
+                "x": np.arange(nx),
+                "y": np.linspace(0, 1, ny),
+                "t": pd.date_range("1970-01-01", periods=nt, freq="D"),
+                "x_coords": ("x", np.linspace(1.1, 2.1, nx)),
+            },
+        )
+
+
+class DataArrayRollingMemory(RollingMemory):
+    @parameterized("func", ["sum", "max", "mean"])
+    def peakmem_ndrolling_reduce(self, func):
+        roll = self.ds.var1.rolling(x=10, y=4)
+        getattr(roll, func)()
+
+    @parameterized("func", ["sum", "max", "mean"])
+    def peakmem_1drolling_reduce(self, func):
+        roll = self.ds.var3.rolling(t=100)
+        getattr(roll, func)()
+
+
+class DatasetRollingMemory(RollingMemory):
+    @parameterized("func", ["sum", "max", "mean"])
+    def peakmem_ndrolling_reduce(self, func):
+        roll = self.ds.rolling(x=10, y=4)
+        getattr(roll, func)()
+
+    @parameterized("func", ["sum", "max", "mean"])
+    def peakmem_1drolling_reduce(self, func):
+        roll = self.ds.rolling(t=100)
+        getattr(roll, func)()
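The benchmarks below round-trip a stacked MultiIndex dimension: `stack(flat_dim=[...])` stacks all dimensions into one, and dropping the last element (`da_missing`) forces `unstack` down the slower path that must fill in missing index combinations. Roughly (an illustration, not part of this diff):

    import numpy as np
    import xarray as xr

    da = xr.DataArray(np.arange(6).reshape(2, 3), dims=("a", "b"))
    flat = da.stack(flat_dim=[...])  # MultiIndex dimension of length 6
    flat.unstack("flat_dim")         # restores the original (2, 3) shape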
diff --git a/asv_bench/benchmarks/unstacking.py b/asv_bench/benchmarks/unstacking.py
new file mode 100644
index 00000000000..2c5b7ca7821
--- /dev/null
+++ b/asv_bench/benchmarks/unstacking.py
@@ -0,0 +1,29 @@
+import numpy as np
+
+import xarray as xr
+
+from . import requires_dask
+
+
+class Unstacking:
+    def setup(self):
+        data = np.random.RandomState(0).randn(250, 500)
+        self.da_full = xr.DataArray(data, dims=list("ab")).stack(flat_dim=[...])
+        self.da_missing = self.da_full[:-1]
+        self.df_missing = self.da_missing.to_pandas()
+
+    def time_unstack_fast(self):
+        self.da_full.unstack("flat_dim")
+
+    def time_unstack_slow(self):
+        self.da_missing.unstack("flat_dim")
+
+    def time_unstack_pandas_slow(self):
+        self.df_missing.unstack()
+
+
+class UnstackingDask(Unstacking):
+    def setup(self, *args, **kwargs):
+        requires_dask()
+        super().setup(**kwargs)
+        self.da_full = self.da_full.chunk({"flat_dim": 25})
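Assuming these files sit under `asv_bench/benchmarks/` in an already-configured asv project, individual suites can be exercised with asv's benchmark filter, e.g. `asv run --bench Unstacking`.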