From 3cc3ab1f5504dcab07a0757343f9af1ff4a05ebb Mon Sep 17 00:00:00 2001 From: Deepak Cherian Date: Thu, 19 Aug 2021 16:18:56 -0600 Subject: [PATCH] Refactor more groupby and resample tests (#5707) --- xarray/tests/test_dataarray.py | 739 ----------------------- xarray/tests/test_dataset.py | 242 -------- xarray/tests/test_groupby.py | 1020 +++++++++++++++++++++++++++++++- 3 files changed, 1019 insertions(+), 982 deletions(-) diff --git a/xarray/tests/test_dataarray.py b/xarray/tests/test_dataarray.py index 8ab8bc872da..c3223432b38 100644 --- a/xarray/tests/test_dataarray.py +++ b/xarray/tests/test_dataarray.py @@ -8,7 +8,6 @@ import pandas as pd import pytest from pandas.core.computation.ops import UndefinedVariableError -from pandas.tseries.frequencies import to_offset import xarray as xr from xarray import ( @@ -1263,25 +1262,6 @@ def test_selection_multiindex_from_level(self): expected = data.isel(xy=[0, 1]).unstack("xy").squeeze("y").drop_vars("y") assert_equal(actual, expected) - def test_stack_groupby_unsorted_coord(self): - data = [[0, 1], [2, 3]] - data_flat = [0, 1, 2, 3] - dims = ["x", "y"] - y_vals = [2, 3] - - arr = xr.DataArray(data, dims=dims, coords={"y": y_vals}) - actual1 = arr.stack(z=dims).groupby("z").first() - midx1 = pd.MultiIndex.from_product([[0, 1], [2, 3]], names=dims) - expected1 = xr.DataArray(data_flat, dims=["z"], coords={"z": midx1}) - xr.testing.assert_equal(actual1, expected1) - - # GH: 3287. Note that y coord values are not in sorted order. - arr = xr.DataArray(data, dims=dims, coords={"y": y_vals[::-1]}) - actual2 = arr.stack(z=dims).groupby("z").first() - midx2 = pd.MultiIndex.from_product([[0, 1], [3, 2]], names=dims) - expected2 = xr.DataArray(data_flat, dims=["z"], coords={"z": midx2}) - xr.testing.assert_equal(actual2, expected2) - def test_virtual_default_coords(self): array = DataArray(np.zeros((5,)), dims="x") expected = DataArray(range(5), dims="x", name="x") @@ -1446,12 +1426,6 @@ def test_assign_coords(self): expected = DataArray(10, {"c": 42}) assert_identical(actual, expected) - array = DataArray([1, 2, 3, 4], {"c": ("x", [0, 0, 1, 1])}, dims="x") - actual = array.groupby("c").assign_coords(d=lambda a: a.mean()) - expected = array.copy() - expected.coords["d"] = ("x", [1.5, 1.5, 3.5, 3.5]) - assert_identical(actual, expected) - with pytest.raises(ValueError, match=r"conflicting MultiIndex"): self.mda.assign_coords(level_1=range(4)) @@ -2614,719 +2588,6 @@ def test_fillna(self): with pytest.raises(ValueError, match=r"broadcast"): a.fillna([1, 2]) - fill_value = DataArray([0, 1], dims="y") - actual = a.fillna(fill_value) - expected = DataArray( - [[0, 1], [1, 1], [0, 1], [3, 3]], coords={"x": range(4)}, dims=("x", "y") - ) - assert_identical(expected, actual) - - expected = b.copy() - for target in [a, expected]: - target.coords["b"] = ("x", [0, 0, 1, 1]) - actual = a.groupby("b").fillna(DataArray([0, 2], dims="b")) - assert_identical(expected, actual) - - def test_groupby_iter(self): - for ((act_x, act_dv), (exp_x, exp_ds)) in zip( - self.dv.groupby("y"), self.ds.groupby("y") - ): - assert exp_x == act_x - assert_identical(exp_ds["foo"], act_dv) - for ((_, exp_dv), act_dv) in zip(self.dv.groupby("x"), self.dv): - assert_identical(exp_dv, act_dv) - - def make_groupby_example_array(self): - da = self.dv.copy() - da.coords["abc"] = ("y", np.array(["a"] * 9 + ["c"] + ["b"] * 10)) - da.coords["y"] = 20 + 100 * da["y"] - return da - - def test_groupby_properties(self): - grouped = self.make_groupby_example_array().groupby("abc") - expected_groups = 
{"a": range(0, 9), "c": [9], "b": range(10, 20)} - assert expected_groups.keys() == grouped.groups.keys() - for key in expected_groups: - assert_array_equal(expected_groups[key], grouped.groups[key]) - assert 3 == len(grouped) - - def test_groupby_map_identity(self): - expected = self.make_groupby_example_array() - idx = expected.coords["y"] - - def identity(x): - return x - - for g in ["x", "y", "abc", idx]: - for shortcut in [False, True]: - for squeeze in [False, True]: - grouped = expected.groupby(g, squeeze=squeeze) - actual = grouped.map(identity, shortcut=shortcut) - assert_identical(expected, actual) - - def test_groupby_sum(self): - array = self.make_groupby_example_array() - grouped = array.groupby("abc") - - expected_sum_all = Dataset( - { - "foo": Variable( - ["abc"], - np.array( - [ - self.x[:, :9].sum(), - self.x[:, 10:].sum(), - self.x[:, 9:10].sum(), - ] - ).T, - ), - "abc": Variable(["abc"], np.array(["a", "b", "c"])), - } - )["foo"] - assert_allclose(expected_sum_all, grouped.reduce(np.sum, dim=...)) - assert_allclose(expected_sum_all, grouped.sum(...)) - - expected = DataArray( - [ - array["y"].values[idx].sum() - for idx in [slice(9), slice(10, None), slice(9, 10)] - ], - [["a", "b", "c"]], - ["abc"], - ) - actual = array["y"].groupby("abc").map(np.sum) - assert_allclose(expected, actual) - actual = array["y"].groupby("abc").sum(...) - assert_allclose(expected, actual) - - expected_sum_axis1 = Dataset( - { - "foo": ( - ["x", "abc"], - np.array( - [ - self.x[:, :9].sum(1), - self.x[:, 10:].sum(1), - self.x[:, 9:10].sum(1), - ] - ).T, - ), - "abc": Variable(["abc"], np.array(["a", "b", "c"])), - } - )["foo"] - assert_allclose(expected_sum_axis1, grouped.reduce(np.sum, "y")) - assert_allclose(expected_sum_axis1, grouped.sum("y")) - - def test_groupby_sum_default(self): - array = self.make_groupby_example_array() - grouped = array.groupby("abc") - - expected_sum_all = Dataset( - { - "foo": Variable( - ["x", "abc"], - np.array( - [ - self.x[:, :9].sum(axis=-1), - self.x[:, 10:].sum(axis=-1), - self.x[:, 9:10].sum(axis=-1), - ] - ).T, - ), - "abc": Variable(["abc"], np.array(["a", "b", "c"])), - } - )["foo"] - - assert_allclose(expected_sum_all, grouped.sum(dim="y")) - - def test_groupby_count(self): - array = DataArray( - [0, 0, np.nan, np.nan, 0, 0], - coords={"cat": ("x", ["a", "b", "b", "c", "c", "c"])}, - dims="x", - ) - actual = array.groupby("cat").count() - expected = DataArray([1, 1, 2], coords=[("cat", ["a", "b", "c"])]) - assert_identical(actual, expected) - - @pytest.mark.skip("needs to be fixed for shortcut=False, keep_attrs=False") - def test_groupby_reduce_attrs(self): - array = self.make_groupby_example_array() - array.attrs["foo"] = "bar" - - for shortcut in [True, False]: - for keep_attrs in [True, False]: - print(f"shortcut={shortcut}, keep_attrs={keep_attrs}") - actual = array.groupby("abc").reduce( - np.mean, keep_attrs=keep_attrs, shortcut=shortcut - ) - expected = array.groupby("abc").mean() - if keep_attrs: - expected.attrs["foo"] = "bar" - assert_identical(expected, actual) - - def test_groupby_map_center(self): - def center(x): - return x - np.mean(x) - - array = self.make_groupby_example_array() - grouped = array.groupby("abc") - - expected_ds = array.to_dataset() - exp_data = np.hstack( - [center(self.x[:, :9]), center(self.x[:, 9:10]), center(self.x[:, 10:])] - ) - expected_ds["foo"] = (["x", "y"], exp_data) - expected_centered = expected_ds["foo"] - assert_allclose(expected_centered, grouped.map(center)) - - def test_groupby_map_ndarray(self): - 
# regression test for #326 - array = self.make_groupby_example_array() - grouped = array.groupby("abc") - actual = grouped.map(np.asarray) - assert_equal(array, actual) - - def test_groupby_map_changes_metadata(self): - def change_metadata(x): - x.coords["x"] = x.coords["x"] * 2 - x.attrs["fruit"] = "lemon" - return x - - array = self.make_groupby_example_array() - grouped = array.groupby("abc") - actual = grouped.map(change_metadata) - expected = array.copy() - expected = change_metadata(expected) - assert_equal(expected, actual) - - def test_groupby_math(self): - array = self.make_groupby_example_array() - for squeeze in [True, False]: - grouped = array.groupby("x", squeeze=squeeze) - - expected = array + array.coords["x"] - actual = grouped + array.coords["x"] - assert_identical(expected, actual) - - actual = array.coords["x"] + grouped - assert_identical(expected, actual) - - ds = array.coords["x"].to_dataset(name="X") - expected = array + ds - actual = grouped + ds - assert_identical(expected, actual) - - actual = ds + grouped - assert_identical(expected, actual) - - grouped = array.groupby("abc") - expected_agg = (grouped.mean(...) - np.arange(3)).rename(None) - actual = grouped - DataArray(range(3), [("abc", ["a", "b", "c"])]) - actual_agg = actual.groupby("abc").mean(...) - assert_allclose(expected_agg, actual_agg) - - with pytest.raises(TypeError, match=r"only support binary ops"): - grouped + 1 - with pytest.raises(TypeError, match=r"only support binary ops"): - grouped + grouped - with pytest.raises(TypeError, match=r"in-place operations"): - array += grouped - - def test_groupby_math_not_aligned(self): - array = DataArray( - range(4), {"b": ("x", [0, 0, 1, 1]), "x": [0, 1, 2, 3]}, dims="x" - ) - other = DataArray([10], coords={"b": [0]}, dims="b") - actual = array.groupby("b") + other - expected = DataArray([10, 11, np.nan, np.nan], array.coords) - assert_identical(expected, actual) - - other = DataArray([10], coords={"c": 123, "b": [0]}, dims="b") - actual = array.groupby("b") + other - expected.coords["c"] = (["x"], [123] * 2 + [np.nan] * 2) - assert_identical(expected, actual) - - other = Dataset({"a": ("b", [10])}, {"b": [0]}) - actual = array.groupby("b") + other - expected = Dataset({"a": ("x", [10, 11, np.nan, np.nan])}, array.coords) - assert_identical(expected, actual) - - def test_groupby_restore_dim_order(self): - array = DataArray( - np.random.randn(5, 3), - coords={"a": ("x", range(5)), "b": ("y", range(3))}, - dims=["x", "y"], - ) - for by, expected_dims in [ - ("x", ("x", "y")), - ("y", ("x", "y")), - ("a", ("a", "y")), - ("b", ("x", "b")), - ]: - result = array.groupby(by).map(lambda x: x.squeeze()) - assert result.dims == expected_dims - - def test_groupby_restore_coord_dims(self): - array = DataArray( - np.random.randn(5, 3), - coords={ - "a": ("x", range(5)), - "b": ("y", range(3)), - "c": (("x", "y"), np.random.randn(5, 3)), - }, - dims=["x", "y"], - ) - - for by, expected_dims in [ - ("x", ("x", "y")), - ("y", ("x", "y")), - ("a", ("a", "y")), - ("b", ("x", "b")), - ]: - result = array.groupby(by, restore_coord_dims=True).map( - lambda x: x.squeeze() - )["c"] - assert result.dims == expected_dims - - def test_groupby_first_and_last(self): - array = DataArray([1, 2, 3, 4, 5], dims="x") - by = DataArray(["a"] * 2 + ["b"] * 3, dims="x", name="ab") - - expected = DataArray([1, 3], [("ab", ["a", "b"])]) - actual = array.groupby(by).first() - assert_identical(expected, actual) - - expected = DataArray([2, 5], [("ab", ["a", "b"])]) - actual = 
array.groupby(by).last() - assert_identical(expected, actual) - - array = DataArray(np.random.randn(5, 3), dims=["x", "y"]) - expected = DataArray(array[[0, 2]], {"ab": ["a", "b"]}, ["ab", "y"]) - actual = array.groupby(by).first() - assert_identical(expected, actual) - - actual = array.groupby("x").first() - expected = array # should be a no-op - assert_identical(expected, actual) - - def make_groupby_multidim_example_array(self): - return DataArray( - [[[0, 1], [2, 3]], [[5, 10], [15, 20]]], - coords={ - "lon": (["ny", "nx"], [[30, 40], [40, 50]]), - "lat": (["ny", "nx"], [[10, 10], [20, 20]]), - }, - dims=["time", "ny", "nx"], - ) - - def test_groupby_multidim(self): - array = self.make_groupby_multidim_example_array() - for dim, expected_sum in [ - ("lon", DataArray([5, 28, 23], coords=[("lon", [30.0, 40.0, 50.0])])), - ("lat", DataArray([16, 40], coords=[("lat", [10.0, 20.0])])), - ]: - actual_sum = array.groupby(dim).sum(...) - assert_identical(expected_sum, actual_sum) - - def test_groupby_multidim_map(self): - array = self.make_groupby_multidim_example_array() - actual = array.groupby("lon").map(lambda x: x - x.mean()) - expected = DataArray( - [[[-2.5, -6.0], [-5.0, -8.5]], [[2.5, 3.0], [8.0, 8.5]]], - coords=array.coords, - dims=array.dims, - ) - assert_identical(expected, actual) - - def test_groupby_bins(self): - array = DataArray(np.arange(4), dims="dim_0") - # the first value should not be part of any group ("right" binning) - array[0] = 99 - # bins follow conventions for pandas.cut - # http://pandas.pydata.org/pandas-docs/stable/generated/pandas.cut.html - bins = [0, 1.5, 5] - bin_coords = pd.cut(array["dim_0"], bins).categories - expected = DataArray( - [1, 5], dims="dim_0_bins", coords={"dim_0_bins": bin_coords} - ) - # the problem with this is that it overwrites the dimensions of array! 
- # actual = array.groupby('dim_0', bins=bins).sum() - actual = array.groupby_bins("dim_0", bins).map(lambda x: x.sum()) - assert_identical(expected, actual) - # make sure original array dims are unchanged - assert len(array.dim_0) == 4 - - def test_groupby_bins_empty(self): - array = DataArray(np.arange(4), [("x", range(4))]) - # one of these bins will be empty - bins = [0, 4, 5] - bin_coords = pd.cut(array["x"], bins).categories - actual = array.groupby_bins("x", bins).sum() - expected = DataArray([6, np.nan], dims="x_bins", coords={"x_bins": bin_coords}) - assert_identical(expected, actual) - # make sure original array is unchanged - # (was a problem in earlier versions) - assert len(array.x) == 4 - - def test_groupby_bins_multidim(self): - array = self.make_groupby_multidim_example_array() - bins = [0, 15, 20] - bin_coords = pd.cut(array["lat"].values.flat, bins).categories - expected = DataArray([16, 40], dims="lat_bins", coords={"lat_bins": bin_coords}) - actual = array.groupby_bins("lat", bins).map(lambda x: x.sum()) - assert_identical(expected, actual) - # modify the array coordinates to be non-monotonic after unstacking - array["lat"].data = np.array([[10.0, 20.0], [20.0, 10.0]]) - expected = DataArray([28, 28], dims="lat_bins", coords={"lat_bins": bin_coords}) - actual = array.groupby_bins("lat", bins).map(lambda x: x.sum()) - assert_identical(expected, actual) - - def test_groupby_bins_sort(self): - data = xr.DataArray( - np.arange(100), dims="x", coords={"x": np.linspace(-100, 100, num=100)} - ) - binned_mean = data.groupby_bins("x", bins=11).mean() - assert binned_mean.to_index().is_monotonic - - def test_resample(self): - times = pd.date_range("2000-01-01", freq="6H", periods=10) - array = DataArray(np.arange(10), [("time", times)]) - - actual = array.resample(time="24H").mean() - expected = DataArray(array.to_series().resample("24H").mean()) - assert_identical(expected, actual) - - actual = array.resample(time="24H").reduce(np.mean) - assert_identical(expected, actual) - - # Our use of `loffset` may change if we align our API with pandas' changes. 
- # ref https://github.com/pydata/xarray/pull/4537 - actual = array.resample(time="24H", loffset="-12H").mean() - expected_ = array.to_series().resample("24H").mean() - expected_.index += to_offset("-12H") - expected = DataArray.from_series(expected_) - assert_identical(actual, expected) - - with pytest.raises(ValueError, match=r"index must be monotonic"): - array[[2, 0, 1]].resample(time="1D") - - def test_da_resample_func_args(self): - def func(arg1, arg2, arg3=0.0): - return arg1.mean("time") + arg2 + arg3 - - times = pd.date_range("2000", periods=3, freq="D") - da = xr.DataArray([1.0, 1.0, 1.0], coords=[times], dims=["time"]) - expected = xr.DataArray([3.0, 3.0, 3.0], coords=[times], dims=["time"]) - actual = da.resample(time="D").map(func, args=(1.0,), arg3=1.0) - assert_identical(actual, expected) - - def test_resample_first(self): - times = pd.date_range("2000-01-01", freq="6H", periods=10) - array = DataArray(np.arange(10), [("time", times)]) - - actual = array.resample(time="1D").first() - expected = DataArray([0, 4, 8], [("time", times[::4])]) - assert_identical(expected, actual) - - # verify that labels don't use the first value - actual = array.resample(time="24H").first() - expected = DataArray(array.to_series().resample("24H").first()) - assert_identical(expected, actual) - - # missing values - array = array.astype(float) - array[:2] = np.nan - actual = array.resample(time="1D").first() - expected = DataArray([2, 4, 8], [("time", times[::4])]) - assert_identical(expected, actual) - - actual = array.resample(time="1D").first(skipna=False) - expected = DataArray([np.nan, 4, 8], [("time", times[::4])]) - assert_identical(expected, actual) - - # regression test for http://stackoverflow.com/questions/33158558/ - array = Dataset({"time": times})["time"] - actual = array.resample(time="1D").last() - expected_times = pd.to_datetime( - ["2000-01-01T18", "2000-01-02T18", "2000-01-03T06"] - ) - expected = DataArray(expected_times, [("time", times[::4])], name="time") - assert_identical(expected, actual) - - def test_resample_bad_resample_dim(self): - times = pd.date_range("2000-01-01", freq="6H", periods=10) - array = DataArray(np.arange(10), [("__resample_dim__", times)]) - with pytest.raises(ValueError, match=r"Proxy resampling dimension"): - array.resample(**{"__resample_dim__": "1D"}).first() - - @requires_scipy - def test_resample_drop_nondim_coords(self): - xs = np.arange(6) - ys = np.arange(3) - times = pd.date_range("2000-01-01", freq="6H", periods=5) - data = np.tile(np.arange(5), (6, 3, 1)) - xx, yy = np.meshgrid(xs * 5, ys * 2.5) - tt = np.arange(len(times), dtype=int) - array = DataArray(data, {"time": times, "x": xs, "y": ys}, ("x", "y", "time")) - xcoord = DataArray(xx.T, {"x": xs, "y": ys}, ("x", "y")) - ycoord = DataArray(yy.T, {"x": xs, "y": ys}, ("x", "y")) - tcoord = DataArray(tt, {"time": times}, ("time",)) - ds = Dataset({"data": array, "xc": xcoord, "yc": ycoord, "tc": tcoord}) - ds = ds.set_coords(["xc", "yc", "tc"]) - - # Select the data now, with the auxiliary coordinates in place - array = ds["data"] - - # Re-sample - actual = array.resample(time="12H", restore_coord_dims=True).mean("time") - assert "tc" not in actual.coords - - # Up-sample - filling - actual = array.resample(time="1H", restore_coord_dims=True).ffill() - assert "tc" not in actual.coords - - # Up-sample - interpolation - actual = array.resample(time="1H", restore_coord_dims=True).interpolate( - "linear" - ) - assert "tc" not in actual.coords - - def test_resample_keep_attrs(self): - times = 
pd.date_range("2000-01-01", freq="6H", periods=10) - array = DataArray(np.ones(10), [("time", times)]) - array.attrs["meta"] = "data" - - result = array.resample(time="1D").mean(keep_attrs=True) - expected = DataArray([1, 1, 1], [("time", times[::4])], attrs=array.attrs) - assert_identical(result, expected) - - with pytest.warns( - UserWarning, match="Passing ``keep_attrs`` to ``resample`` has no effect." - ): - array.resample(time="1D", keep_attrs=True) - - def test_resample_skipna(self): - times = pd.date_range("2000-01-01", freq="6H", periods=10) - array = DataArray(np.ones(10), [("time", times)]) - array[1] = np.nan - - result = array.resample(time="1D").mean(skipna=False) - expected = DataArray([np.nan, 1, 1], [("time", times[::4])]) - assert_identical(result, expected) - - def test_upsample(self): - times = pd.date_range("2000-01-01", freq="6H", periods=5) - array = DataArray(np.arange(5), [("time", times)]) - - # Forward-fill - actual = array.resample(time="3H").ffill() - expected = DataArray(array.to_series().resample("3H").ffill()) - assert_identical(expected, actual) - - # Backward-fill - actual = array.resample(time="3H").bfill() - expected = DataArray(array.to_series().resample("3H").bfill()) - assert_identical(expected, actual) - - # As frequency - actual = array.resample(time="3H").asfreq() - expected = DataArray(array.to_series().resample("3H").asfreq()) - assert_identical(expected, actual) - - # Pad - actual = array.resample(time="3H").pad() - expected = DataArray(array.to_series().resample("3H").pad()) - assert_identical(expected, actual) - - # Nearest - rs = array.resample(time="3H") - actual = rs.nearest() - new_times = rs._full_index - expected = DataArray(array.reindex(time=new_times, method="nearest")) - assert_identical(expected, actual) - - def test_upsample_nd(self): - # Same as before, but now we try on multi-dimensional DataArrays. 
- xs = np.arange(6) - ys = np.arange(3) - times = pd.date_range("2000-01-01", freq="6H", periods=5) - data = np.tile(np.arange(5), (6, 3, 1)) - array = DataArray(data, {"time": times, "x": xs, "y": ys}, ("x", "y", "time")) - - # Forward-fill - actual = array.resample(time="3H").ffill() - expected_data = np.repeat(data, 2, axis=-1) - expected_times = times.to_series().resample("3H").asfreq().index - expected_data = expected_data[..., : len(expected_times)] - expected = DataArray( - expected_data, - {"time": expected_times, "x": xs, "y": ys}, - ("x", "y", "time"), - ) - assert_identical(expected, actual) - - # Backward-fill - actual = array.resample(time="3H").ffill() - expected_data = np.repeat(np.flipud(data.T).T, 2, axis=-1) - expected_data = np.flipud(expected_data.T).T - expected_times = times.to_series().resample("3H").asfreq().index - expected_data = expected_data[..., : len(expected_times)] - expected = DataArray( - expected_data, - {"time": expected_times, "x": xs, "y": ys}, - ("x", "y", "time"), - ) - assert_identical(expected, actual) - - # As frequency - actual = array.resample(time="3H").asfreq() - expected_data = np.repeat(data, 2, axis=-1).astype(float)[..., :-1] - expected_data[..., 1::2] = np.nan - expected_times = times.to_series().resample("3H").asfreq().index - expected = DataArray( - expected_data, - {"time": expected_times, "x": xs, "y": ys}, - ("x", "y", "time"), - ) - assert_identical(expected, actual) - - # Pad - actual = array.resample(time="3H").pad() - expected_data = np.repeat(data, 2, axis=-1) - expected_data[..., 1::2] = expected_data[..., ::2] - expected_data = expected_data[..., :-1] - expected_times = times.to_series().resample("3H").asfreq().index - expected = DataArray( - expected_data, - {"time": expected_times, "x": xs, "y": ys}, - ("x", "y", "time"), - ) - assert_identical(expected, actual) - - def test_upsample_tolerance(self): - # Test tolerance keyword for upsample methods bfill, pad, nearest - times = pd.date_range("2000-01-01", freq="1D", periods=2) - times_upsampled = pd.date_range("2000-01-01", freq="6H", periods=5) - array = DataArray(np.arange(2), [("time", times)]) - - # Forward fill - actual = array.resample(time="6H").ffill(tolerance="12H") - expected = DataArray([0.0, 0.0, 0.0, np.nan, 1.0], [("time", times_upsampled)]) - assert_identical(expected, actual) - - # Backward fill - actual = array.resample(time="6H").bfill(tolerance="12H") - expected = DataArray([0.0, np.nan, 1.0, 1.0, 1.0], [("time", times_upsampled)]) - assert_identical(expected, actual) - - # Nearest - actual = array.resample(time="6H").nearest(tolerance="6H") - expected = DataArray([0, 0, np.nan, 1, 1], [("time", times_upsampled)]) - assert_identical(expected, actual) - - @requires_scipy - def test_upsample_interpolate(self): - from scipy.interpolate import interp1d - - xs = np.arange(6) - ys = np.arange(3) - times = pd.date_range("2000-01-01", freq="6H", periods=5) - - z = np.arange(5) ** 2 - data = np.tile(z, (6, 3, 1)) - array = DataArray(data, {"time": times, "x": xs, "y": ys}, ("x", "y", "time")) - - expected_times = times.to_series().resample("1H").asfreq().index - # Split the times into equal sub-intervals to simulate the 6 hour - # to 1 hour up-sampling - new_times_idx = np.linspace(0, len(times) - 1, len(times) * 5) - for kind in ["linear", "nearest", "zero", "slinear", "quadratic", "cubic"]: - actual = array.resample(time="1H").interpolate(kind) - f = interp1d( - np.arange(len(times)), - data, - kind=kind, - axis=-1, - bounds_error=True, - assume_sorted=True, - ) - 
expected_data = f(new_times_idx) - expected = DataArray( - expected_data, - {"time": expected_times, "x": xs, "y": ys}, - ("x", "y", "time"), - ) - # Use AllClose because there are some small differences in how - # we upsample timeseries versus the integer indexing as I've - # done here due to floating point arithmetic - assert_allclose(expected, actual, rtol=1e-16) - - @requires_scipy - def test_upsample_interpolate_bug_2197(self): - dates = pd.date_range("2007-02-01", "2007-03-01", freq="D") - da = xr.DataArray(np.arange(len(dates)), [("time", dates)]) - result = da.resample(time="M").interpolate("linear") - expected_times = np.array( - [np.datetime64("2007-02-28"), np.datetime64("2007-03-31")] - ) - expected = xr.DataArray([27.0, np.nan], [("time", expected_times)]) - assert_equal(result, expected) - - @requires_scipy - def test_upsample_interpolate_regression_1605(self): - dates = pd.date_range("2016-01-01", "2016-03-31", freq="1D") - expected = xr.DataArray( - np.random.random((len(dates), 2, 3)), - dims=("time", "x", "y"), - coords={"time": dates}, - ) - actual = expected.resample(time="1D").interpolate("linear") - assert_allclose(actual, expected, rtol=1e-16) - - @requires_dask - @requires_scipy - @pytest.mark.parametrize("chunked_time", [True, False]) - def test_upsample_interpolate_dask(self, chunked_time): - from scipy.interpolate import interp1d - - xs = np.arange(6) - ys = np.arange(3) - times = pd.date_range("2000-01-01", freq="6H", periods=5) - - z = np.arange(5) ** 2 - data = np.tile(z, (6, 3, 1)) - array = DataArray(data, {"time": times, "x": xs, "y": ys}, ("x", "y", "time")) - chunks = {"x": 2, "y": 1} - if chunked_time: - chunks["time"] = 3 - - expected_times = times.to_series().resample("1H").asfreq().index - # Split the times into equal sub-intervals to simulate the 6 hour - # to 1 hour up-sampling - new_times_idx = np.linspace(0, len(times) - 1, len(times) * 5) - for kind in ["linear", "nearest", "zero", "slinear", "quadratic", "cubic"]: - actual = array.chunk(chunks).resample(time="1H").interpolate(kind) - actual = actual.compute() - f = interp1d( - np.arange(len(times)), - data, - kind=kind, - axis=-1, - bounds_error=True, - assume_sorted=True, - ) - expected_data = f(new_times_idx) - expected = DataArray( - expected_data, - {"time": expected_times, "x": xs, "y": ys}, - ("x", "y", "time"), - ) - # Use AllClose because there are some small differences in how - # we upsample timeseries versus the integer indexing as I've - # done here due to floating point arithmetic - assert_allclose(expected, actual, rtol=1e-16) - def test_align(self): array = DataArray( np.random.random((6, 8)), coords={"x": list("abcdef")}, dims=["x", "y"] diff --git a/xarray/tests/test_dataset.py b/xarray/tests/test_dataset.py index 1256a44ad81..7b17eae89c8 100644 --- a/xarray/tests/test_dataset.py +++ b/xarray/tests/test_dataset.py @@ -10,7 +10,6 @@ import pytest from pandas.core.computation.ops import UndefinedVariableError from pandas.core.indexes.datetimes import DatetimeIndex -from pandas.tseries.frequencies import to_offset import xarray as xr from xarray import ( @@ -3614,19 +3613,6 @@ def test_assign(self): expected = Dataset({"y": ("x", [0, 1, 4])}, {"z": 2, "x": [0, 1, 2]}) assert_identical(actual, expected) - ds = Dataset({"a": ("x", range(3))}, {"b": ("x", ["A"] * 2 + ["B"])}) - actual = ds.groupby("b").assign(c=lambda ds: 2 * ds.a) - expected = ds.merge({"c": ("x", [0, 2, 4])}) - assert_identical(actual, expected) - - actual = ds.groupby("b").assign(c=lambda ds: ds.a.sum()) - expected 
= ds.merge({"c": ("x", [1, 1, 2])}) - assert_identical(actual, expected) - - actual = ds.groupby("b").assign_coords(c=lambda ds: ds.a.sum()) - expected = expected.set_coords("c") - assert_identical(actual, expected) - def test_assign_coords(self): ds = Dataset() @@ -3758,200 +3744,6 @@ def test_squeeze_drop(self): selected = data.squeeze(drop=True) assert_identical(data, selected) - def test_resample_and_first(self): - times = pd.date_range("2000-01-01", freq="6H", periods=10) - ds = Dataset( - { - "foo": (["time", "x", "y"], np.random.randn(10, 5, 3)), - "bar": ("time", np.random.randn(10), {"meta": "data"}), - "time": times, - } - ) - - actual = ds.resample(time="1D").first(keep_attrs=True) - expected = ds.isel(time=[0, 4, 8]) - assert_identical(expected, actual) - - # upsampling - expected_time = pd.date_range("2000-01-01", freq="3H", periods=19) - expected = ds.reindex(time=expected_time) - actual = ds.resample(time="3H") - for how in ["mean", "sum", "first", "last"]: - method = getattr(actual, how) - result = method() - assert_equal(expected, result) - for method in [np.mean]: - result = actual.reduce(method) - assert_equal(expected, result) - - def test_resample_min_count(self): - times = pd.date_range("2000-01-01", freq="6H", periods=10) - ds = Dataset( - { - "foo": (["time", "x", "y"], np.random.randn(10, 5, 3)), - "bar": ("time", np.random.randn(10), {"meta": "data"}), - "time": times, - } - ) - # inject nan - ds["foo"] = xr.where(ds["foo"] > 2.0, np.nan, ds["foo"]) - - actual = ds.resample(time="1D").sum(min_count=1) - expected = xr.concat( - [ - ds.isel(time=slice(i * 4, (i + 1) * 4)).sum("time", min_count=1) - for i in range(3) - ], - dim=actual["time"], - ) - assert_equal(expected, actual) - - def test_resample_by_mean_with_keep_attrs(self): - times = pd.date_range("2000-01-01", freq="6H", periods=10) - ds = Dataset( - { - "foo": (["time", "x", "y"], np.random.randn(10, 5, 3)), - "bar": ("time", np.random.randn(10), {"meta": "data"}), - "time": times, - } - ) - ds.attrs["dsmeta"] = "dsdata" - - resampled_ds = ds.resample(time="1D").mean(keep_attrs=True) - actual = resampled_ds["bar"].attrs - expected = ds["bar"].attrs - assert expected == actual - - actual = resampled_ds.attrs - expected = ds.attrs - assert expected == actual - - with pytest.warns( - UserWarning, match="Passing ``keep_attrs`` to ``resample`` has no effect." - ): - ds.resample(time="1D", keep_attrs=True) - - def test_resample_loffset(self): - times = pd.date_range("2000-01-01", freq="6H", periods=10) - ds = Dataset( - { - "foo": (["time", "x", "y"], np.random.randn(10, 5, 3)), - "bar": ("time", np.random.randn(10), {"meta": "data"}), - "time": times, - } - ) - ds.attrs["dsmeta"] = "dsdata" - - # Our use of `loffset` may change if we align our API with pandas' changes. 
- # ref https://github.com/pydata/xarray/pull/4537 - actual = ds.resample(time="24H", loffset="-12H").mean().bar - expected_ = ds.bar.to_series().resample("24H").mean() - expected_.index += to_offset("-12H") - expected = DataArray.from_series(expected_) - assert_allclose(actual, expected) - - def test_resample_by_mean_discarding_attrs(self): - times = pd.date_range("2000-01-01", freq="6H", periods=10) - ds = Dataset( - { - "foo": (["time", "x", "y"], np.random.randn(10, 5, 3)), - "bar": ("time", np.random.randn(10), {"meta": "data"}), - "time": times, - } - ) - ds.attrs["dsmeta"] = "dsdata" - - resampled_ds = ds.resample(time="1D").mean(keep_attrs=False) - - assert resampled_ds["bar"].attrs == {} - assert resampled_ds.attrs == {} - - def test_resample_by_last_discarding_attrs(self): - times = pd.date_range("2000-01-01", freq="6H", periods=10) - ds = Dataset( - { - "foo": (["time", "x", "y"], np.random.randn(10, 5, 3)), - "bar": ("time", np.random.randn(10), {"meta": "data"}), - "time": times, - } - ) - ds.attrs["dsmeta"] = "dsdata" - - resampled_ds = ds.resample(time="1D").last(keep_attrs=False) - - assert resampled_ds["bar"].attrs == {} - assert resampled_ds.attrs == {} - - @requires_scipy - def test_resample_drop_nondim_coords(self): - xs = np.arange(6) - ys = np.arange(3) - times = pd.date_range("2000-01-01", freq="6H", periods=5) - data = np.tile(np.arange(5), (6, 3, 1)) - xx, yy = np.meshgrid(xs * 5, ys * 2.5) - tt = np.arange(len(times), dtype=int) - array = DataArray(data, {"time": times, "x": xs, "y": ys}, ("x", "y", "time")) - xcoord = DataArray(xx.T, {"x": xs, "y": ys}, ("x", "y")) - ycoord = DataArray(yy.T, {"x": xs, "y": ys}, ("x", "y")) - tcoord = DataArray(tt, {"time": times}, ("time",)) - ds = Dataset({"data": array, "xc": xcoord, "yc": ycoord, "tc": tcoord}) - ds = ds.set_coords(["xc", "yc", "tc"]) - - # Re-sample - actual = ds.resample(time="12H").mean("time") - assert "tc" not in actual.coords - - # Up-sample - filling - actual = ds.resample(time="1H").ffill() - assert "tc" not in actual.coords - - # Up-sample - interpolation - actual = ds.resample(time="1H").interpolate("linear") - assert "tc" not in actual.coords - - def test_resample_old_api(self): - - times = pd.date_range("2000-01-01", freq="6H", periods=10) - ds = Dataset( - { - "foo": (["time", "x", "y"], np.random.randn(10, 5, 3)), - "bar": ("time", np.random.randn(10), {"meta": "data"}), - "time": times, - } - ) - - with pytest.raises(TypeError, match=r"resample\(\) no longer supports"): - ds.resample("1D", "time") - - with pytest.raises(TypeError, match=r"resample\(\) no longer supports"): - ds.resample("1D", dim="time", how="mean") - - with pytest.raises(TypeError, match=r"resample\(\) no longer supports"): - ds.resample("1D", dim="time") - - def test_resample_ds_da_are_the_same(self): - time = pd.date_range("2000-01-01", freq="6H", periods=365 * 4) - ds = xr.Dataset( - { - "foo": (("time", "x"), np.random.randn(365 * 4, 5)), - "time": time, - "x": np.arange(5), - } - ) - assert_identical( - ds.resample(time="M").mean()["foo"], ds.foo.resample(time="M").mean() - ) - - def test_ds_resample_apply_func_args(self): - def func(arg1, arg2, arg3=0.0): - return arg1.mean("time") + arg2 + arg3 - - times = pd.date_range("2000", freq="D", periods=3) - ds = xr.Dataset({"foo": ("time", [1.0, 1.0, 1.0]), "time": times}) - expected = xr.Dataset({"foo": ("time", [3.0, 3.0, 3.0]), "time": times}) - actual = ds.resample(time="D").map(func, args=(1.0,), arg3=1.0) - assert_identical(expected, actual) - def test_to_array(self): ds 
= Dataset( {"a": 1, "b": ("x", [1, 2, 3])}, @@ -4451,24 +4243,6 @@ def test_fillna(self): expected = ds.assign_coords(c=42) assert_identical(expected, result) - # groupby - expected = Dataset({"a": ("x", range(4))}, {"x": [0, 1, 2, 3]}) - for target in [ds, expected]: - target.coords["b"] = ("x", [0, 0, 1, 1]) - actual = ds.groupby("b").fillna(DataArray([0, 2], dims="b")) - assert_identical(expected, actual) - - actual = ds.groupby("b").fillna(Dataset({"a": ("b", [0, 2])})) - assert_identical(expected, actual) - - # attrs with groupby - ds.attrs["attr"] = "ds" - ds.a.attrs["attr"] = "da" - actual = ds.groupby("b").fillna(Dataset({"a": ("b", [0, 2])})) - assert actual.attrs == ds.attrs - assert actual.a.name == "a" - assert actual.a.attrs == ds.a.attrs - da = DataArray(range(5), name="a", attrs={"attr": "da"}) actual = da.fillna(1) assert actual.name == "a" @@ -4528,22 +4302,6 @@ def test_where(self): actual = ds.where(ds > 0) assert_identical(expected, actual) - # groupby - ds = Dataset({"a": ("x", range(5))}, {"c": ("x", [0, 0, 1, 1, 1])}) - cond = Dataset({"a": ("c", [True, False])}) - expected = ds.copy(deep=True) - expected["a"].values = [0, 1] + [np.nan] * 3 - actual = ds.groupby("c").where(cond) - assert_identical(expected, actual) - - # attrs with groupby - ds.attrs["attr"] = "ds" - ds.a.attrs["attr"] = "da" - actual = ds.groupby("c").where(cond) - assert actual.attrs == ds.attrs - assert actual.a.name == "a" - assert actual.a.attrs == ds.a.attrs - # attrs da = DataArray(range(5), name="a", attrs={"attr": "da"}) actual = da.where(da.values > 1) diff --git a/xarray/tests/test_groupby.py b/xarray/tests/test_groupby.py index b2510141d78..ee77865dd24 100644 --- a/xarray/tests/test_groupby.py +++ b/xarray/tests/test_groupby.py @@ -1,12 +1,21 @@ import numpy as np import pandas as pd import pytest +from pandas.tseries.frequencies import to_offset import xarray as xr from xarray import DataArray, Dataset, Variable from xarray.core.groupby import _consolidate_slices -from . import assert_allclose, assert_equal, assert_identical, create_test_data +from . 
import ( + assert_allclose, + assert_array_equal, + assert_equal, + assert_identical, + create_test_data, + requires_dask, + requires_scipy, +) @pytest.fixture @@ -741,4 +750,1013 @@ def test_groupby_dataset_order(): # .assertEqual(all_vars, all_vars_ref) +def test_groupby_dataset_fillna(): + + ds = Dataset({"a": ("x", [np.nan, 1, np.nan, 3])}, {"x": [0, 1, 2, 3]}) + expected = Dataset({"a": ("x", range(4))}, {"x": [0, 1, 2, 3]}) + for target in [ds, expected]: + target.coords["b"] = ("x", [0, 0, 1, 1]) + actual = ds.groupby("b").fillna(DataArray([0, 2], dims="b")) + assert_identical(expected, actual) + + actual = ds.groupby("b").fillna(Dataset({"a": ("b", [0, 2])})) + assert_identical(expected, actual) + + # attrs with groupby + ds.attrs["attr"] = "ds" + ds.a.attrs["attr"] = "da" + actual = ds.groupby("b").fillna(Dataset({"a": ("b", [0, 2])})) + assert actual.attrs == ds.attrs + assert actual.a.name == "a" + assert actual.a.attrs == ds.a.attrs + + +def test_groupby_dataset_where(): + # groupby + ds = Dataset({"a": ("x", range(5))}, {"c": ("x", [0, 0, 1, 1, 1])}) + cond = Dataset({"a": ("c", [True, False])}) + expected = ds.copy(deep=True) + expected["a"].values = [0, 1] + [np.nan] * 3 + actual = ds.groupby("c").where(cond) + assert_identical(expected, actual) + + # attrs with groupby + ds.attrs["attr"] = "ds" + ds.a.attrs["attr"] = "da" + actual = ds.groupby("c").where(cond) + assert actual.attrs == ds.attrs + assert actual.a.name == "a" + assert actual.a.attrs == ds.a.attrs + + +def test_groupby_dataset_assign(): + ds = Dataset({"a": ("x", range(3))}, {"b": ("x", ["A"] * 2 + ["B"])}) + actual = ds.groupby("b").assign(c=lambda ds: 2 * ds.a) + expected = ds.merge({"c": ("x", [0, 2, 4])}) + assert_identical(actual, expected) + + actual = ds.groupby("b").assign(c=lambda ds: ds.a.sum()) + expected = ds.merge({"c": ("x", [1, 1, 2])}) + assert_identical(actual, expected) + + actual = ds.groupby("b").assign_coords(c=lambda ds: ds.a.sum()) + expected = expected.set_coords("c") + assert_identical(actual, expected) + + +class TestDataArrayGroupBy: + @pytest.fixture(autouse=True) + def setup(self): + self.attrs = {"attr1": "value1", "attr2": 2929} + self.x = np.random.random((10, 20)) + self.v = Variable(["x", "y"], self.x) + self.va = Variable(["x", "y"], self.x, self.attrs) + self.ds = Dataset({"foo": self.v}) + self.dv = self.ds["foo"] + + self.mindex = pd.MultiIndex.from_product( + [["a", "b"], [1, 2]], names=("level_1", "level_2") + ) + self.mda = DataArray([0, 1, 2, 3], coords={"x": self.mindex}, dims="x") + + self.da = self.dv.copy() + self.da.coords["abc"] = ("y", np.array(["a"] * 9 + ["c"] + ["b"] * 10)) + self.da.coords["y"] = 20 + 100 * self.da["y"] + + def test_stack_groupby_unsorted_coord(self): + data = [[0, 1], [2, 3]] + data_flat = [0, 1, 2, 3] + dims = ["x", "y"] + y_vals = [2, 3] + + arr = xr.DataArray(data, dims=dims, coords={"y": y_vals}) + actual1 = arr.stack(z=dims).groupby("z").first() + midx1 = pd.MultiIndex.from_product([[0, 1], [2, 3]], names=dims) + expected1 = xr.DataArray(data_flat, dims=["z"], coords={"z": midx1}) + assert_equal(actual1, expected1) + + # GH: 3287. Note that y coord values are not in sorted order. 
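+        # groupby should preserve that unsorted order when rebuilding the MultiIndex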
+ arr = xr.DataArray(data, dims=dims, coords={"y": y_vals[::-1]}) + actual2 = arr.stack(z=dims).groupby("z").first() + midx2 = pd.MultiIndex.from_product([[0, 1], [3, 2]], names=dims) + expected2 = xr.DataArray(data_flat, dims=["z"], coords={"z": midx2}) + assert_equal(actual2, expected2) + + def test_groupby_iter(self): + for ((act_x, act_dv), (exp_x, exp_ds)) in zip( + self.dv.groupby("y"), self.ds.groupby("y") + ): + assert exp_x == act_x + assert_identical(exp_ds["foo"], act_dv) + for ((_, exp_dv), act_dv) in zip(self.dv.groupby("x"), self.dv): + assert_identical(exp_dv, act_dv) + + def test_groupby_properties(self): + grouped = self.da.groupby("abc") + expected_groups = {"a": range(0, 9), "c": [9], "b": range(10, 20)} + assert expected_groups.keys() == grouped.groups.keys() + for key in expected_groups: + assert_array_equal(expected_groups[key], grouped.groups[key]) + assert 3 == len(grouped) + + def test_groupby_map_identity(self): + expected = self.da + idx = expected.coords["y"] + + def identity(x): + return x + + for g in ["x", "y", "abc", idx]: + for shortcut in [False, True]: + for squeeze in [False, True]: + grouped = expected.groupby(g, squeeze=squeeze) + actual = grouped.map(identity, shortcut=shortcut) + assert_identical(expected, actual) + + def test_groupby_sum(self): + array = self.da + grouped = array.groupby("abc") + + expected_sum_all = Dataset( + { + "foo": Variable( + ["abc"], + np.array( + [ + self.x[:, :9].sum(), + self.x[:, 10:].sum(), + self.x[:, 9:10].sum(), + ] + ).T, + ), + "abc": Variable(["abc"], np.array(["a", "b", "c"])), + } + )["foo"] + assert_allclose(expected_sum_all, grouped.reduce(np.sum, dim=...)) + assert_allclose(expected_sum_all, grouped.sum(...)) + + expected = DataArray( + [ + array["y"].values[idx].sum() + for idx in [slice(9), slice(10, None), slice(9, 10)] + ], + [["a", "b", "c"]], + ["abc"], + ) + actual = array["y"].groupby("abc").map(np.sum) + assert_allclose(expected, actual) + actual = array["y"].groupby("abc").sum(...) 
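+        # `...` (Ellipsis) reduces over all dimensions of each group, matching np.sum above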
+ assert_allclose(expected, actual) + + expected_sum_axis1 = Dataset( + { + "foo": ( + ["x", "abc"], + np.array( + [ + self.x[:, :9].sum(1), + self.x[:, 10:].sum(1), + self.x[:, 9:10].sum(1), + ] + ).T, + ), + "abc": Variable(["abc"], np.array(["a", "b", "c"])), + } + )["foo"] + assert_allclose(expected_sum_axis1, grouped.reduce(np.sum, "y")) + assert_allclose(expected_sum_axis1, grouped.sum("y")) + + def test_groupby_sum_default(self): + array = self.da + grouped = array.groupby("abc") + + expected_sum_all = Dataset( + { + "foo": Variable( + ["x", "abc"], + np.array( + [ + self.x[:, :9].sum(axis=-1), + self.x[:, 10:].sum(axis=-1), + self.x[:, 9:10].sum(axis=-1), + ] + ).T, + ), + "abc": Variable(["abc"], np.array(["a", "b", "c"])), + } + )["foo"] + + assert_allclose(expected_sum_all, grouped.sum(dim="y")) + + def test_groupby_count(self): + array = DataArray( + [0, 0, np.nan, np.nan, 0, 0], + coords={"cat": ("x", ["a", "b", "b", "c", "c", "c"])}, + dims="x", + ) + actual = array.groupby("cat").count() + expected = DataArray([1, 1, 2], coords=[("cat", ["a", "b", "c"])]) + assert_identical(actual, expected) + + @pytest.mark.skip("needs to be fixed for shortcut=False, keep_attrs=False") + def test_groupby_reduce_attrs(self): + array = self.da + array.attrs["foo"] = "bar" + + for shortcut in [True, False]: + for keep_attrs in [True, False]: + print(f"shortcut={shortcut}, keep_attrs={keep_attrs}") + actual = array.groupby("abc").reduce( + np.mean, keep_attrs=keep_attrs, shortcut=shortcut + ) + expected = array.groupby("abc").mean() + if keep_attrs: + expected.attrs["foo"] = "bar" + assert_identical(expected, actual) + + def test_groupby_map_center(self): + def center(x): + return x - np.mean(x) + + array = self.da + grouped = array.groupby("abc") + + expected_ds = array.to_dataset() + exp_data = np.hstack( + [center(self.x[:, :9]), center(self.x[:, 9:10]), center(self.x[:, 10:])] + ) + expected_ds["foo"] = (["x", "y"], exp_data) + expected_centered = expected_ds["foo"] + assert_allclose(expected_centered, grouped.map(center)) + + def test_groupby_map_ndarray(self): + # regression test for #326 + array = self.da + grouped = array.groupby("abc") + actual = grouped.map(np.asarray) + assert_equal(array, actual) + + def test_groupby_map_changes_metadata(self): + def change_metadata(x): + x.coords["x"] = x.coords["x"] * 2 + x.attrs["fruit"] = "lemon" + return x + + array = self.da + grouped = array.groupby("abc") + actual = grouped.map(change_metadata) + expected = array.copy() + expected = change_metadata(expected) + assert_equal(expected, actual) + + def test_groupby_math(self): + array = self.da + for squeeze in [True, False]: + grouped = array.groupby("x", squeeze=squeeze) + + expected = array + array.coords["x"] + actual = grouped + array.coords["x"] + assert_identical(expected, actual) + + actual = array.coords["x"] + grouped + assert_identical(expected, actual) + + ds = array.coords["x"].to_dataset(name="X") + expected = array + ds + actual = grouped + ds + assert_identical(expected, actual) + + actual = ds + grouped + assert_identical(expected, actual) + + grouped = array.groupby("abc") + expected_agg = (grouped.mean(...) - np.arange(3)).rename(None) + actual = grouped - DataArray(range(3), [("abc", ["a", "b", "c"])]) + actual_agg = actual.groupby("abc").mean(...) 
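+        # the per-group subtraction shifts the group means for a, b, c by 0, 1, 2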
+ assert_allclose(expected_agg, actual_agg) + + with pytest.raises(TypeError, match=r"only support binary ops"): + grouped + 1 + with pytest.raises(TypeError, match=r"only support binary ops"): + grouped + grouped + with pytest.raises(TypeError, match=r"in-place operations"): + array += grouped + + def test_groupby_math_not_aligned(self): + array = DataArray( + range(4), {"b": ("x", [0, 0, 1, 1]), "x": [0, 1, 2, 3]}, dims="x" + ) + other = DataArray([10], coords={"b": [0]}, dims="b") + actual = array.groupby("b") + other + expected = DataArray([10, 11, np.nan, np.nan], array.coords) + assert_identical(expected, actual) + + other = DataArray([10], coords={"c": 123, "b": [0]}, dims="b") + actual = array.groupby("b") + other + expected.coords["c"] = (["x"], [123] * 2 + [np.nan] * 2) + assert_identical(expected, actual) + + other = Dataset({"a": ("b", [10])}, {"b": [0]}) + actual = array.groupby("b") + other + expected = Dataset({"a": ("x", [10, 11, np.nan, np.nan])}, array.coords) + assert_identical(expected, actual) + + def test_groupby_restore_dim_order(self): + array = DataArray( + np.random.randn(5, 3), + coords={"a": ("x", range(5)), "b": ("y", range(3))}, + dims=["x", "y"], + ) + for by, expected_dims in [ + ("x", ("x", "y")), + ("y", ("x", "y")), + ("a", ("a", "y")), + ("b", ("x", "b")), + ]: + result = array.groupby(by).map(lambda x: x.squeeze()) + assert result.dims == expected_dims + + def test_groupby_restore_coord_dims(self): + array = DataArray( + np.random.randn(5, 3), + coords={ + "a": ("x", range(5)), + "b": ("y", range(3)), + "c": (("x", "y"), np.random.randn(5, 3)), + }, + dims=["x", "y"], + ) + + for by, expected_dims in [ + ("x", ("x", "y")), + ("y", ("x", "y")), + ("a", ("a", "y")), + ("b", ("x", "b")), + ]: + result = array.groupby(by, restore_coord_dims=True).map( + lambda x: x.squeeze() + )["c"] + assert result.dims == expected_dims + + def test_groupby_first_and_last(self): + array = DataArray([1, 2, 3, 4, 5], dims="x") + by = DataArray(["a"] * 2 + ["b"] * 3, dims="x", name="ab") + + expected = DataArray([1, 3], [("ab", ["a", "b"])]) + actual = array.groupby(by).first() + assert_identical(expected, actual) + + expected = DataArray([2, 5], [("ab", ["a", "b"])]) + actual = array.groupby(by).last() + assert_identical(expected, actual) + + array = DataArray(np.random.randn(5, 3), dims=["x", "y"]) + expected = DataArray(array[[0, 2]], {"ab": ["a", "b"]}, ["ab", "y"]) + actual = array.groupby(by).first() + assert_identical(expected, actual) + + actual = array.groupby("x").first() + expected = array # should be a no-op + assert_identical(expected, actual) + + def make_groupby_multidim_example_array(self): + return DataArray( + [[[0, 1], [2, 3]], [[5, 10], [15, 20]]], + coords={ + "lon": (["ny", "nx"], [[30, 40], [40, 50]]), + "lat": (["ny", "nx"], [[10, 10], [20, 20]]), + }, + dims=["time", "ny", "nx"], + ) + + def test_groupby_multidim(self): + array = self.make_groupby_multidim_example_array() + for dim, expected_sum in [ + ("lon", DataArray([5, 28, 23], coords=[("lon", [30.0, 40.0, 50.0])])), + ("lat", DataArray([16, 40], coords=[("lat", [10.0, 20.0])])), + ]: + actual_sum = array.groupby(dim).sum(...) 
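+            # grouping over a 2D coordinate aggregates every cell sharing a coordinate value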
+ assert_identical(expected_sum, actual_sum) + + def test_groupby_multidim_map(self): + array = self.make_groupby_multidim_example_array() + actual = array.groupby("lon").map(lambda x: x - x.mean()) + expected = DataArray( + [[[-2.5, -6.0], [-5.0, -8.5]], [[2.5, 3.0], [8.0, 8.5]]], + coords=array.coords, + dims=array.dims, + ) + assert_identical(expected, actual) + + def test_groupby_bins(self): + array = DataArray(np.arange(4), dims="dim_0") + # the first value should not be part of any group ("right" binning) + array[0] = 99 + # bins follow conventions for pandas.cut + # http://pandas.pydata.org/pandas-docs/stable/generated/pandas.cut.html + bins = [0, 1.5, 5] + bin_coords = pd.cut(array["dim_0"], bins).categories + expected = DataArray( + [1, 5], dims="dim_0_bins", coords={"dim_0_bins": bin_coords} + ) + # the problem with this is that it overwrites the dimensions of array! + # actual = array.groupby('dim_0', bins=bins).sum() + actual = array.groupby_bins("dim_0", bins).map(lambda x: x.sum()) + assert_identical(expected, actual) + # make sure original array dims are unchanged + assert len(array.dim_0) == 4 + + def test_groupby_bins_empty(self): + array = DataArray(np.arange(4), [("x", range(4))]) + # one of these bins will be empty + bins = [0, 4, 5] + bin_coords = pd.cut(array["x"], bins).categories + actual = array.groupby_bins("x", bins).sum() + expected = DataArray([6, np.nan], dims="x_bins", coords={"x_bins": bin_coords}) + assert_identical(expected, actual) + # make sure original array is unchanged + # (was a problem in earlier versions) + assert len(array.x) == 4 + + def test_groupby_bins_multidim(self): + array = self.make_groupby_multidim_example_array() + bins = [0, 15, 20] + bin_coords = pd.cut(array["lat"].values.flat, bins).categories + expected = DataArray([16, 40], dims="lat_bins", coords={"lat_bins": bin_coords}) + actual = array.groupby_bins("lat", bins).map(lambda x: x.sum()) + assert_identical(expected, actual) + # modify the array coordinates to be non-monotonic after unstacking + array["lat"].data = np.array([[10.0, 20.0], [20.0, 10.0]]) + expected = DataArray([28, 28], dims="lat_bins", coords={"lat_bins": bin_coords}) + actual = array.groupby_bins("lat", bins).map(lambda x: x.sum()) + assert_identical(expected, actual) + + def test_groupby_bins_sort(self): + data = xr.DataArray( + np.arange(100), dims="x", coords={"x": np.linspace(-100, 100, num=100)} + ) + binned_mean = data.groupby_bins("x", bins=11).mean() + assert binned_mean.to_index().is_monotonic + + def test_groupby_assign_coords(self): + + array = DataArray([1, 2, 3, 4], {"c": ("x", [0, 0, 1, 1])}, dims="x") + actual = array.groupby("c").assign_coords(d=lambda a: a.mean()) + expected = array.copy() + expected.coords["d"] = ("x", [1.5, 1.5, 3.5, 3.5]) + assert_identical(actual, expected) + + def test_groupby_fillna(self): + a = DataArray([np.nan, 1, np.nan, 3], coords={"x": range(4)}, dims="x") + fill_value = DataArray([0, 1], dims="y") + actual = a.fillna(fill_value) + expected = DataArray( + [[0, 1], [1, 1], [0, 1], [3, 3]], coords={"x": range(4)}, dims=("x", "y") + ) + assert_identical(expected, actual) + + b = DataArray(range(4), coords={"x": range(4)}, dims="x") + expected = b.copy() + for target in [a, expected]: + target.coords["b"] = ("x", [0, 0, 1, 1]) + actual = a.groupby("b").fillna(DataArray([0, 2], dims="b")) + assert_identical(expected, actual) + + +class TestDataArrayResample: + def test_resample(self): + times = pd.date_range("2000-01-01", freq="6H", periods=10) + array = 
DataArray(np.arange(10), [("time", times)]) + + actual = array.resample(time="24H").mean() + expected = DataArray(array.to_series().resample("24H").mean()) + assert_identical(expected, actual) + + actual = array.resample(time="24H").reduce(np.mean) + assert_identical(expected, actual) + + # Our use of `loffset` may change if we align our API with pandas' changes. + # ref https://github.com/pydata/xarray/pull/4537 + actual = array.resample(time="24H", loffset="-12H").mean() + expected_ = array.to_series().resample("24H").mean() + expected_.index += to_offset("-12H") + expected = DataArray.from_series(expected_) + assert_identical(actual, expected) + + with pytest.raises(ValueError, match=r"index must be monotonic"): + array[[2, 0, 1]].resample(time="1D") + + def test_da_resample_func_args(self): + def func(arg1, arg2, arg3=0.0): + return arg1.mean("time") + arg2 + arg3 + + times = pd.date_range("2000", periods=3, freq="D") + da = xr.DataArray([1.0, 1.0, 1.0], coords=[times], dims=["time"]) + expected = xr.DataArray([3.0, 3.0, 3.0], coords=[times], dims=["time"]) + actual = da.resample(time="D").map(func, args=(1.0,), arg3=1.0) + assert_identical(actual, expected) + + def test_resample_first(self): + times = pd.date_range("2000-01-01", freq="6H", periods=10) + array = DataArray(np.arange(10), [("time", times)]) + + actual = array.resample(time="1D").first() + expected = DataArray([0, 4, 8], [("time", times[::4])]) + assert_identical(expected, actual) + + # verify that labels don't use the first value + actual = array.resample(time="24H").first() + expected = DataArray(array.to_series().resample("24H").first()) + assert_identical(expected, actual) + + # missing values + array = array.astype(float) + array[:2] = np.nan + actual = array.resample(time="1D").first() + expected = DataArray([2, 4, 8], [("time", times[::4])]) + assert_identical(expected, actual) + + actual = array.resample(time="1D").first(skipna=False) + expected = DataArray([np.nan, 4, 8], [("time", times[::4])]) + assert_identical(expected, actual) + + # regression test for http://stackoverflow.com/questions/33158558/ + array = Dataset({"time": times})["time"] + actual = array.resample(time="1D").last() + expected_times = pd.to_datetime( + ["2000-01-01T18", "2000-01-02T18", "2000-01-03T06"] + ) + expected = DataArray(expected_times, [("time", times[::4])], name="time") + assert_identical(expected, actual) + + def test_resample_bad_resample_dim(self): + times = pd.date_range("2000-01-01", freq="6H", periods=10) + array = DataArray(np.arange(10), [("__resample_dim__", times)]) + with pytest.raises(ValueError, match=r"Proxy resampling dimension"): + array.resample(**{"__resample_dim__": "1D"}).first() + + @requires_scipy + def test_resample_drop_nondim_coords(self): + xs = np.arange(6) + ys = np.arange(3) + times = pd.date_range("2000-01-01", freq="6H", periods=5) + data = np.tile(np.arange(5), (6, 3, 1)) + xx, yy = np.meshgrid(xs * 5, ys * 2.5) + tt = np.arange(len(times), dtype=int) + array = DataArray(data, {"time": times, "x": xs, "y": ys}, ("x", "y", "time")) + xcoord = DataArray(xx.T, {"x": xs, "y": ys}, ("x", "y")) + ycoord = DataArray(yy.T, {"x": xs, "y": ys}, ("x", "y")) + tcoord = DataArray(tt, {"time": times}, ("time",)) + ds = Dataset({"data": array, "xc": xcoord, "yc": ycoord, "tc": tcoord}) + ds = ds.set_coords(["xc", "yc", "tc"]) + + # Select the data now, with the auxiliary coordinates in place + array = ds["data"] + + # Re-sample + actual = array.resample(time="12H", restore_coord_dims=True).mean("time") + assert 
"tc" not in actual.coords + + # Up-sample - filling + actual = array.resample(time="1H", restore_coord_dims=True).ffill() + assert "tc" not in actual.coords + + # Up-sample - interpolation + actual = array.resample(time="1H", restore_coord_dims=True).interpolate( + "linear" + ) + assert "tc" not in actual.coords + + def test_resample_keep_attrs(self): + times = pd.date_range("2000-01-01", freq="6H", periods=10) + array = DataArray(np.ones(10), [("time", times)]) + array.attrs["meta"] = "data" + + result = array.resample(time="1D").mean(keep_attrs=True) + expected = DataArray([1, 1, 1], [("time", times[::4])], attrs=array.attrs) + assert_identical(result, expected) + + with pytest.warns( + UserWarning, match="Passing ``keep_attrs`` to ``resample`` has no effect." + ): + array.resample(time="1D", keep_attrs=True) + + def test_resample_skipna(self): + times = pd.date_range("2000-01-01", freq="6H", periods=10) + array = DataArray(np.ones(10), [("time", times)]) + array[1] = np.nan + + result = array.resample(time="1D").mean(skipna=False) + expected = DataArray([np.nan, 1, 1], [("time", times[::4])]) + assert_identical(result, expected) + + def test_upsample(self): + times = pd.date_range("2000-01-01", freq="6H", periods=5) + array = DataArray(np.arange(5), [("time", times)]) + + # Forward-fill + actual = array.resample(time="3H").ffill() + expected = DataArray(array.to_series().resample("3H").ffill()) + assert_identical(expected, actual) + + # Backward-fill + actual = array.resample(time="3H").bfill() + expected = DataArray(array.to_series().resample("3H").bfill()) + assert_identical(expected, actual) + + # As frequency + actual = array.resample(time="3H").asfreq() + expected = DataArray(array.to_series().resample("3H").asfreq()) + assert_identical(expected, actual) + + # Pad + actual = array.resample(time="3H").pad() + expected = DataArray(array.to_series().resample("3H").pad()) + assert_identical(expected, actual) + + # Nearest + rs = array.resample(time="3H") + actual = rs.nearest() + new_times = rs._full_index + expected = DataArray(array.reindex(time=new_times, method="nearest")) + assert_identical(expected, actual) + + def test_upsample_nd(self): + # Same as before, but now we try on multi-dimensional DataArrays. 
+        xs = np.arange(6)
+        ys = np.arange(3)
+        times = pd.date_range("2000-01-01", freq="6H", periods=5)
+        data = np.tile(np.arange(5), (6, 3, 1))
+        array = DataArray(data, {"time": times, "x": xs, "y": ys}, ("x", "y", "time"))
+
+        # Forward-fill
+        actual = array.resample(time="3H").ffill()
+        expected_data = np.repeat(data, 2, axis=-1)
+        expected_times = times.to_series().resample("3H").asfreq().index
+        expected_data = expected_data[..., : len(expected_times)]
+        expected = DataArray(
+            expected_data,
+            {"time": expected_times, "x": xs, "y": ys},
+            ("x", "y", "time"),
+        )
+        assert_identical(expected, actual)
+
+        # Backward-fill
+        actual = array.resample(time="3H").bfill()
+        expected_data = np.repeat(np.flipud(data.T).T, 2, axis=-1)
+        expected_data = np.flipud(expected_data.T).T
+        expected_times = times.to_series().resample("3H").asfreq().index
+        # flipping, repeating and flipping back yields bfill shifted by one
+        # point, so drop the leading value instead of the trailing one
+        expected_data = expected_data[..., 1:]
+        expected = DataArray(
+            expected_data,
+            {"time": expected_times, "x": xs, "y": ys},
+            ("x", "y", "time"),
+        )
+        assert_identical(expected, actual)
+
+        # As frequency
+        actual = array.resample(time="3H").asfreq()
+        expected_data = np.repeat(data, 2, axis=-1).astype(float)[..., :-1]
+        expected_data[..., 1::2] = np.nan
+        expected_times = times.to_series().resample("3H").asfreq().index
+        expected = DataArray(
+            expected_data,
+            {"time": expected_times, "x": xs, "y": ys},
+            ("x", "y", "time"),
+        )
+        assert_identical(expected, actual)
+
+        # Pad
+        actual = array.resample(time="3H").pad()
+        expected_data = np.repeat(data, 2, axis=-1)
+        expected_data[..., 1::2] = expected_data[..., ::2]
+        expected_data = expected_data[..., :-1]
+        expected_times = times.to_series().resample("3H").asfreq().index
+        expected = DataArray(
+            expected_data,
+            {"time": expected_times, "x": xs, "y": ys},
+            ("x", "y", "time"),
+        )
+        assert_identical(expected, actual)
+
+    def test_upsample_tolerance(self):
+        # Test tolerance keyword for upsample methods bfill, pad, nearest
+        times = pd.date_range("2000-01-01", freq="1D", periods=2)
+        times_upsampled = pd.date_range("2000-01-01", freq="6H", periods=5)
+        array = DataArray(np.arange(2), [("time", times)])
+
+        # Forward fill
+        actual = array.resample(time="6H").ffill(tolerance="12H")
+        expected = DataArray([0.0, 0.0, 0.0, np.nan, 1.0], [("time", times_upsampled)])
+        assert_identical(expected, actual)
+
+        # Backward fill
+        actual = array.resample(time="6H").bfill(tolerance="12H")
+        expected = DataArray([0.0, np.nan, 1.0, 1.0, 1.0], [("time", times_upsampled)])
+        assert_identical(expected, actual)
+
+        # Nearest
+        actual = array.resample(time="6H").nearest(tolerance="6H")
+        expected = DataArray([0, 0, np.nan, 1, 1], [("time", times_upsampled)])
+        assert_identical(expected, actual)
+
+    @requires_scipy
+    def test_upsample_interpolate(self):
+        from scipy.interpolate import interp1d
+
+        xs = np.arange(6)
+        ys = np.arange(3)
+        times = pd.date_range("2000-01-01", freq="6H", periods=5)
+
+        z = np.arange(5) ** 2
+        data = np.tile(z, (6, 3, 1))
+        array = DataArray(data, {"time": times, "x": xs, "y": ys}, ("x", "y", "time"))
+
+        expected_times = times.to_series().resample("1H").asfreq().index
+        # Split the times into equal sub-intervals to simulate the 6 hour
+        # to 1 hour up-sampling
+        new_times_idx = np.linspace(0, len(times) - 1, len(times) * 5)
+        for kind in ["linear", "nearest", "zero", "slinear", "quadratic", "cubic"]:
+            actual = array.resample(time="1H").interpolate(kind)
+            f = interp1d(
+                np.arange(len(times)),
+                data,
+                kind=kind,
+                axis=-1,
+                bounds_error=True,
+                assume_sorted=True,
+            )
+            
+            expected_data = f(new_times_idx)
+            expected = DataArray(
+                expected_data,
+                {"time": expected_times, "x": xs, "y": ys},
+                ("x", "y", "time"),
+            )
+            # Use assert_allclose: upsampling through the time index and the
+            # integer-based indexing used here differ slightly due to
+            # floating-point arithmetic
+            assert_allclose(expected, actual, rtol=1e-16)
+
+    @requires_scipy
+    def test_upsample_interpolate_bug_2197(self):
+        dates = pd.date_range("2007-02-01", "2007-03-01", freq="D")
+        da = xr.DataArray(np.arange(len(dates)), [("time", dates)])
+        result = da.resample(time="M").interpolate("linear")
+        expected_times = np.array(
+            [np.datetime64("2007-02-28"), np.datetime64("2007-03-31")]
+        )
+        expected = xr.DataArray([27.0, np.nan], [("time", expected_times)])
+        assert_equal(result, expected)
+
+    @requires_scipy
+    def test_upsample_interpolate_regression_1605(self):
+        dates = pd.date_range("2016-01-01", "2016-03-31", freq="1D")
+        expected = xr.DataArray(
+            np.random.random((len(dates), 2, 3)),
+            dims=("time", "x", "y"),
+            coords={"time": dates},
+        )
+        actual = expected.resample(time="1D").interpolate("linear")
+        assert_allclose(actual, expected, rtol=1e-16)
+
+    @requires_dask
+    @requires_scipy
+    @pytest.mark.parametrize("chunked_time", [True, False])
+    def test_upsample_interpolate_dask(self, chunked_time):
+        from scipy.interpolate import interp1d
+
+        xs = np.arange(6)
+        ys = np.arange(3)
+        times = pd.date_range("2000-01-01", freq="6H", periods=5)
+
+        z = np.arange(5) ** 2
+        data = np.tile(z, (6, 3, 1))
+        array = DataArray(data, {"time": times, "x": xs, "y": ys}, ("x", "y", "time"))
+        chunks = {"x": 2, "y": 1}
+        if chunked_time:
+            chunks["time"] = 3
+
+        expected_times = times.to_series().resample("1H").asfreq().index
+        # Split the times into equal sub-intervals to simulate the 6-hour
+        # to 1-hour up-sampling
+        new_times_idx = np.linspace(0, len(times) - 1, len(times) * 5)
+        for kind in ["linear", "nearest", "zero", "slinear", "quadratic", "cubic"]:
+            actual = array.chunk(chunks).resample(time="1H").interpolate(kind)
+            actual = actual.compute()
+            f = interp1d(
+                np.arange(len(times)),
+                data,
+                kind=kind,
+                axis=-1,
+                bounds_error=True,
+                assume_sorted=True,
+            )
+            expected_data = f(new_times_idx)
+            expected = DataArray(
+                expected_data,
+                {"time": expected_times, "x": xs, "y": ys},
+                ("x", "y", "time"),
+            )
+            # Use assert_allclose: upsampling through the time index and the
+            # integer-based indexing used here differ slightly due to
+            # floating-point arithmetic
+            assert_allclose(expected, actual, rtol=1e-16)
+
+
+class TestDatasetResample:
+    def test_resample_and_first(self):
+        times = pd.date_range("2000-01-01", freq="6H", periods=10)
+        ds = Dataset(
+            {
+                "foo": (["time", "x", "y"], np.random.randn(10, 5, 3)),
+                "bar": ("time", np.random.randn(10), {"meta": "data"}),
+                "time": times,
+            }
+        )
+
+        actual = ds.resample(time="1D").first(keep_attrs=True)
+        expected = ds.isel(time=[0, 4, 8])
+        assert_identical(expected, actual)
+
+        # upsampling
+        expected_time = pd.date_range("2000-01-01", freq="3H", periods=19)
+        expected = ds.reindex(time=expected_time)
+        actual = ds.resample(time="3H")
+        for how in ["mean", "sum", "first", "last"]:
+            method = getattr(actual, how)
+            result = method()
+            assert_equal(expected, result)
+        for method in [np.mean]:
+            result = actual.reduce(method)
+            assert_equal(expected, result)
+
+    def test_resample_min_count(self):
+        times = pd.date_range("2000-01-01", freq="6H", periods=10)
+        ds = Dataset(
+            {
+                "foo": (["time", "x", "y"], np.random.randn(10, 5, 3)),
+                "bar": ("time", np.random.randn(10), {"meta": "data"}),
+                "time": times,
+            }
+        )
+        # inject nan
+        ds["foo"] = xr.where(ds["foo"] > 2.0, np.nan, ds["foo"])
+
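+        # With min_count=1, a 1-day bin needs at least one non-NaN sample to
+        # produce a value; an all-NaN bin sums to NaN rather than 0.0.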
+        actual = ds.resample(time="1D").sum(min_count=1)
+        expected = xr.concat(
+            [
+                ds.isel(time=slice(i * 4, (i + 1) * 4)).sum("time", min_count=1)
+                for i in range(3)
+            ],
+            dim=actual["time"],
+        )
+        assert_equal(expected, actual)
+
+    def test_resample_by_mean_with_keep_attrs(self):
+        times = pd.date_range("2000-01-01", freq="6H", periods=10)
+        ds = Dataset(
+            {
+                "foo": (["time", "x", "y"], np.random.randn(10, 5, 3)),
+                "bar": ("time", np.random.randn(10), {"meta": "data"}),
+                "time": times,
+            }
+        )
+        ds.attrs["dsmeta"] = "dsdata"
+
+        resampled_ds = ds.resample(time="1D").mean(keep_attrs=True)
+        actual = resampled_ds["bar"].attrs
+        expected = ds["bar"].attrs
+        assert expected == actual
+
+        actual = resampled_ds.attrs
+        expected = ds.attrs
+        assert expected == actual
+
+        with pytest.warns(
+            UserWarning, match="Passing ``keep_attrs`` to ``resample`` has no effect."
+        ):
+            ds.resample(time="1D", keep_attrs=True)
+
+    def test_resample_loffset(self):
+        times = pd.date_range("2000-01-01", freq="6H", periods=10)
+        ds = Dataset(
+            {
+                "foo": (["time", "x", "y"], np.random.randn(10, 5, 3)),
+                "bar": ("time", np.random.randn(10), {"meta": "data"}),
+                "time": times,
+            }
+        )
+        ds.attrs["dsmeta"] = "dsdata"
+
+        # Our use of `loffset` may change if we align our API with pandas' changes.
+        # ref https://github.com/pydata/xarray/pull/4537
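+        # `loffset` shifts only the labels of the resulting index: the bins
+        # are still 24H periods starting at midnight, but each label is moved
+        # back by 12 hours.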
+        actual = ds.resample(time="24H", loffset="-12H").mean().bar
+        expected_ = ds.bar.to_series().resample("24H").mean()
+        expected_.index += to_offset("-12H")
+        expected = DataArray.from_series(expected_)
+        assert_allclose(actual, expected)
+
+    def test_resample_by_mean_discarding_attrs(self):
+        times = pd.date_range("2000-01-01", freq="6H", periods=10)
+        ds = Dataset(
+            {
+                "foo": (["time", "x", "y"], np.random.randn(10, 5, 3)),
+                "bar": ("time", np.random.randn(10), {"meta": "data"}),
+                "time": times,
+            }
+        )
+        ds.attrs["dsmeta"] = "dsdata"
+
+        resampled_ds = ds.resample(time="1D").mean(keep_attrs=False)
+
+        assert resampled_ds["bar"].attrs == {}
+        assert resampled_ds.attrs == {}
+
+    def test_resample_by_last_discarding_attrs(self):
+        times = pd.date_range("2000-01-01", freq="6H", periods=10)
+        ds = Dataset(
+            {
+                "foo": (["time", "x", "y"], np.random.randn(10, 5, 3)),
+                "bar": ("time", np.random.randn(10), {"meta": "data"}),
+                "time": times,
+            }
+        )
+        ds.attrs["dsmeta"] = "dsdata"
+
+        resampled_ds = ds.resample(time="1D").last(keep_attrs=False)
+
+        assert resampled_ds["bar"].attrs == {}
+        assert resampled_ds.attrs == {}
+
+    @requires_scipy
+    def test_resample_drop_nondim_coords(self):
+        xs = np.arange(6)
+        ys = np.arange(3)
+        times = pd.date_range("2000-01-01", freq="6H", periods=5)
+        data = np.tile(np.arange(5), (6, 3, 1))
+        xx, yy = np.meshgrid(xs * 5, ys * 2.5)
+        tt = np.arange(len(times), dtype=int)
+        array = DataArray(data, {"time": times, "x": xs, "y": ys}, ("x", "y", "time"))
+        xcoord = DataArray(xx.T, {"x": xs, "y": ys}, ("x", "y"))
+        ycoord = DataArray(yy.T, {"x": xs, "y": ys}, ("x", "y"))
+        tcoord = DataArray(tt, {"time": times}, ("time",))
+        ds = Dataset({"data": array, "xc": xcoord, "yc": ycoord, "tc": tcoord})
+        ds = ds.set_coords(["xc", "yc", "tc"])
+
+        # Re-sample
+        actual = ds.resample(time="12H").mean("time")
+        assert "tc" not in actual.coords
+
+        # Up-sample - filling
+        actual = ds.resample(time="1H").ffill()
+        assert "tc" not in actual.coords
+
+        # Up-sample - interpolation
+        actual = ds.resample(time="1H").interpolate("linear")
+        assert "tc" not in actual.coords
+
+    def test_resample_old_api(self):
+        times = pd.date_range("2000-01-01", freq="6H", periods=10)
+        ds = Dataset(
+            {
+                "foo": (["time", "x", "y"], np.random.randn(10, 5, 3)),
+                "bar": ("time", np.random.randn(10), {"meta": "data"}),
+                "time": times,
+            }
+        )
+
+        with pytest.raises(TypeError, match=r"resample\(\) no longer supports"):
+            ds.resample("1D", "time")
+
+        with pytest.raises(TypeError, match=r"resample\(\) no longer supports"):
+            ds.resample("1D", dim="time", how="mean")
+
+        with pytest.raises(TypeError, match=r"resample\(\) no longer supports"):
+            ds.resample("1D", dim="time")
+
+    def test_resample_ds_da_are_the_same(self):
+        time = pd.date_range("2000-01-01", freq="6H", periods=365 * 4)
+        ds = xr.Dataset(
+            {
+                "foo": (("time", "x"), np.random.randn(365 * 4, 5)),
+                "time": time,
+                "x": np.arange(5),
+            }
+        )
+        assert_identical(
+            ds.resample(time="M").mean()["foo"], ds.foo.resample(time="M").mean()
+        )
+
+    def test_ds_resample_apply_func_args(self):
+        def func(arg1, arg2, arg3=0.0):
+            return arg1.mean("time") + arg2 + arg3
+
+        times = pd.date_range("2000", freq="D", periods=3)
+        ds = xr.Dataset({"foo": ("time", [1.0, 1.0, 1.0]), "time": times})
+        expected = xr.Dataset({"foo": ("time", [3.0, 3.0, 3.0]), "time": times})
+        actual = ds.resample(time="D").map(func, args=(1.0,), arg3=1.0)
+        assert_identical(expected, actual)
+
+
+# TODO: move other groupby tests from test_dataset and test_dataarray over here
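+# A minimal sketch of the kind of test the TODO above refers to (names and
+# data are illustrative, not taken from the existing suite):
+#
+#     def test_groupby_mean_example(self):
+#         ds = xr.Dataset(
+#             {"foo": ("x", [1.0, 2.0, 3.0, 4.0])},
+#             coords={"letters": ("x", ["a", "a", "b", "b"])},
+#         )
+#         actual = ds.groupby("letters").mean(...)
+#         expected = xr.Dataset(
+#             {"foo": ("letters", [1.5, 3.5])}, coords={"letters": ["a", "b"]}
+#         )
+#         assert_identical(expected, actual)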