From 1ab7295e79c822300b384f37b05cb0f0f40ac8fa Mon Sep 17 00:00:00 2001 From: Alex Goodman Date: Mon, 30 Jan 2023 15:40:58 -0600 Subject: [PATCH 01/17] Update contains_cftime_datetimes to avoid loading entire variable array --- xarray/core/common.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/xarray/core/common.py b/xarray/core/common.py index 095d15e32f1..81438fdf9fa 100644 --- a/xarray/core/common.py +++ b/xarray/core/common.py @@ -1790,7 +1790,10 @@ def _contains_cftime_datetimes(array) -> bool: def contains_cftime_datetimes(var) -> bool: """Check if an xarray.Variable contains cftime.datetime objects""" if var.dtype == np.dtype("O") and var.size > 0: - return _contains_cftime_datetimes(var.data) + ndims = len(var.shape) + first_idx = tuple(np.zeros(ndims, dtype="int32")) + array = var[first_idx].data + return _contains_cftime_datetimes(array) else: return False From 5132446489fb676ef4620c384d55afbbd2f18691 Mon Sep 17 00:00:00 2001 From: Alex Goodman Date: Mon, 30 Jan 2023 15:55:46 -0600 Subject: [PATCH 02/17] Update whats-new.rst --- doc/whats-new.rst | 2 ++ 1 file changed, 2 insertions(+) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 26bd72b0727..e7ee37a12ba 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -55,6 +55,8 @@ Bug fixes By `Kai Mühlbauer `_ and `Scott Chamberlin `_. - Handle ``keep_attrs`` option in binary operators of :py:meth:`Dataset` (:issue:`7390`, :pull:`7391`). By `Aron Gergely `_. +- Improved performance in ``open_dataset`` for datasets with large object arrays (:issue:`7484`, :pull:`7494`). + By `Alex Goodman `_. Documentation ~~~~~~~~~~~~~ From cb70040581eea282cb6d3a960428e511af3e2cd2 Mon Sep 17 00:00:00 2001 From: Illviljan <14371165+Illviljan@users.noreply.github.com> Date: Tue, 31 Jan 2023 23:45:06 +0100 Subject: [PATCH 03/17] Convert arrays to variable instead for better control --- xarray/core/accessor_dt.py | 2 +- xarray/core/common.py | 47 +++++++++++++++++++++----------------- 2 files changed, 27 insertions(+), 22 deletions(-) diff --git a/xarray/core/accessor_dt.py b/xarray/core/accessor_dt.py index 5c67af16d99..c7ba95f4298 100644 --- a/xarray/core/accessor_dt.py +++ b/xarray/core/accessor_dt.py @@ -575,7 +575,7 @@ def __new__(cls, obj: T_DataArray) -> CombinedDatetimelikeAccessor: # we need to choose which parent (datetime or timedelta) is # appropriate. Since we're checking the dtypes anyway, we'll just # do all the validation here. - if not _contains_datetime_like_objects(obj): + if not _contains_datetime_like_objects(obj.variable): raise TypeError( "'.dt' accessor only available for " "DataArray with datetime64 timedelta64 dtype or " diff --git a/xarray/core/common.py b/xarray/core/common.py index 81438fdf9fa..bfffb753b7d 100644 --- a/xarray/core/common.py +++ b/xarray/core/common.py @@ -13,7 +13,7 @@ from xarray.core import dtypes, duck_array_ops, formatting, formatting_html, ops from xarray.core.options import OPTIONS, _get_keep_attrs from xarray.core.pycompat import is_duck_dask_array -from xarray.core.utils import Frozen, either_dict_or_kwargs, is_scalar +from xarray.core.utils import Frozen, either_dict_or_kwargs, is_duck_array, is_scalar try: import cftime @@ -40,6 +40,7 @@ ScalarOrArray, SideOptions, T_DataWithCoords, + T_Variable, ) from xarray.core.variable import Variable @@ -1771,34 +1772,38 @@ def is_np_timedelta_like(dtype: DTypeLike) -> bool: return np.issubdtype(dtype, np.timedelta64) -def _contains_cftime_datetimes(array) -> bool: +def _contains_cftime_datetimes(array: Any) -> bool: """Check if an array contains cftime.datetime objects""" + from xarray.core.variable import Variable + + var: T_Variable + if isinstance(array, Variable): + var = array + elif is_duck_array(array): + var = Variable(dims=tuple(f"dim_{v}" for v in range(array.ndim)), data=array) + + return _variable_contains_cftime_datetimes(var) + + +def _variable_contains_cftime_datetimes(var: T_Variable) -> bool: + """Check if an xarray.Variable contains cftime.datetime objects""" if cftime is None: return False - else: - if array.dtype == np.dtype("O") and array.size > 0: - sample = np.asarray(array).flat[0] - if is_duck_dask_array(sample): - sample = sample.compute() - if isinstance(sample, np.ndarray): - sample = sample.item() - return isinstance(sample, cftime.datetime) - else: - return False + + if var.dtype == np.dtype("O") and var.size > 0: + first_idx = tuple(0 for _ in range(var.ndim)) + sample = var[first_idx] + return isinstance(sample.to_numpy().item(), cftime.datetime) + + return False -def contains_cftime_datetimes(var) -> bool: +def contains_cftime_datetimes(var: T_Variable) -> bool: """Check if an xarray.Variable contains cftime.datetime objects""" - if var.dtype == np.dtype("O") and var.size > 0: - ndims = len(var.shape) - first_idx = tuple(np.zeros(ndims, dtype="int32")) - array = var[first_idx].data - return _contains_cftime_datetimes(array) - else: - return False + return _variable_contains_cftime_datetimes(var) -def _contains_datetime_like_objects(var) -> bool: +def _contains_datetime_like_objects(var: T_Variable) -> bool: """Check if a variable contains datetime like objects (either np.datetime64, np.timedelta64, or cftime.datetime) """ From 02edc5991679bb8bfb1d52d254c59678931317ef Mon Sep 17 00:00:00 2001 From: Illviljan <14371165+Illviljan@users.noreply.github.com> Date: Tue, 31 Jan 2023 23:52:40 +0100 Subject: [PATCH 04/17] fix mypy? --- xarray/core/common.py | 1 - 1 file changed, 1 deletion(-) diff --git a/xarray/core/common.py b/xarray/core/common.py index bfffb753b7d..a3ccca90317 100644 --- a/xarray/core/common.py +++ b/xarray/core/common.py @@ -1776,7 +1776,6 @@ def _contains_cftime_datetimes(array: Any) -> bool: """Check if an array contains cftime.datetime objects""" from xarray.core.variable import Variable - var: T_Variable if isinstance(array, Variable): var = array elif is_duck_array(array): From b5b7bae0b4e18ed8bae35adde741aeb13f81ac24 Mon Sep 17 00:00:00 2001 From: Illviljan <14371165+Illviljan@users.noreply.github.com> Date: Wed, 1 Feb 2023 06:57:45 +0100 Subject: [PATCH 05/17] Update common.py --- xarray/core/common.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/xarray/core/common.py b/xarray/core/common.py index a3ccca90317..8929c001ec8 100644 --- a/xarray/core/common.py +++ b/xarray/core/common.py @@ -13,7 +13,7 @@ from xarray.core import dtypes, duck_array_ops, formatting, formatting_html, ops from xarray.core.options import OPTIONS, _get_keep_attrs from xarray.core.pycompat import is_duck_dask_array -from xarray.core.utils import Frozen, either_dict_or_kwargs, is_duck_array, is_scalar +from xarray.core.utils import Frozen, either_dict_or_kwargs, is_scalar try: import cftime @@ -1778,7 +1778,7 @@ def _contains_cftime_datetimes(array: Any) -> bool: if isinstance(array, Variable): var = array - elif is_duck_array(array): + else: var = Variable(dims=tuple(f"dim_{v}" for v in range(array.ndim)), data=array) return _variable_contains_cftime_datetimes(var) From b191a5148ead9c9e2a4032b9d03521570790c4be Mon Sep 17 00:00:00 2001 From: Illviljan <14371165+Illviljan@users.noreply.github.com> Date: Wed, 1 Feb 2023 12:34:19 +0100 Subject: [PATCH 06/17] Update xarray/core/common.py Co-authored-by: Mathias Hauser --- xarray/core/common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xarray/core/common.py b/xarray/core/common.py index 8929c001ec8..b30503957f3 100644 --- a/xarray/core/common.py +++ b/xarray/core/common.py @@ -1790,7 +1790,7 @@ def _variable_contains_cftime_datetimes(var: T_Variable) -> bool: return False if var.dtype == np.dtype("O") and var.size > 0: - first_idx = tuple(0 for _ in range(var.ndim)) + first_idx = (0,) * var.ndim sample = var[first_idx] return isinstance(sample.to_numpy().item(), cftime.datetime) From 3c099486310ce4ba14c82853bc1cf3a0fc12f48a Mon Sep 17 00:00:00 2001 From: Alex Goodman Date: Tue, 14 Feb 2023 14:49:23 -0600 Subject: [PATCH 07/17] Update common.py remove _variable_contains_cftime_datetimes --- xarray/core/common.py | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/xarray/core/common.py b/xarray/core/common.py index b2e0bbe4bcc..fa1a8393935 100644 --- a/xarray/core/common.py +++ b/xarray/core/common.py @@ -1780,10 +1780,10 @@ def _contains_cftime_datetimes(array: Any) -> bool: else: var = Variable(dims=tuple(f"dim_{v}" for v in range(array.ndim)), data=array) - return _variable_contains_cftime_datetimes(var) + return contains_cftime_datetimes(var) -def _variable_contains_cftime_datetimes(var: T_Variable) -> bool: +def contains_cftime_datetimes(var: T_Variable) -> bool: """Check if an xarray.Variable contains cftime.datetime objects""" if cftime is None: return False @@ -1796,11 +1796,6 @@ def _variable_contains_cftime_datetimes(var: T_Variable) -> bool: return False -def contains_cftime_datetimes(var: T_Variable) -> bool: - """Check if an xarray.Variable contains cftime.datetime objects""" - return _variable_contains_cftime_datetimes(var) - - def _contains_datetime_like_objects(var: T_Variable) -> bool: """Check if a variable contains datetime like objects (either np.datetime64, np.timedelta64, or cftime.datetime) From 55c3f3d8339309c8e9366029381594967cf0afa4 Mon Sep 17 00:00:00 2001 From: dcherian Date: Tue, 28 Feb 2023 20:44:56 -0700 Subject: [PATCH 08/17] Avoid creating variable. --- xarray/core/common.py | 23 +++++++++-------------- 1 file changed, 9 insertions(+), 14 deletions(-) diff --git a/xarray/core/common.py b/xarray/core/common.py index fa1a8393935..518cf030113 100644 --- a/xarray/core/common.py +++ b/xarray/core/common.py @@ -1771,27 +1771,22 @@ def is_np_timedelta_like(dtype: DTypeLike) -> bool: return np.issubdtype(dtype, np.timedelta64) -def _contains_cftime_datetimes(array: Any) -> bool: +def contains_cftime_datetimes(var: T_Variable) -> bool: """Check if an array contains cftime.datetime objects""" - from xarray.core.variable import Variable - - if isinstance(array, Variable): - var = array - else: - var = Variable(dims=tuple(f"dim_{v}" for v in range(array.ndim)), data=array) - - return contains_cftime_datetimes(var) + if cftime is None: + return False + return _contains_cftime_datetimes(var._data) -def contains_cftime_datetimes(var: T_Variable) -> bool: +def _contains_cftime_datetimes(array: Any) -> bool: """Check if an xarray.Variable contains cftime.datetime objects""" if cftime is None: return False - if var.dtype == np.dtype("O") and var.size > 0: - first_idx = (0,) * var.ndim - sample = var[first_idx] - return isinstance(sample.to_numpy().item(), cftime.datetime) + if array.dtype == np.dtype("O") and array.size > 0: + first_idx = (0,) * array.ndim + sample = array[first_idx] + return isinstance(np.asarray(sample).item(), cftime.datetime) return False From ded5dd1091405d0a66e794f2a0ef748985c9707e Mon Sep 17 00:00:00 2001 From: dcherian Date: Tue, 28 Feb 2023 20:58:30 -0700 Subject: [PATCH 09/17] Add test --- xarray/core/common.py | 7 +++++-- xarray/tests/__init__.py | 11 +++++++++++ xarray/tests/test_accessor_dt.py | 16 +++++++++++++++- 3 files changed, 31 insertions(+), 3 deletions(-) diff --git a/xarray/core/common.py b/xarray/core/common.py index 518cf030113..69a7ba71e8a 100644 --- a/xarray/core/common.py +++ b/xarray/core/common.py @@ -11,6 +11,7 @@ import pandas as pd from xarray.core import dtypes, duck_array_ops, formatting, formatting_html, ops +from xarray.core.indexing import BasicIndexer, ExplicitlyIndexed from xarray.core.options import OPTIONS, _get_keep_attrs from xarray.core.pycompat import is_duck_dask_array from xarray.core.utils import Frozen, either_dict_or_kwargs, is_scalar @@ -1772,19 +1773,21 @@ def is_np_timedelta_like(dtype: DTypeLike) -> bool: def contains_cftime_datetimes(var: T_Variable) -> bool: - """Check if an array contains cftime.datetime objects""" + """Check if an xarray.Variable contains cftime.datetime objects""" if cftime is None: return False return _contains_cftime_datetimes(var._data) def _contains_cftime_datetimes(array: Any) -> bool: - """Check if an xarray.Variable contains cftime.datetime objects""" + """Check if a array inside a Variable contains cftime.datetime objects""" if cftime is None: return False if array.dtype == np.dtype("O") and array.size > 0: first_idx = (0,) * array.ndim + if isinstance(array, ExplicitlyIndexed): + first_idx = BasicIndexer(first_idx) sample = array[first_idx] return isinstance(np.asarray(sample).item(), cftime.datetime) diff --git a/xarray/tests/__init__.py b/xarray/tests/__init__.py index ea556847ed8..38d1cb49718 100644 --- a/xarray/tests/__init__.py +++ b/xarray/tests/__init__.py @@ -145,6 +145,17 @@ def __getitem__(self, key): raise UnexpectedDataAccess("Tried accessing data") +class FirstElementAccessibleArray(utils.NDArrayMixin, ExplicitlyIndexed): + def __init__(self, array): + self.array = array + + def __getitem__(self, key): + tuple_idxr = key.tuple + if len(tuple_idxr) > 1: + raise UnexpectedDataAccess("Tried accessing more than one element.") + return self.array[tuple_idxr] + + class ReturnItem: def __getitem__(self, key): return key diff --git a/xarray/tests/test_accessor_dt.py b/xarray/tests/test_accessor_dt.py index aabdf6a5c75..04e62d2c16e 100644 --- a/xarray/tests/test_accessor_dt.py +++ b/xarray/tests/test_accessor_dt.py @@ -6,6 +6,7 @@ import xarray as xr from xarray.tests import ( + FirstElementAccessibleArray, assert_array_equal, assert_chunks_equal, assert_equal, @@ -418,7 +419,6 @@ def test_calendar_cftime(data) -> None: assert data.time.dt.calendar == expected -@requires_cftime def test_calendar_datetime64_2d() -> None: data = xr.DataArray(np.zeros((4, 5), dtype="datetime64[ns]"), dims=("x", "y")) assert data.dt.calendar == "proleptic_gregorian" @@ -668,3 +668,17 @@ def test_cftime_round_accessor( result = cftime_rounding_dataarray.dt.round(freq) assert_identical(result, expected) + + +@requires_cftime +def test_contains_cftime_lazy() -> None: + import cftime + + from xarray.core.common import _contains_cftime_datetimes + + times = np.array( + [cftime.DatetimeGregorian(1, 1, 2, 0), cftime.DatetimeGregorian(1, 1, 2, 0)], + dtype=object, + ) + array = FirstElementAccessibleArray(times) + assert _contains_cftime_datetimes(array) From 06a8706219c045864a2147dfb067fcbb36338daf Mon Sep 17 00:00:00 2001 From: dcherian Date: Tue, 28 Feb 2023 21:11:35 -0700 Subject: [PATCH 10/17] minimize diff --- xarray/core/common.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/xarray/core/common.py b/xarray/core/common.py index 69a7ba71e8a..ab33cb67adb 100644 --- a/xarray/core/common.py +++ b/xarray/core/common.py @@ -1772,13 +1772,6 @@ def is_np_timedelta_like(dtype: DTypeLike) -> bool: return np.issubdtype(dtype, np.timedelta64) -def contains_cftime_datetimes(var: T_Variable) -> bool: - """Check if an xarray.Variable contains cftime.datetime objects""" - if cftime is None: - return False - return _contains_cftime_datetimes(var._data) - - def _contains_cftime_datetimes(array: Any) -> bool: """Check if a array inside a Variable contains cftime.datetime objects""" if cftime is None: @@ -1794,6 +1787,13 @@ def _contains_cftime_datetimes(array: Any) -> bool: return False +def contains_cftime_datetimes(var: T_Variable) -> bool: + """Check if an xarray.Variable contains cftime.datetime objects""" + if cftime is None: + return False + return _contains_cftime_datetimes(var._data) + + def _contains_datetime_like_objects(var: T_Variable) -> bool: """Check if a variable contains datetime like objects (either np.datetime64, np.timedelta64, or cftime.datetime) From 74485fd9bba3d64955863b3b050ba074c1ba2fbe Mon Sep 17 00:00:00 2001 From: dcherian Date: Thu, 2 Mar 2023 12:31:59 -0700 Subject: [PATCH 11/17] Update tests. --- xarray/tests/__init__.py | 8 ++++---- xarray/tests/test_accessor_dt.py | 15 --------------- xarray/tests/test_coding_times.py | 27 +++++++++++++++++++++------ 3 files changed, 25 insertions(+), 25 deletions(-) diff --git a/xarray/tests/__init__.py b/xarray/tests/__init__.py index 98bb8077934..eda790694af 100644 --- a/xarray/tests/__init__.py +++ b/xarray/tests/__init__.py @@ -143,13 +143,13 @@ def __init__(self, array): self.array = array def __getitem__(self, key): - raise UnexpectedDataAccess("Tried accessing data") + raise UnexpectedDataAccess("Tried accessing data.") + def __array__(self): + raise UnexpectedDataAccess("Tried accessing data.") -class FirstElementAccessibleArray(utils.NDArrayMixin, ExplicitlyIndexed): - def __init__(self, array): - self.array = array +class FirstElementAccessibleArray(InaccessibleArray): def __getitem__(self, key): tuple_idxr = key.tuple if len(tuple_idxr) > 1: diff --git a/xarray/tests/test_accessor_dt.py b/xarray/tests/test_accessor_dt.py index 04e62d2c16e..ef91257c4d9 100644 --- a/xarray/tests/test_accessor_dt.py +++ b/xarray/tests/test_accessor_dt.py @@ -6,7 +6,6 @@ import xarray as xr from xarray.tests import ( - FirstElementAccessibleArray, assert_array_equal, assert_chunks_equal, assert_equal, @@ -668,17 +667,3 @@ def test_cftime_round_accessor( result = cftime_rounding_dataarray.dt.round(freq) assert_identical(result, expected) - - -@requires_cftime -def test_contains_cftime_lazy() -> None: - import cftime - - from xarray.core.common import _contains_cftime_datetimes - - times = np.array( - [cftime.DatetimeGregorian(1, 1, 2, 0), cftime.DatetimeGregorian(1, 1, 2, 0)], - dtype=object, - ) - array = FirstElementAccessibleArray(times) - assert _contains_cftime_datetimes(array) diff --git a/xarray/tests/test_coding_times.py b/xarray/tests/test_coding_times.py index 0746a949cc8..580de878fe6 100644 --- a/xarray/tests/test_coding_times.py +++ b/xarray/tests/test_coding_times.py @@ -31,6 +31,7 @@ from xarray.core.common import contains_cftime_datetimes from xarray.testing import assert_equal, assert_identical from xarray.tests import ( + FirstElementAccessibleArray, arm_xfail, assert_array_equal, assert_no_warnings, @@ -787,35 +788,35 @@ def times_3d(times): @requires_cftime def test_contains_cftime_datetimes_1d(data) -> None: - assert contains_cftime_datetimes(data.time) + assert contains_cftime_datetimes(data.time.variable) @requires_cftime @requires_dask def test_contains_cftime_datetimes_dask_1d(data) -> None: - assert contains_cftime_datetimes(data.time.chunk()) + assert contains_cftime_datetimes(data.time.variable.chunk()) @requires_cftime def test_contains_cftime_datetimes_3d(times_3d) -> None: - assert contains_cftime_datetimes(times_3d) + assert contains_cftime_datetimes(times_3d.variable) @requires_cftime @requires_dask def test_contains_cftime_datetimes_dask_3d(times_3d) -> None: - assert contains_cftime_datetimes(times_3d.chunk()) + assert contains_cftime_datetimes(times_3d.variable.chunk()) @pytest.mark.parametrize("non_cftime_data", [DataArray([]), DataArray([1, 2])]) def test_contains_cftime_datetimes_non_cftimes(non_cftime_data) -> None: - assert not contains_cftime_datetimes(non_cftime_data) + assert not contains_cftime_datetimes(non_cftime_data.variable) @requires_dask @pytest.mark.parametrize("non_cftime_data", [DataArray([]), DataArray([1, 2])]) def test_contains_cftime_datetimes_non_cftimes_dask(non_cftime_data) -> None: - assert not contains_cftime_datetimes(non_cftime_data.chunk()) + assert not contains_cftime_datetimes(non_cftime_data.variable.chunk()) @requires_cftime @@ -1176,3 +1177,17 @@ def test_scalar_unit() -> None: variable = Variable(("x", "y"), np.array([[0, 1], [2, 3]]), {"units": np.nan}) result = coding.times.CFDatetimeCoder().decode(variable) assert np.isnan(result.attrs["units"]) + + +@requires_cftime +def test_contains_cftime_lazy() -> None: + import cftime + + from xarray.core.common import _contains_cftime_datetimes + + times = np.array( + [cftime.DatetimeGregorian(1, 1, 2, 0), cftime.DatetimeGregorian(1, 1, 2, 0)], + dtype=object, + ) + array = FirstElementAccessibleArray(times) + assert _contains_cftime_datetimes(array) From 5d677f18b63051f524a09a85b3eacfaf8d028584 Mon Sep 17 00:00:00 2001 From: dcherian Date: Thu, 2 Mar 2023 12:32:45 -0700 Subject: [PATCH 12/17] address comment --- xarray/core/common.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/xarray/core/common.py b/xarray/core/common.py index ab33cb67adb..d980e622763 100644 --- a/xarray/core/common.py +++ b/xarray/core/common.py @@ -1789,8 +1789,6 @@ def _contains_cftime_datetimes(array: Any) -> bool: def contains_cftime_datetimes(var: T_Variable) -> bool: """Check if an xarray.Variable contains cftime.datetime objects""" - if cftime is None: - return False return _contains_cftime_datetimes(var._data) From b38799592d967fce470e42608c443c0ec57ac47e Mon Sep 17 00:00:00 2001 From: dcherian Date: Thu, 2 Mar 2023 13:37:07 -0700 Subject: [PATCH 13/17] Fix test --- xarray/tests/test_coding.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/xarray/tests/test_coding.py b/xarray/tests/test_coding.py index 5bf23819d87..f7579c4b488 100644 --- a/xarray/tests/test_coding.py +++ b/xarray/tests/test_coding.py @@ -65,13 +65,13 @@ def test_CFMaskCoder_missing_value() -> None: expected.attrs["missing_value"] = -9999 decoded = xr.decode_cf(expected.to_dataset()) - encoded, _ = xr.conventions.cf_encoder(decoded, decoded.attrs) + encoded, _ = xr.conventions.cf_encoder(decoded.variables, decoded.attrs) assert_equal(encoded["tmpk"], expected.variable) decoded.tmpk.encoding["_FillValue"] = -9940 with pytest.raises(ValueError): - encoded, _ = xr.conventions.cf_encoder(decoded, decoded.attrs) + encoded, _ = xr.conventions.cf_encoder(decoded.variables, decoded.attrs) @requires_dask From c0752452dce3e9faf59f86ce59348dd992ca9bb9 Mon Sep 17 00:00:00 2001 From: dcherian Date: Thu, 2 Mar 2023 13:38:33 -0700 Subject: [PATCH 14/17] Fix whats-new --- doc/whats-new.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index f9d0186d0a9..98ebc034fbb 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -42,6 +42,8 @@ Bug fixes - Fix matplotlib raising a UserWarning when plotting a scatter plot with an unfilled marker (:issue:`7313`, :pull:`7318`). By `Jimmy Westling `_. +- Improved performance in ``open_dataset`` for datasets with large object arrays (:issue:`7484`, :pull:`7494`). + By `Alex Goodman `_ and `Deepak Cherian `_. Documentation ~~~~~~~~~~~~~ @@ -99,8 +101,6 @@ Bug fixes - :py:func:`xarray.Dataset.to_zarr` now drops variable encodings that have been added by xarray during reading a dataset. (:issue:`7129`, :pull:`7500`). By `Hauke Schulz `_. -- Improved performance in ``open_dataset`` for datasets with large object arrays (:issue:`7484`, :pull:`7494`). - By `Alex Goodman `_. Documentation ~~~~~~~~~~~~~ From f9e7a2eb15abf1dac670144996dfeab87e651872 Mon Sep 17 00:00:00 2001 From: dcherian Date: Thu, 2 Mar 2023 15:16:29 -0700 Subject: [PATCH 15/17] Fix more tests --- xarray/coding/frequencies.py | 3 ++- xarray/tests/test_backends.py | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/xarray/coding/frequencies.py b/xarray/coding/frequencies.py index fef2f5a8319..4d24327aa2f 100644 --- a/xarray/coding/frequencies.py +++ b/xarray/coding/frequencies.py @@ -79,11 +79,12 @@ def infer_freq(index): If there are fewer than three values or the index is not 1D. """ from xarray.core.dataarray import DataArray + from xarray.core.variable import Variable if isinstance(index, (DataArray, pd.Series)): if index.ndim != 1: raise ValueError("'index' must be 1D") - elif not _contains_datetime_like_objects(DataArray(index)): + elif not _contains_datetime_like_objects(Variable("dim", index)): raise ValueError("'index' must contain datetime-like objects") dtype = np.asarray(index).dtype if dtype == "datetime64[ns]": diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 91daabd12d5..bc6b095fc4e 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -2560,7 +2560,7 @@ def test_open_zarr_use_cftime(self) -> None: ds_a = xr.open_zarr(store_target, **self.version_kwargs) assert_identical(ds, ds_a) ds_b = xr.open_zarr(store_target, use_cftime=True, **self.version_kwargs) - assert xr.coding.times.contains_cftime_datetimes(ds_b.time) + assert xr.coding.times.contains_cftime_datetimes(ds_b.time.variable) def test_write_read_select_write(self) -> None: # Test for https://github.com/pydata/xarray/issues/4084 From 6fe2c25956193504d400cd1cee328ea5e18cd6e9 Mon Sep 17 00:00:00 2001 From: dcherian Date: Thu, 2 Mar 2023 16:36:43 -0700 Subject: [PATCH 16/17] More fixes --- xarray/coding/calendar_ops.py | 6 +++--- xarray/coding/cftime_offsets.py | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/xarray/coding/calendar_ops.py b/xarray/coding/calendar_ops.py index 06f57757619..dc2f95b832e 100644 --- a/xarray/coding/calendar_ops.py +++ b/xarray/coding/calendar_ops.py @@ -147,7 +147,7 @@ def convert_calendar( from xarray.core.dataarray import DataArray time = obj[dim] - if not _contains_datetime_like_objects(time): + if not _contains_datetime_like_objects(time.variable): raise ValueError(f"Coordinate {dim} must contain datetime objects.") use_cftime = _should_cftime_be_used(time, calendar, use_cftime) @@ -319,8 +319,8 @@ def interp_calendar(source, target, dim="time"): target = DataArray(target, dims=(dim,), name=dim) if not _contains_datetime_like_objects( - source[dim] - ) or not _contains_datetime_like_objects(target): + source[dim].variable + ) or not _contains_datetime_like_objects(target.variable): raise ValueError( f"Both 'source.{dim}' and 'target' must contain datetime objects." ) diff --git a/xarray/coding/cftime_offsets.py b/xarray/coding/cftime_offsets.py index bc3e3545892..792724ecc79 100644 --- a/xarray/coding/cftime_offsets.py +++ b/xarray/coding/cftime_offsets.py @@ -1267,7 +1267,7 @@ def date_range_like(source, calendar, use_cftime=None): if not isinstance(source, (pd.DatetimeIndex, CFTimeIndex)) and ( isinstance(source, DataArray) and (source.ndim != 1) - or not _contains_datetime_like_objects(source) + or not _contains_datetime_like_objects(source.variable) ): raise ValueError( "'source' must be a 1D array of datetime objects for inferring its range." From 9c07dcbd9c18663fda547250f4e575b03d58bca9 Mon Sep 17 00:00:00 2001 From: dcherian Date: Fri, 3 Mar 2023 16:16:04 -0700 Subject: [PATCH 17/17] fix iris tests --- xarray/tests/test_dataarray.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/xarray/tests/test_dataarray.py b/xarray/tests/test_dataarray.py index 836d30b60b8..ed1abea5fbe 100644 --- a/xarray/tests/test_dataarray.py +++ b/xarray/tests/test_dataarray.py @@ -6219,7 +6219,7 @@ def test_to_and_from_iris(self) -> None: original_coord = original.coords[orginal_key] assert coord.var_name == original_coord.name assert_array_equal( - coord.points, CFDatetimeCoder().encode(original_coord).values + coord.points, CFDatetimeCoder().encode(original_coord.variable).values ) assert actual.coord_dims(coord) == original.get_axis_num( original.coords[coord.var_name].dims @@ -6295,7 +6295,7 @@ def test_to_and_from_iris_dask(self) -> None: original_coord = original.coords[orginal_key] assert coord.var_name == original_coord.name assert_array_equal( - coord.points, CFDatetimeCoder().encode(original_coord).values + coord.points, CFDatetimeCoder().encode(original_coord.variable).values ) assert actual.coord_dims(coord) == original.get_axis_num( original.coords[coord.var_name].dims