From 3fd79ac487bfc1ca7d6cd5132b257e0d578c9de0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Thu, 2 Jan 2025 17:26:40 +0100 Subject: [PATCH] time coding refactor (#9906) Co-authored-by: Spencer Clark Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- doc/whats-new.rst | 2 + xarray/coding/times.py | 129 +++++++++++++++++++++--------- xarray/core/pdcompat.py | 16 ++++ xarray/tests/test_coding_times.py | 4 +- 4 files changed, 109 insertions(+), 42 deletions(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 6a80ac429fc..427130d8432 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -77,6 +77,8 @@ Internal Changes By `Kai Mühlbauer `_. - Move ISO-8601 parser from coding.cftimeindex to coding.times to make it available there (prevents circular import), add capability to parse negative and/or five-digit years (:pull:`9899`). By `Kai Mühlbauer `_. +- Refactor of time coding to prepare for relaxing nanosecond restriction (:pull:`9906`). + By `Kai Mühlbauer `_. .. _whats-new.2024.11.0: diff --git a/xarray/coding/times.py b/xarray/coding/times.py index f111dcef5bf..74ba97af81b 100644 --- a/xarray/coding/times.py +++ b/xarray/coding/times.py @@ -24,7 +24,7 @@ from xarray.core.common import contains_cftime_datetimes, is_np_datetime_like from xarray.core.duck_array_ops import asarray, ravel, reshape from xarray.core.formatting import first_n_items, format_timestamp, last_item -from xarray.core.pdcompat import nanosecond_precision_timestamp +from xarray.core.pdcompat import nanosecond_precision_timestamp, timestamp_as_unit from xarray.core.utils import attempt_import, emit_user_level_warning from xarray.core.variable import Variable from xarray.namedarray.parallelcompat import T_ChunkedArray, get_chunked_array_type @@ -36,7 +36,11 @@ except ImportError: cftime = None -from xarray.core.types import CFCalendar, NPDatetimeUnitOptions, T_DuckArray +from xarray.core.types import ( + CFCalendar, + NPDatetimeUnitOptions, + T_DuckArray, +) T_Name = Union[Hashable, None] @@ -259,18 +263,26 @@ def _parse_iso8601(date_type, timestr): return default.replace(**replace), resolution -def _unpack_time_units_and_ref_date(units: str) -> tuple[str, pd.Timestamp]: +def _maybe_strip_tz_from_timestamp(date: pd.Timestamp) -> pd.Timestamp: + # If the ref_date Timestamp is timezone-aware, convert to UTC and + # make it timezone-naive (GH 2649). + if date.tz is not None: + return date.tz_convert("UTC").tz_convert(None) + return date + + +def _unpack_time_unit_and_ref_date( + units: str, +) -> tuple[NPDatetimeUnitOptions, pd.Timestamp]: # same us _unpack_netcdf_time_units but finalizes ref_date for # processing in encode_cf_datetime - time_units, _ref_date = _unpack_netcdf_time_units(units) + time_unit, _ref_date = _unpack_netcdf_time_units(units) + time_unit = _netcdf_to_numpy_timeunit(time_unit) # TODO: the strict enforcement of nanosecond precision Timestamps can be # relaxed when addressing GitHub issue #7493. ref_date = nanosecond_precision_timestamp(_ref_date) - # If the ref_date Timestamp is timezone-aware, convert to UTC and - # make it timezone-naive (GH 2649). - if ref_date.tz is not None: - ref_date = ref_date.tz_convert(None) - return time_units, ref_date + ref_date = _maybe_strip_tz_from_timestamp(ref_date) + return time_unit, ref_date def _decode_cf_datetime_dtype( @@ -317,6 +329,30 @@ def _decode_datetime_with_cftime( return np.array([], dtype=object) +def _check_date_for_units_since_refdate( + date, unit: str, ref_date: pd.Timestamp +) -> pd.Timestamp: + # check for out-of-bounds floats and raise + if date > np.iinfo("int64").max or date < np.iinfo("int64").min: + raise OutOfBoundsTimedelta( + f"Value {date} can't be represented as Datetime/Timedelta." + ) + delta = date * np.timedelta64(1, unit) + if not np.isnan(delta): + # this will raise on dtype overflow for integer dtypes + if date.dtype.kind in "u" and not np.int64(delta) == date: + raise OutOfBoundsTimedelta( + "DType overflow in Datetime/Timedelta calculation." + ) + # this will raise on overflow if ref_date + delta + # can't be represented in the current ref_date resolution + return timestamp_as_unit(ref_date + delta, ref_date.unit) + else: + # if date is exactly NaT (np.iinfo("int64").min) return NaT + # to make follow-up checks work + return pd.Timestamp("NaT") + + def _decode_datetime_with_pandas( flat_num_dates: np.ndarray, units: str, calendar: str ) -> np.ndarray: @@ -335,12 +371,8 @@ def _decode_datetime_with_pandas( elif flat_num_dates.dtype.kind == "u": flat_num_dates = flat_num_dates.astype(np.uint64) - time_units, ref_date_str = _unpack_netcdf_time_units(units) - time_units = _netcdf_to_numpy_timeunit(time_units) try: - # TODO: the strict enforcement of nanosecond precision Timestamps can be - # relaxed when addressing GitHub issue #7493. - ref_date = nanosecond_precision_timestamp(ref_date_str) + time_unit, ref_date = _unpack_time_unit_and_ref_date(units) except ValueError as err: # ValueError is raised by pd.Timestamp for non-ISO timestamp # strings, in which case we fall back to using cftime @@ -350,8 +382,12 @@ def _decode_datetime_with_pandas( warnings.filterwarnings("ignore", "invalid value encountered", RuntimeWarning) if flat_num_dates.size > 0: # avoid size 0 datetimes GH1329 - pd.to_timedelta(flat_num_dates.min(), time_units) + ref_date - pd.to_timedelta(flat_num_dates.max(), time_units) + ref_date + _check_date_for_units_since_refdate( + flat_num_dates.min(), time_unit, ref_date + ) + _check_date_for_units_since_refdate( + flat_num_dates.max(), time_unit, ref_date + ) # To avoid integer overflow when converting to nanosecond units for integer # dtypes smaller than np.int64 cast all integer and unsigned integer dtype @@ -364,20 +400,24 @@ def _decode_datetime_with_pandas( elif flat_num_dates.dtype.kind in "f": flat_num_dates = flat_num_dates.astype(np.float64) - # Cast input ordinals to integers of nanoseconds because pd.to_timedelta - # works much faster when dealing with integers (GH 1399). - # properly handle NaN/NaT to prevent casting NaN to int + # keep NaT/nan mask nan = np.isnan(flat_num_dates) | (flat_num_dates == np.iinfo(np.int64).min) - flat_num_dates = flat_num_dates * _NS_PER_TIME_DELTA[time_units] - flat_num_dates_ns_int = np.zeros_like(flat_num_dates, dtype=np.int64) - flat_num_dates_ns_int[nan] = np.iinfo(np.int64).min - flat_num_dates_ns_int[~nan] = flat_num_dates[~nan].astype(np.int64) + # in case we need to change the unit, we fix the numbers here + # this should be safe, as errors would have been raised above + ns_time_unit = _NS_PER_TIME_DELTA[time_unit] + ns_ref_date_unit = _NS_PER_TIME_DELTA[ref_date.unit] + if ns_time_unit > ns_ref_date_unit: + flat_num_dates *= np.int64(ns_time_unit / ns_ref_date_unit) + time_unit = ref_date.unit - # Use pd.to_timedelta to safely cast integer values to timedeltas, - # and add those to a Timestamp to safely produce a DatetimeIndex. This - # ensures that we do not encounter integer overflow at any point in the - # process without raising OutOfBoundsDatetime. - return (pd.to_timedelta(flat_num_dates_ns_int, "ns") + ref_date).values + # Cast input ordinals to integers and properly handle NaN/NaT + # to prevent casting NaN to int + flat_num_dates_int = np.zeros_like(flat_num_dates, dtype=np.int64) + flat_num_dates_int[nan] = np.iinfo(np.int64).min + flat_num_dates_int[~nan] = flat_num_dates[~nan].astype(np.int64) + + # cast to timedelta64[time_unit] and add to ref_date + return ref_date + flat_num_dates_int.astype(f"timedelta64[{time_unit}]") def decode_cf_datetime( @@ -409,11 +449,15 @@ def decode_cf_datetime( dates = _decode_datetime_with_cftime( flat_num_dates.astype(float), units, calendar ) - - if ( - dates[np.nanargmin(num_dates)].year < 1678 - or dates[np.nanargmax(num_dates)].year >= 2262 - ): + # retrieve cftype + dates_min = dates[np.nanargmin(num_dates)] + cftype = type(dates_min) + # "ns" borders + # between ['1677-09-21T00:12:43.145224193', '2262-04-11T23:47:16.854775807'] + lower = cftype(1677, 9, 21, 0, 12, 43, 145224) + upper = cftype(2262, 4, 11, 23, 47, 16, 854775) + + if dates_min < lower or dates[np.nanargmax(num_dates)] > upper: if _is_standard_calendar(calendar): warnings.warn( "Unable to decode time axis into full " @@ -833,8 +877,8 @@ def _eagerly_encode_cf_datetime( raise OutOfBoundsDatetime assert dates.dtype == "datetime64[ns]" - time_units, ref_date = _unpack_time_units_and_ref_date(units) - time_delta = _time_units_to_timedelta64(time_units) + time_unit, ref_date = _unpack_time_unit_and_ref_date(units) + time_delta = np.timedelta64(1, time_unit) # Wrap the dates in a DatetimeIndex to do the subtraction to ensure # an OverflowError is raised if the ref_date is too far away from @@ -843,16 +887,17 @@ def _eagerly_encode_cf_datetime( time_deltas = dates_as_index - ref_date # retrieve needed units to faithfully encode to int64 - needed_units, data_ref_date = _unpack_time_units_and_ref_date(data_units) + needed_unit, data_ref_date = _unpack_time_unit_and_ref_date(data_units) + needed_units = _numpy_to_netcdf_timeunit(needed_unit) if data_units != units: # this accounts for differences in the reference times ref_delta = abs(data_ref_date - ref_date).to_timedelta64() - data_delta = _time_units_to_timedelta64(needed_units) + data_delta = np.timedelta64(1, needed_unit) if (ref_delta % data_delta) > np.timedelta64(0, "ns"): needed_units = _infer_time_units_from_diff(ref_delta) # needed time delta to encode faithfully to int64 - needed_time_delta = _time_units_to_timedelta64(needed_units) + needed_time_delta = _unit_timedelta_numpy(needed_units) floor_division = np.issubdtype(dtype, np.integer) or dtype is None if time_delta > needed_time_delta: @@ -865,6 +910,7 @@ def _eagerly_encode_cf_datetime( f"Set encoding['dtype'] to floating point dtype to silence this warning." ) elif np.issubdtype(dtype, np.integer) and allow_units_modification: + floor_division = True new_units = f"{needed_units} since {format_timestamp(ref_date)}" emit_user_level_warning( f"Times can't be serialized faithfully to int64 with requested units {units!r}. " @@ -874,9 +920,12 @@ def _eagerly_encode_cf_datetime( ) units = new_units time_delta = needed_time_delta - floor_division = True - num = _division(time_deltas, time_delta, floor_division) + # get resolution of TimedeltaIndex and align time_delta + # todo: check, if this works in any case + num = _division( + time_deltas, time_delta.astype(f"=m8[{time_deltas.unit}]"), floor_division + ) num = reshape(num.values, dates.shape) except (OutOfBoundsDatetime, OverflowError, ValueError): diff --git a/xarray/core/pdcompat.py b/xarray/core/pdcompat.py index ae4febd6beb..4c54715e2a0 100644 --- a/xarray/core/pdcompat.py +++ b/xarray/core/pdcompat.py @@ -41,6 +41,8 @@ import pandas as pd from packaging.version import Version +from xarray.core.types import PDDatetimeUnitOptions + def count_not_none(*args) -> int: """Compute the number of non-None arguments. @@ -73,6 +75,20 @@ def __repr__(self) -> str: NoDefault = Literal[_NoDefault.no_default] # For typing following pandas +def timestamp_as_unit(date: pd.Timestamp, unit: PDDatetimeUnitOptions) -> pd.Timestamp: + """Convert the underlying int64 representation to the given unit. + + Compatibility function for pandas issue where "as_unit" is not defined + for pandas.Timestamp in pandas versions < 2.2. Can be removed minimum + pandas version is >= 2.2. + """ + if hasattr(date, "as_unit"): + date = date.as_unit(unit) + elif hasattr(date, "_as_unit"): + date = date._as_unit(unit) + return date + + def nanosecond_precision_timestamp(*args, **kwargs) -> pd.Timestamp: """Return a nanosecond-precision Timestamp object. diff --git a/xarray/tests/test_coding_times.py b/xarray/tests/test_coding_times.py index 9a51ca40d07..685767b71bb 100644 --- a/xarray/tests/test_coding_times.py +++ b/xarray/tests/test_coding_times.py @@ -167,8 +167,8 @@ def test_decode_cf_datetime_overflow() -> None: units = "days since 2000-01-01 00:00:00" # date after 2262 and before 1678 - days = (-117608, 95795) - expected = (datetime(1677, 12, 31), datetime(2262, 4, 12)) + days = (-117710, 95795) + expected = (datetime(1677, 9, 20), datetime(2262, 4, 12)) for i, day in enumerate(days): with warnings.catch_warnings():