Skip to content

Commit

Permalink
time coding refactor (#9906)
Browse files Browse the repository at this point in the history
Co-authored-by: Spencer Clark <spencerkclark@gmail.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
  • Loading branch information
3 people authored Jan 2, 2025
1 parent dc03b80 commit 3fd79ac
Show file tree
Hide file tree
Showing 4 changed files with 109 additions and 42 deletions.
2 changes: 2 additions & 0 deletions doc/whats-new.rst
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,8 @@ Internal Changes
By `Kai Mühlbauer <https://github.com/kmuehlbauer>`_.
- Move ISO-8601 parser from coding.cftimeindex to coding.times to make it available there (prevents circular import), add capability to parse negative and/or five-digit years (:pull:`9899`).
By `Kai Mühlbauer <https://github.com/kmuehlbauer>`_.
- Refactor of time coding to prepare for relaxing nanosecond restriction (:pull:`9906`).
By `Kai Mühlbauer <https://github.com/kmuehlbauer>`_.


.. _whats-new.2024.11.0:
Expand Down
129 changes: 89 additions & 40 deletions xarray/coding/times.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@
from xarray.core.common import contains_cftime_datetimes, is_np_datetime_like
from xarray.core.duck_array_ops import asarray, ravel, reshape
from xarray.core.formatting import first_n_items, format_timestamp, last_item
from xarray.core.pdcompat import nanosecond_precision_timestamp
from xarray.core.pdcompat import nanosecond_precision_timestamp, timestamp_as_unit
from xarray.core.utils import attempt_import, emit_user_level_warning
from xarray.core.variable import Variable
from xarray.namedarray.parallelcompat import T_ChunkedArray, get_chunked_array_type
Expand All @@ -36,7 +36,11 @@
except ImportError:
cftime = None

from xarray.core.types import CFCalendar, NPDatetimeUnitOptions, T_DuckArray
from xarray.core.types import (
CFCalendar,
NPDatetimeUnitOptions,
T_DuckArray,
)

T_Name = Union[Hashable, None]

Expand Down Expand Up @@ -259,18 +263,26 @@ def _parse_iso8601(date_type, timestr):
return default.replace(**replace), resolution


def _unpack_time_units_and_ref_date(units: str) -> tuple[str, pd.Timestamp]:
def _maybe_strip_tz_from_timestamp(date: pd.Timestamp) -> pd.Timestamp:
# If the ref_date Timestamp is timezone-aware, convert to UTC and
# make it timezone-naive (GH 2649).
if date.tz is not None:
return date.tz_convert("UTC").tz_convert(None)
return date


def _unpack_time_unit_and_ref_date(
    units: str,
) -> tuple[NPDatetimeUnitOptions, pd.Timestamp]:
    """Split a CF ``"<unit> since <date>"`` string into a numpy time unit
    and a finalized reference Timestamp.

    Same as _unpack_netcdf_time_units, but additionally converts the netCDF
    unit name to its numpy equivalent and normalizes the reference date
    (tz-naive, UTC) for processing in encode_cf_datetime.
    """
    time_unit, _ref_date = _unpack_netcdf_time_units(units)
    time_unit = _netcdf_to_numpy_timeunit(time_unit)
    # TODO: the strict enforcement of nanosecond precision Timestamps can be
    # relaxed when addressing GitHub issue #7493.
    ref_date = nanosecond_precision_timestamp(_ref_date)
    ref_date = _maybe_strip_tz_from_timestamp(ref_date)
    return time_unit, ref_date


def _decode_cf_datetime_dtype(
Expand Down Expand Up @@ -317,6 +329,30 @@ def _decode_datetime_with_cftime(
return np.array([], dtype=object)


def _check_date_for_units_since_refdate(
date, unit: str, ref_date: pd.Timestamp
) -> pd.Timestamp:
# check for out-of-bounds floats and raise
if date > np.iinfo("int64").max or date < np.iinfo("int64").min:
raise OutOfBoundsTimedelta(
f"Value {date} can't be represented as Datetime/Timedelta."
)
delta = date * np.timedelta64(1, unit)
if not np.isnan(delta):
# this will raise on dtype overflow for integer dtypes
if date.dtype.kind in "u" and not np.int64(delta) == date:
raise OutOfBoundsTimedelta(
"DType overflow in Datetime/Timedelta calculation."
)
# this will raise on overflow if ref_date + delta
# can't be represented in the current ref_date resolution
return timestamp_as_unit(ref_date + delta, ref_date.unit)
else:
# if date is exactly NaT (np.iinfo("int64").min) return NaT
# to make follow-up checks work
return pd.Timestamp("NaT")


def _decode_datetime_with_pandas(
flat_num_dates: np.ndarray, units: str, calendar: str
) -> np.ndarray:
Expand All @@ -335,12 +371,8 @@ def _decode_datetime_with_pandas(
elif flat_num_dates.dtype.kind == "u":
flat_num_dates = flat_num_dates.astype(np.uint64)

time_units, ref_date_str = _unpack_netcdf_time_units(units)
time_units = _netcdf_to_numpy_timeunit(time_units)
try:
# TODO: the strict enforcement of nanosecond precision Timestamps can be
# relaxed when addressing GitHub issue #7493.
ref_date = nanosecond_precision_timestamp(ref_date_str)
time_unit, ref_date = _unpack_time_unit_and_ref_date(units)
except ValueError as err:
# ValueError is raised by pd.Timestamp for non-ISO timestamp
# strings, in which case we fall back to using cftime
Expand All @@ -350,8 +382,12 @@ def _decode_datetime_with_pandas(
warnings.filterwarnings("ignore", "invalid value encountered", RuntimeWarning)
if flat_num_dates.size > 0:
# avoid size 0 datetimes GH1329
pd.to_timedelta(flat_num_dates.min(), time_units) + ref_date
pd.to_timedelta(flat_num_dates.max(), time_units) + ref_date
_check_date_for_units_since_refdate(
flat_num_dates.min(), time_unit, ref_date
)
_check_date_for_units_since_refdate(
flat_num_dates.max(), time_unit, ref_date
)

# To avoid integer overflow when converting to nanosecond units for integer
# dtypes smaller than np.int64 cast all integer and unsigned integer dtype
Expand All @@ -364,20 +400,24 @@ def _decode_datetime_with_pandas(
elif flat_num_dates.dtype.kind in "f":
flat_num_dates = flat_num_dates.astype(np.float64)

# Cast input ordinals to integers of nanoseconds because pd.to_timedelta
# works much faster when dealing with integers (GH 1399).
# properly handle NaN/NaT to prevent casting NaN to int
# keep NaT/nan mask
nan = np.isnan(flat_num_dates) | (flat_num_dates == np.iinfo(np.int64).min)
flat_num_dates = flat_num_dates * _NS_PER_TIME_DELTA[time_units]
flat_num_dates_ns_int = np.zeros_like(flat_num_dates, dtype=np.int64)
flat_num_dates_ns_int[nan] = np.iinfo(np.int64).min
flat_num_dates_ns_int[~nan] = flat_num_dates[~nan].astype(np.int64)
# in case we need to change the unit, we fix the numbers here
# this should be safe, as errors would have been raised above
ns_time_unit = _NS_PER_TIME_DELTA[time_unit]
ns_ref_date_unit = _NS_PER_TIME_DELTA[ref_date.unit]
if ns_time_unit > ns_ref_date_unit:
flat_num_dates *= np.int64(ns_time_unit / ns_ref_date_unit)
time_unit = ref_date.unit

# Use pd.to_timedelta to safely cast integer values to timedeltas,
# and add those to a Timestamp to safely produce a DatetimeIndex. This
# ensures that we do not encounter integer overflow at any point in the
# process without raising OutOfBoundsDatetime.
return (pd.to_timedelta(flat_num_dates_ns_int, "ns") + ref_date).values
# Cast input ordinals to integers and properly handle NaN/NaT
# to prevent casting NaN to int
flat_num_dates_int = np.zeros_like(flat_num_dates, dtype=np.int64)
flat_num_dates_int[nan] = np.iinfo(np.int64).min
flat_num_dates_int[~nan] = flat_num_dates[~nan].astype(np.int64)

# cast to timedelta64[time_unit] and add to ref_date
return ref_date + flat_num_dates_int.astype(f"timedelta64[{time_unit}]")


def decode_cf_datetime(
Expand Down Expand Up @@ -409,11 +449,15 @@ def decode_cf_datetime(
dates = _decode_datetime_with_cftime(
flat_num_dates.astype(float), units, calendar
)

if (
dates[np.nanargmin(num_dates)].year < 1678
or dates[np.nanargmax(num_dates)].year >= 2262
):
# retrieve cftype
dates_min = dates[np.nanargmin(num_dates)]
cftype = type(dates_min)
# "ns" borders
# between ['1677-09-21T00:12:43.145224193', '2262-04-11T23:47:16.854775807']
lower = cftype(1677, 9, 21, 0, 12, 43, 145224)
upper = cftype(2262, 4, 11, 23, 47, 16, 854775)

if dates_min < lower or dates[np.nanargmax(num_dates)] > upper:
if _is_standard_calendar(calendar):
warnings.warn(
"Unable to decode time axis into full "
Expand Down Expand Up @@ -833,8 +877,8 @@ def _eagerly_encode_cf_datetime(
raise OutOfBoundsDatetime
assert dates.dtype == "datetime64[ns]"

time_units, ref_date = _unpack_time_units_and_ref_date(units)
time_delta = _time_units_to_timedelta64(time_units)
time_unit, ref_date = _unpack_time_unit_and_ref_date(units)
time_delta = np.timedelta64(1, time_unit)

# Wrap the dates in a DatetimeIndex to do the subtraction to ensure
# an OverflowError is raised if the ref_date is too far away from
Expand All @@ -843,16 +887,17 @@ def _eagerly_encode_cf_datetime(
time_deltas = dates_as_index - ref_date

# retrieve needed units to faithfully encode to int64
needed_units, data_ref_date = _unpack_time_units_and_ref_date(data_units)
needed_unit, data_ref_date = _unpack_time_unit_and_ref_date(data_units)
needed_units = _numpy_to_netcdf_timeunit(needed_unit)
if data_units != units:
# this accounts for differences in the reference times
ref_delta = abs(data_ref_date - ref_date).to_timedelta64()
data_delta = _time_units_to_timedelta64(needed_units)
data_delta = np.timedelta64(1, needed_unit)
if (ref_delta % data_delta) > np.timedelta64(0, "ns"):
needed_units = _infer_time_units_from_diff(ref_delta)

# needed time delta to encode faithfully to int64
needed_time_delta = _time_units_to_timedelta64(needed_units)
needed_time_delta = _unit_timedelta_numpy(needed_units)

floor_division = np.issubdtype(dtype, np.integer) or dtype is None
if time_delta > needed_time_delta:
Expand All @@ -865,6 +910,7 @@ def _eagerly_encode_cf_datetime(
f"Set encoding['dtype'] to floating point dtype to silence this warning."
)
elif np.issubdtype(dtype, np.integer) and allow_units_modification:
floor_division = True
new_units = f"{needed_units} since {format_timestamp(ref_date)}"
emit_user_level_warning(
f"Times can't be serialized faithfully to int64 with requested units {units!r}. "
Expand All @@ -874,9 +920,12 @@ def _eagerly_encode_cf_datetime(
)
units = new_units
time_delta = needed_time_delta
floor_division = True

num = _division(time_deltas, time_delta, floor_division)
# get resolution of TimedeltaIndex and align time_delta
# todo: check, if this works in any case
num = _division(
time_deltas, time_delta.astype(f"=m8[{time_deltas.unit}]"), floor_division
)
num = reshape(num.values, dates.shape)

except (OutOfBoundsDatetime, OverflowError, ValueError):
Expand Down
16 changes: 16 additions & 0 deletions xarray/core/pdcompat.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,8 @@
import pandas as pd
from packaging.version import Version

from xarray.core.types import PDDatetimeUnitOptions


def count_not_none(*args) -> int:
"""Compute the number of non-None arguments.
Expand Down Expand Up @@ -73,6 +75,20 @@ def __repr__(self) -> str:
NoDefault = Literal[_NoDefault.no_default] # For typing following pandas


def timestamp_as_unit(date: pd.Timestamp, unit: PDDatetimeUnitOptions) -> pd.Timestamp:
    """Convert the underlying int64 representation of a Timestamp to the
    given unit.

    Compatibility function for the pandas issue where ``as_unit`` is not
    defined for pandas.Timestamp in pandas versions < 2.2. Can be removed
    when the minimum supported pandas version is >= 2.2.
    """
    if hasattr(date, "as_unit"):
        date = date.as_unit(unit)
    elif hasattr(date, "_as_unit"):
        # pandas < 2.2 exposes the conversion only via this private method
        date = date._as_unit(unit)
    return date


def nanosecond_precision_timestamp(*args, **kwargs) -> pd.Timestamp:
"""Return a nanosecond-precision Timestamp object.
Expand Down
4 changes: 2 additions & 2 deletions xarray/tests/test_coding_times.py
Original file line number Diff line number Diff line change
Expand Up @@ -167,8 +167,8 @@ def test_decode_cf_datetime_overflow() -> None:
units = "days since 2000-01-01 00:00:00"

# date after 2262 and before 1678
days = (-117608, 95795)
expected = (datetime(1677, 12, 31), datetime(2262, 4, 12))
days = (-117710, 95795)
expected = (datetime(1677, 9, 20), datetime(2262, 4, 12))

for i, day in enumerate(days):
with warnings.catch_warnings():
Expand Down

0 comments on commit 3fd79ac

Please sign in to comment.