Skip to content

Commit

Permalink
time coding refactor (#9906)
Browse files Browse the repository at this point in the history
Co-authored-by: Spencer Clark <spencerkclark@gmail.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
  • Loading branch information
3 people authored Jan 2, 2025
1 parent dc03b80 commit 3fd79ac
Show file tree
Hide file tree
Showing 4 changed files with 109 additions and 42 deletions.
2 changes: 2 additions & 0 deletions doc/whats-new.rst
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,8 @@ Internal Changes
By `Kai Mühlbauer <https://github.com/kmuehlbauer>`_.
- Move ISO-8601 parser from coding.cftimeindex to coding.times to make it available there (prevents circular import), add capability to parse negative and/or five-digit years (:pull:`9899`).
By `Kai Mühlbauer <https://github.com/kmuehlbauer>`_.
- Refactor of time coding to prepare for relaxing nanosecond restriction (:pull:`9906`).
By `Kai Mühlbauer <https://github.com/kmuehlbauer>`_.


.. _whats-new.2024.11.0:
Expand Down
129 changes: 89 additions & 40 deletions xarray/coding/times.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@
from xarray.core.common import contains_cftime_datetimes, is_np_datetime_like
from xarray.core.duck_array_ops import asarray, ravel, reshape
from xarray.core.formatting import first_n_items, format_timestamp, last_item
from xarray.core.pdcompat import nanosecond_precision_timestamp
from xarray.core.pdcompat import nanosecond_precision_timestamp, timestamp_as_unit
from xarray.core.utils import attempt_import, emit_user_level_warning
from xarray.core.variable import Variable
from xarray.namedarray.parallelcompat import T_ChunkedArray, get_chunked_array_type
Expand All @@ -36,7 +36,11 @@
except ImportError:
cftime = None

from xarray.core.types import CFCalendar, NPDatetimeUnitOptions, T_DuckArray
from xarray.core.types import (
CFCalendar,
NPDatetimeUnitOptions,
T_DuckArray,
)

T_Name = Union[Hashable, None]

Expand Down Expand Up @@ -259,18 +263,26 @@ def _parse_iso8601(date_type, timestr):
return default.replace(**replace), resolution


def _unpack_time_units_and_ref_date(units: str) -> tuple[str, pd.Timestamp]:
def _maybe_strip_tz_from_timestamp(date: pd.Timestamp) -> pd.Timestamp:
# If the ref_date Timestamp is timezone-aware, convert to UTC and
# make it timezone-naive (GH 2649).
if date.tz is not None:
return date.tz_convert("UTC").tz_convert(None)
return date


def _unpack_time_unit_and_ref_date(
    units: str,
) -> tuple[NPDatetimeUnitOptions, pd.Timestamp]:
    """Split a CF ``"<unit> since <date>"`` string into a numpy time unit
    and a finalized reference Timestamp.

    Same as _unpack_netcdf_time_units, but additionally converts the netCDF
    unit name to its numpy equivalent and normalizes the reference date
    (tz-naive, UTC) for processing in encode_cf_datetime.
    """
    time_unit, _ref_date = _unpack_netcdf_time_units(units)
    time_unit = _netcdf_to_numpy_timeunit(time_unit)
    # TODO: the strict enforcement of nanosecond precision Timestamps can be
    # relaxed when addressing GitHub issue #7493.
    ref_date = nanosecond_precision_timestamp(_ref_date)
    ref_date = _maybe_strip_tz_from_timestamp(ref_date)
    return time_unit, ref_date


def _decode_cf_datetime_dtype(
Expand Down Expand Up @@ -317,6 +329,30 @@ def _decode_datetime_with_cftime(
return np.array([], dtype=object)


def _check_date_for_units_since_refdate(
date, unit: str, ref_date: pd.Timestamp
) -> pd.Timestamp:
# check for out-of-bounds floats and raise
if date > np.iinfo("int64").max or date < np.iinfo("int64").min:
raise OutOfBoundsTimedelta(
f"Value {date} can't be represented as Datetime/Timedelta."
)
delta = date * np.timedelta64(1, unit)
if not np.isnan(delta):
# this will raise on dtype overflow for integer dtypes
if date.dtype.kind in "u" and not np.int64(delta) == date:
raise OutOfBoundsTimedelta(
"DType overflow in Datetime/Timedelta calculation."
)
# this will raise on overflow if ref_date + delta
# can't be represented in the current ref_date resolution
return timestamp_as_unit(ref_date + delta, ref_date.unit)
else:
# if date is exactly NaT (np.iinfo("int64").min) return NaT
# to make follow-up checks work
return pd.Timestamp("NaT")


def _decode_datetime_with_pandas(
flat_num_dates: np.ndarray, units: str, calendar: str
) -> np.ndarray:
Expand All @@ -335,12 +371,8 @@ def _decode_datetime_with_pandas(
elif flat_num_dates.dtype.kind == "u":
flat_num_dates = flat_num_dates.astype(np.uint64)

time_units, ref_date_str = _unpack_netcdf_time_units(units)
time_units = _netcdf_to_numpy_timeunit(time_units)
try:
# TODO: the strict enforcement of nanosecond precision Timestamps can be
# relaxed when addressing GitHub issue #7493.
ref_date = nanosecond_precision_timestamp(ref_date_str)
time_unit, ref_date = _unpack_time_unit_and_ref_date(units)
except ValueError as err:
# ValueError is raised by pd.Timestamp for non-ISO timestamp
# strings, in which case we fall back to using cftime
Expand All @@ -350,8 +382,12 @@ def _decode_datetime_with_pandas(
warnings.filterwarnings("ignore", "invalid value encountered", RuntimeWarning)
if flat_num_dates.size > 0:
# avoid size 0 datetimes GH1329
pd.to_timedelta(flat_num_dates.min(), time_units) + ref_date
pd.to_timedelta(flat_num_dates.max(), time_units) + ref_date
_check_date_for_units_since_refdate(
flat_num_dates.min(), time_unit, ref_date
)
_check_date_for_units_since_refdate(
flat_num_dates.max(), time_unit, ref_date
)

# To avoid integer overflow when converting to nanosecond units for integer
# dtypes smaller than np.int64 cast all integer and unsigned integer dtype
Expand All @@ -364,20 +400,24 @@ def _decode_datetime_with_pandas(
elif flat_num_dates.dtype.kind in "f":
flat_num_dates = flat_num_dates.astype(np.float64)

# Cast input ordinals to integers of nanoseconds because pd.to_timedelta
# works much faster when dealing with integers (GH 1399).
# properly handle NaN/NaT to prevent casting NaN to int
# keep NaT/nan mask
nan = np.isnan(flat_num_dates) | (flat_num_dates == np.iinfo(np.int64).min)
flat_num_dates = flat_num_dates * _NS_PER_TIME_DELTA[time_units]
flat_num_dates_ns_int = np.zeros_like(flat_num_dates, dtype=np.int64)
flat_num_dates_ns_int[nan] = np.iinfo(np.int64).min
flat_num_dates_ns_int[~nan] = flat_num_dates[~nan].astype(np.int64)
# in case we need to change the unit, we fix the numbers here
# this should be safe, as errors would have been raised above
ns_time_unit = _NS_PER_TIME_DELTA[time_unit]
ns_ref_date_unit = _NS_PER_TIME_DELTA[ref_date.unit]
if ns_time_unit > ns_ref_date_unit:
flat_num_dates *= np.int64(ns_time_unit / ns_ref_date_unit)
time_unit = ref_date.unit

# Use pd.to_timedelta to safely cast integer values to timedeltas,
# and add those to a Timestamp to safely produce a DatetimeIndex. This
# ensures that we do not encounter integer overflow at any point in the
# process without raising OutOfBoundsDatetime.
return (pd.to_timedelta(flat_num_dates_ns_int, "ns") + ref_date).values
# Cast input ordinals to integers and properly handle NaN/NaT
# to prevent casting NaN to int
flat_num_dates_int = np.zeros_like(flat_num_dates, dtype=np.int64)
flat_num_dates_int[nan] = np.iinfo(np.int64).min
flat_num_dates_int[~nan] = flat_num_dates[~nan].astype(np.int64)

# cast to timedelta64[time_unit] and add to ref_date
return ref_date + flat_num_dates_int.astype(f"timedelta64[{time_unit}]")


def decode_cf_datetime(
Expand Down Expand Up @@ -409,11 +449,15 @@ def decode_cf_datetime(
dates = _decode_datetime_with_cftime(
flat_num_dates.astype(float), units, calendar
)

if (
dates[np.nanargmin(num_dates)].year < 1678
or dates[np.nanargmax(num_dates)].year >= 2262
):
# retrieve cftype
dates_min = dates[np.nanargmin(num_dates)]
cftype = type(dates_min)
# "ns" borders
# between ['1677-09-21T00:12:43.145224193', '2262-04-11T23:47:16.854775807']
lower = cftype(1677, 9, 21, 0, 12, 43, 145224)
upper = cftype(2262, 4, 11, 23, 47, 16, 854775)

if dates_min < lower or dates[np.nanargmax(num_dates)] > upper:
if _is_standard_calendar(calendar):
warnings.warn(
"Unable to decode time axis into full "
Expand Down Expand Up @@ -833,8 +877,8 @@ def _eagerly_encode_cf_datetime(
raise OutOfBoundsDatetime
assert dates.dtype == "datetime64[ns]"

time_units, ref_date = _unpack_time_units_and_ref_date(units)
time_delta = _time_units_to_timedelta64(time_units)
time_unit, ref_date = _unpack_time_unit_and_ref_date(units)
time_delta = np.timedelta64(1, time_unit)

# Wrap the dates in a DatetimeIndex to do the subtraction to ensure
# an OverflowError is raised if the ref_date is too far away from
Expand All @@ -843,16 +887,17 @@ def _eagerly_encode_cf_datetime(
time_deltas = dates_as_index - ref_date

# retrieve needed units to faithfully encode to int64
needed_units, data_ref_date = _unpack_time_units_and_ref_date(data_units)
needed_unit, data_ref_date = _unpack_time_unit_and_ref_date(data_units)
needed_units = _numpy_to_netcdf_timeunit(needed_unit)
if data_units != units:
# this accounts for differences in the reference times
ref_delta = abs(data_ref_date - ref_date).to_timedelta64()
data_delta = _time_units_to_timedelta64(needed_units)
data_delta = np.timedelta64(1, needed_unit)
if (ref_delta % data_delta) > np.timedelta64(0, "ns"):
needed_units = _infer_time_units_from_diff(ref_delta)

# needed time delta to encode faithfully to int64
needed_time_delta = _time_units_to_timedelta64(needed_units)
needed_time_delta = _unit_timedelta_numpy(needed_units)

floor_division = np.issubdtype(dtype, np.integer) or dtype is None
if time_delta > needed_time_delta:
Expand All @@ -865,6 +910,7 @@ def _eagerly_encode_cf_datetime(
f"Set encoding['dtype'] to floating point dtype to silence this warning."
)
elif np.issubdtype(dtype, np.integer) and allow_units_modification:
floor_division = True
new_units = f"{needed_units} since {format_timestamp(ref_date)}"
emit_user_level_warning(
f"Times can't be serialized faithfully to int64 with requested units {units!r}. "
Expand All @@ -874,9 +920,12 @@ def _eagerly_encode_cf_datetime(
)
units = new_units
time_delta = needed_time_delta
floor_division = True

num = _division(time_deltas, time_delta, floor_division)
# get resolution of TimedeltaIndex and align time_delta
# todo: check, if this works in any case
num = _division(
time_deltas, time_delta.astype(f"=m8[{time_deltas.unit}]"), floor_division
)
num = reshape(num.values, dates.shape)

except (OutOfBoundsDatetime, OverflowError, ValueError):
Expand Down
16 changes: 16 additions & 0 deletions xarray/core/pdcompat.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,8 @@
import pandas as pd
from packaging.version import Version

from xarray.core.types import PDDatetimeUnitOptions


def count_not_none(*args) -> int:
"""Compute the number of non-None arguments.
Expand Down Expand Up @@ -73,6 +75,20 @@ def __repr__(self) -> str:
NoDefault = Literal[_NoDefault.no_default] # For typing following pandas


def timestamp_as_unit(date: pd.Timestamp, unit: PDDatetimeUnitOptions) -> pd.Timestamp:
    """Convert the underlying int64 representation of a Timestamp to the
    given unit.

    Compatibility function for the pandas issue where ``as_unit`` is not
    defined for pandas.Timestamp in pandas versions < 2.2. Can be removed
    when the minimum supported pandas version is >= 2.2.
    """
    if hasattr(date, "as_unit"):
        date = date.as_unit(unit)
    elif hasattr(date, "_as_unit"):
        # pandas < 2.2 exposes the conversion only via this private method
        date = date._as_unit(unit)
    return date


def nanosecond_precision_timestamp(*args, **kwargs) -> pd.Timestamp:
"""Return a nanosecond-precision Timestamp object.
Expand Down
4 changes: 2 additions & 2 deletions xarray/tests/test_coding_times.py
Original file line number Diff line number Diff line change
Expand Up @@ -167,8 +167,8 @@ def test_decode_cf_datetime_overflow() -> None:
units = "days since 2000-01-01 00:00:00"

# date after 2262 and before 1678
days = (-117608, 95795)
expected = (datetime(1677, 12, 31), datetime(2262, 4, 12))
days = (-117710, 95795)
expected = (datetime(1677, 9, 20), datetime(2262, 4, 12))

for i, day in enumerate(days):
with warnings.catch_warnings():
Expand Down

0 comments on commit 3fd79ac

Please sign in to comment.