Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[POC] CLN: use ExtensionBlock for datetime tz data #27072

Closed
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
243 changes: 10 additions & 233 deletions pandas/core/internals/blocks.py
Original file line number Diff line number Diff line change
Expand Up @@ -1913,6 +1913,16 @@ def external_values(self, dtype=None):
return self.values.astype(object)


class DatetimeTZBlock(ExtensionBlock):
    """
    Extension-backed block for tz-aware datetime data.

    Kept as a dedicated subclass so that `.values` stays
    backwards-compatible for tz-aware data.
    """
    # NOTE(review): the flag below is commented out in this proof of
    # concept -- confirm whether it should be set before merging.
    # is_datetimetz = True

    def external_values(self, dtype=None):
        """
        Return the ``_data`` attribute of the block's values.

        Parameters
        ----------
        dtype : optional
            Accepted but not used by this implementation.
        """
        tz_values = self.values
        return tz_values._data


class NumericBlock(Block):
__slots__ = ()
is_numeric = True
Expand Down Expand Up @@ -2196,239 +2206,6 @@ def external_values(self):
return np.asarray(self.values.astype('datetime64[ns]', copy=False))


class DatetimeTZBlock(ExtensionBlock, DatetimeBlock):
    """ implement a datetime64 block with a tz attribute """
    __slots__ = ()
    is_datetimetz = True
    is_extension = True

    @property
    def _holder(self):
        """Array class used to hold this block's values."""
        return DatetimeArray

    def _maybe_coerce_values(self, values):
        """Input validation for values passed to __init__. Ensure that
        we have datetime64TZ, coercing if necessary.

        Parameters
        ----------
        values : array-like
            Must be convertible to datetime64

        Returns
        -------
        values : DatetimeArray

        Raises
        ------
        ValueError
            If the (possibly coerced) values have no timezone.
        """
        if not isinstance(values, self._holder):
            values = self._holder(values)

        if values.tz is None:
            raise ValueError("cannot create a DatetimeTZBlock without a tz")

        return values

    @property
    def is_view(self):
        """ return a boolean if I am possibly a view """
        # check the ndarray values of the DatetimeIndex values
        return self.values._data.base is not None

    def copy(self, deep=True):
        """
        Copy constructor.

        Parameters
        ----------
        deep : bool, default True
            When True the values are copied with ``copy(deep=True)``;
            otherwise the new block wraps the same values object.

        Returns
        -------
        A new block built with ``make_block_same_class``.
        """
        values = self.values
        if deep:
            values = values.copy(deep=True)
        return self.make_block_same_class(values)

    def get_values(self, dtype=None):
        """
        Returns an ndarray of values.

        Parameters
        ----------
        dtype : np.dtype
            Only `object`-like dtypes are respected here (not sure
            why).

        Returns
        -------
        values : ndarray
            When ``dtype=object``, then an object-dtype ndarray of
            boxed values is returned. Otherwise, an M8[ns] ndarray
            is returned.

            DatetimeArray is always 1-d. ``get_values`` will reshape
            the return value to be the same dimensionality as the
            block.
        """
        values = self.values
        if is_object_dtype(dtype):
            values = values._box_values(values._data)

        values = np.asarray(values)

        if self.ndim == 2:
            # Ensure that our shape is correct for DataFrame.
            # ExtensionArrays are always 1-D, even in a DataFrame when
            # the analogous NumPy-backed column would be a 2-D ndarray.
            values = values.reshape(1, -1)
        return values

    def to_dense(self):
        """Return the values as an ndarray of ``_NS_DTYPE``."""
        # we request M8[ns] dtype here, even though it discards tzinfo,
        # as lots of code (e.g. anything using values_from_object)
        # expects that behavior.
        return np.asarray(self.values, dtype=_NS_DTYPE)

    def _slice(self, slicer):
        """
        Return a slice of my values.

        Parameters
        ----------
        slicer : tuple or any indexer accepted by the values
            A tuple is unpacked as ``(col, loc)``; ``col`` must be a null
            slice or ``0`` because the block holds a single item.

        Raises
        ------
        IndexError
            If a tuple slicer selects a column other than the only one.
        """
        if isinstance(slicer, tuple):
            col, loc = slicer
            if not com.is_null_slice(col) and col != 0:
                raise IndexError("{0} only contains one item".format(self))
            return self.values[loc]
        return self.values[slicer]

    def _try_coerce_args(self, values, other):
        """
        localize and return i8 for the values

        Parameters
        ----------
        values : ndarray-like
        other : ndarray-like or scalar

        Returns
        -------
        base-type values, base-type other

        Raises
        ------
        TypeError
            If ``other`` is a bool or of an unsupported type.
        ValueError
            If ``other`` is tz-naive or its timezone differs from the
            block's timezone.
        """
        # asi8 is a view, needs copy
        values = _block_shape(values.view("i8"), ndim=self.ndim)

        if isinstance(other, ABCSeries):
            other = self._holder(other)

        if isinstance(other, bool):
            raise TypeError
        elif is_datetime64_dtype(other):
            # add the tz back
            other = self._holder(other, dtype=self.dtype)

        elif is_null_datetimelike(other):
            other = tslibs.iNaT
        elif isinstance(other, self._holder):
            if other.tz != self.values.tz:
                raise ValueError("incompatible or non tz-aware value")
            other = _block_shape(other.asi8, ndim=self.ndim)
        elif isinstance(other, (np.datetime64, datetime, date)):
            other = tslibs.Timestamp(other)
            tz = getattr(other, 'tz', None)

            # test we can have an equal time zone
            if tz is None or str(tz) != str(self.values.tz):
                raise ValueError("incompatible or non tz-aware value")
            other = other.value
        else:
            raise TypeError(other)

        return values, other

    def _try_coerce_result(self, result):
        """
        Reverse of ``_try_coerce_args``.

        Integer/float ndarrays are cast to ``M8[ns]``; integer, float and
        ``np.datetime64`` scalars are boxed with ``_box_func``. Any ndarray
        result is flattened to 1-d and wrapped with
        ``self._holder._simple_new`` using the block's dtype.
        """
        if isinstance(result, np.ndarray):
            if result.dtype.kind in ['i', 'f']:
                result = result.astype('M8[ns]')

        # ``np.float`` was only an alias of the builtin ``float`` and no
        # longer exists in recent NumPy releases; use the builtin directly.
        elif isinstance(result, (np.integer, float, np.datetime64)):
            result = self._box_func(result)

        if isinstance(result, np.ndarray):
            # allow passing of > 1dim if its trivial

            if result.ndim > 1:
                result = result.reshape(np.prod(result.shape))
            # GH#24096 new values invalidates a frequency
            result = self._holder._simple_new(result, freq=None,
                                              dtype=self.values.dtype)

        return result

    @property
    def _box_func(self):
        """Callable boxing a scalar into a Timestamp with the block's tz."""
        return lambda x: tslibs.Timestamp(x, tz=self.dtype.tz)

    def diff(self, n, axis=0):
        """1st discrete difference

        Parameters
        ----------
        n : int, number of periods to diff
        axis : int, axis to diff upon. default 0

        Returns
        -------
        A list with a new TimeDeltaBlock.

        Raises
        ------
        NotImplementedError
            If ``axis == 0``.

        Notes
        -----
        The arguments here are mimicking shift so they are called correctly
        by apply.
        """
        if axis == 0:
            # Cannot currently calculate diff across multiple blocks since this
            # function is invoked via apply
            raise NotImplementedError
        new_values = (self.values - self.shift(n, axis=axis)[0].values).asi8

        # Reshape the new_values like how algos.diff does for timedelta data
        new_values = new_values.reshape(1, len(new_values))
        new_values = new_values.astype('timedelta64[ns]')
        return [TimeDeltaBlock(new_values, placement=self.mgr_locs.indexer)]

    def concat_same_type(self, to_concat, placement=None):
        """
        Concatenate a list of blocks.

        When the blocks do not all share one dtype the result is an
        ObjectBlock; otherwise the parent implementation is used.
        """
        # need to handle concat([tz1, tz2]) here, since DatetimeArray
        # only handles cases where all the tzs are the same.
        # Instead of placing the condition here, it could also go into the
        # is_uniform_join_units check, but I'm not sure what is better.
        if len({x.dtype for x in to_concat}) > 1:
            values = _concat._concat_datetime([x.values for x in to_concat])
            placement = placement or slice(0, len(values), 1)

            if self.ndim > 1:
                values = np.atleast_2d(values)
            return ObjectBlock(values, ndim=self.ndim, placement=placement)
        return super().concat_same_type(to_concat, placement)

    def fillna(self, value, limit=None, inplace=False, downcast=None):
        """
        Fill missing values, falling back to object dtype when the parent
        implementation raises ValueError or TypeError.
        """
        # We support filling a DatetimeTZ with a `value` whose timezone
        # is different by coercing to object.
        try:
            return super().fillna(value, limit, inplace, downcast)
        except (ValueError, TypeError):
            # different timezones, or a non-tz
            return self.astype(object).fillna(
                value, limit=limit, inplace=inplace, downcast=downcast
            )

    def setitem(self, indexer, value):
        """
        Set ``value`` at ``indexer``, retrying on an object-dtype block when
        the parent implementation raises ValueError or TypeError.
        """
        # https://github.com/pandas-dev/pandas/issues/24020
        # Need a dedicated setitem until #24020 (type promotion in setitem
        # for extension arrays) is designed and implemented.
        try:
            return super().setitem(indexer, value)
        except (ValueError, TypeError):
            newb = make_block(self.values.astype(object),
                              placement=self.mgr_locs,
                              klass=ObjectBlock)
            return newb.setitem(indexer, value)

    def equals(self, other):
        """
        Return whether ``other`` has the same dtype, shape and i8 values.
        """
        # override for significant performance improvement
        if self.dtype != other.dtype or self.shape != other.shape:
            return False
        return (self.values.view('i8') == other.values.view('i8')).all()


class TimeDeltaBlock(DatetimeLikeBlockMixin, IntBlock):
__slots__ = ()
is_timedelta = True
Expand Down