Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

BUG/API: make setitem-inplace preserve dtype when possible with PandasArray, IntegerArray, FloatingArray #39044

Closed
wants to merge 23 commits into from
Closed
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
6fffb02
BUG/API: make setitem-inplace preserve dtype when possible with Panda…
jbrockmendel Jan 8, 2021
160f3f7
Merge branch 'master' of https://github.com/pandas-dev/pandas into bu…
jbrockmendel Jan 12, 2021
de10708
do patching inside test_numpy
jbrockmendel Jan 13, 2021
ba98a99
Merge branch 'master' of https://github.com/pandas-dev/pandas into bu…
jbrockmendel Jan 15, 2021
84261a7
Merge branch 'master' of https://github.com/pandas-dev/pandas into bu…
jbrockmendel Jan 15, 2021
284f36a
move kludge to test_numpy
jbrockmendel Jan 15, 2021
2639b5c
Merge branch 'master' of https://github.com/pandas-dev/pandas into bu…
jbrockmendel Jan 16, 2021
b2aa366
staticmethod -> function
jbrockmendel Jan 16, 2021
7aeb2b5
typo fixup+test
jbrockmendel Jan 20, 2021
715a602
Merge branch 'master' into bug-38896
jbrockmendel Jan 25, 2021
b55155b
Merge branch 'master' of https://github.com/pandas-dev/pandas into bu…
jbrockmendel Jan 25, 2021
c72e566
Merge branch 'master' of https://github.com/pandas-dev/pandas into bu…
jbrockmendel Jan 25, 2021
0b9f343
Merge branch 'master' of https://github.com/pandas-dev/pandas into bu…
jbrockmendel Jan 27, 2021
cd6adbe
Merge branch 'master' of https://github.com/pandas-dev/pandas into bu…
jbrockmendel Jan 28, 2021
071ab1b
Merge branch 'master' of https://github.com/pandas-dev/pandas into bu…
jbrockmendel Jan 28, 2021
6ab44af
docstring
jbrockmendel Jan 28, 2021
daacff8
Merge branch 'master' of https://github.com/pandas-dev/pandas into bu…
jbrockmendel Jan 29, 2021
450bf73
comment, revert floating
jbrockmendel Jan 29, 2021
dba7c11
Merge branch 'master' into bug-38896
jbrockmendel Feb 2, 2021
9258cbb
Merge branch 'master' into bug-38896
jbrockmendel Feb 3, 2021
88309ab
Merge branch 'master' of https://github.com/pandas-dev/pandas into bu…
jbrockmendel Feb 3, 2021
1847209
Merge branch 'master' into bug-38896
jbrockmendel Feb 7, 2021
6b8cc31
Merge branch 'master' into bug-38896
jbrockmendel Feb 15, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 18 additions & 1 deletion pandas/core/dtypes/missing.py
Original file line number Diff line number Diff line change
Expand Up @@ -506,14 +506,31 @@ def array_equals(left: ArrayLike, right: ArrayLike) -> bool:
return array_equivalent(left, right, dtype_equal=True)


def infer_fill_value(val):
def infer_fill_value(val, length: int):
"""
infer the fill value for the nan/NaT from the provided
scalar/ndarray/list-like if we are a NaT, return the correct dtyped
element to provide proper block construction
"""
if not is_list_like(val):
val = [val]

if type(val).__name__ == "PandasArray":
jbrockmendel marked this conversation as resolved.
Show resolved Hide resolved
# for test_numpy test where we patch PandasArray._typ
val = val.to_numpy()

if is_extension_array_dtype(val):
# We cannot use dtype._na_value bc pd.NA/pd.NaT do not preserve dtype
if len(val) == length:
jreback marked this conversation as resolved.
Show resolved Hide resolved
# TODO: in this case see if we can avoid making a copy later on
return val
if length == 0:
return val[:0].copy()

dtype = val.dtype
cls = dtype.construct_array_type()
return cls._from_sequence([dtype._na_value], dtype=dtype).repeat(length)

val = np.array(val, copy=False)
if needs_i8_conversion(val.dtype):
jreback marked this conversation as resolved.
Show resolved Hide resolved
return np.array("NaT", dtype=val.dtype)
Expand Down
2 changes: 1 addition & 1 deletion pandas/core/indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -1601,7 +1601,7 @@ def _setitem_with_indexer(self, indexer, value, name="iloc"):
# We are setting an entire column
self.obj[key] = value
else:
self.obj[key] = infer_fill_value(value)
self.obj[key] = infer_fill_value(value, len(self.obj))

new_indexer = convert_from_missing_indexer_tuple(
indexer, self.obj.axes
Expand Down
45 changes: 37 additions & 8 deletions pandas/core/internals/blocks.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,8 +65,11 @@
Categorical,
DatetimeArray,
ExtensionArray,
FloatingArray,
IntegerArray,
PandasArray,
PandasDtype,
StringArray,
TimedeltaArray,
)
from pandas.core.base import PandasObject
Expand Down Expand Up @@ -621,10 +624,17 @@ def astype(self, dtype, copy: bool = False, errors: str = "raise"):
)
raise TypeError(msg)

values = self.values
dtype = pandas_dtype(dtype)
if isinstance(dtype, ExtensionDtype) and self.values.ndim == 2:
# TODO(EA2D): kludge not needed with 2D EAs (astype_nansafe would raise)
# note DataFrame.astype has special handling to avoid getting here
if self.shape[0] != 1:
raise NotImplementedError("Need 2D EAs!")
values = values[0]

try:
new_values = self._astype(dtype, copy=copy)
new_values = self._astype(values, dtype, copy=copy)
except (ValueError, TypeError):
# e.g. astype_nansafe can fail on object-dtype of strings
# trying to convert to float
Expand All @@ -643,8 +653,8 @@ def astype(self, dtype, copy: bool = False, errors: str = "raise"):
)
return newb

def _astype(self, dtype: DtypeObj, copy: bool) -> ArrayLike:
values = self.values
@staticmethod
jbrockmendel marked this conversation as resolved.
Show resolved Hide resolved
def _astype(values, dtype: DtypeObj, copy: bool) -> ArrayLike:

if is_datetime64tz_dtype(dtype) and is_datetime64_dtype(values.dtype):
return astype_dt64_to_dt64tz(values, dtype, copy, via_utc=True)
Expand Down Expand Up @@ -936,6 +946,16 @@ def setitem(self, indexer, value):

return self.astype(dtype).setitem(indexer, value)

value = extract_array(value, extract_numpy=True)
if isinstance(value, PandasArray) and not isinstance(value, StringArray):
# this is redundant with extract_array except for PandasArray tests
# that patch _typ
value = value.to_numpy()

if isinstance(value, (IntegerArray, FloatingArray)) and not value._mask.any():
# GH#38896
value = value.to_numpy(value.dtype.numpy_dtype)

# value must be storable at this moment
if is_extension_array_dtype(getattr(value, "dtype", None)):
# We need to be careful not to allow through strings that
Expand Down Expand Up @@ -1910,6 +1930,10 @@ class FloatBlock(NumericBlock):
def _can_hold_element(self, element: Any) -> bool:
tipo = maybe_infer_dtype_type(element)
if tipo is not None:
if isinstance(element, (IntegerArray, FloatingArray)):
# GH#38896
if element._mask.any():
return False
return issubclass(tipo.type, (np.floating, np.integer)) and not issubclass(
tipo.type, np.timedelta64
)
Expand Down Expand Up @@ -1975,11 +1999,16 @@ class IntBlock(NumericBlock):
def _can_hold_element(self, element: Any) -> bool:
tipo = maybe_infer_dtype_type(element)
if tipo is not None:
return (
issubclass(tipo.type, np.integer)
and not issubclass(tipo.type, np.timedelta64)
and self.dtype.itemsize >= tipo.itemsize
)
if issubclass(tipo.type, np.integer):
if isinstance(element, IntegerArray):
# GH#38896
if element._mask.any():
return False

return (
not issubclass(tipo.type, np.timedelta64)
and self.dtype.itemsize >= tipo.itemsize
)
# We have not inferred an integer from the dtype
# check if we have a builtin int or a float equal to an int
return is_integer(element) or (is_float(element) and element.is_integer())
Expand Down
23 changes: 17 additions & 6 deletions pandas/tests/extension/test_floating.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,12 +24,22 @@
from pandas.tests.extension import base


def make_data():
def make_data(with_nas: bool = True):
if with_nas:
jbrockmendel marked this conversation as resolved.
Show resolved Hide resolved
return (
list(np.arange(0.1, 0.9, 0.1))
+ [pd.NA]
+ list(np.arange(1, 9.8, 0.1))
+ [pd.NA]
+ [9.9, 10.0]
)
# case without pd.NA that can be cast to floating ndarray losslessly
# TODO: get cases with np.nan instead of pd.NA GH#39039
jbrockmendel marked this conversation as resolved.
Show resolved Hide resolved
return (
list(np.arange(0.1, 0.9, 0.1))
+ [pd.NA]
+ [np.nan]
+ list(np.arange(1, 9.8, 0.1))
+ [pd.NA]
+ [np.nan]
+ [9.9, 10.0]
)

Expand All @@ -39,9 +49,10 @@ def dtype(request):
return request.param()


@pytest.fixture
def data(dtype):
return pd.array(make_data(), dtype=dtype)
@pytest.fixture(params=[True, False])
def data(dtype, request):
with_nas = request.param
return pd.array(make_data(with_nas), dtype=dtype)


@pytest.fixture
Expand Down
29 changes: 23 additions & 6 deletions pandas/tests/extension/test_integer.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,8 +33,11 @@
from pandas.tests.extension import base


def make_data():
return list(range(1, 9)) + [pd.NA] + list(range(10, 98)) + [pd.NA] + [99, 100]
def make_data(with_nas: bool = True):
if with_nas:
return list(range(1, 9)) + [pd.NA] + list(range(10, 98)) + [pd.NA] + [99, 100]

return list(range(1, 101))


@pytest.fixture(
Expand All @@ -53,9 +56,10 @@ def dtype(request):
return request.param()


@pytest.fixture
def data(dtype):
return pd.array(make_data(), dtype=dtype)
@pytest.fixture(params=[True, False])
def data(dtype, request):
with_nas = request.param
return pd.array(make_data(with_nas), dtype=dtype)


@pytest.fixture
Expand Down Expand Up @@ -198,7 +202,20 @@ class TestGetitem(base.BaseGetitemTests):


class TestSetitem(base.BaseSetitemTests):
pass
def test_setitem_series(self, data, full_indexer):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you indicate here why it is overriding the base class?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

comment added

# https://github.com/pandas-dev/pandas/issues/32395
ser = expected = pd.Series(data, name="data")
result = pd.Series(index=ser.index, dtype=object, name="data")

key = full_indexer(ser)
result.loc[key] = ser

if not data._mask.any():
# GH#38896 like we do with ndarray, we set the values inplace
# but cast to the new numpy dtype
expected = pd.Series(data.to_numpy(data.dtype.numpy_dtype), name="data")
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why are we converting to the numpy dtype here? That's also not the original dtype?


self.assert_series_equal(result, expected)


class TestMissing(base.BaseMissingTests):
Expand Down
33 changes: 33 additions & 0 deletions pandas/tests/extension/test_numpy.py
Original file line number Diff line number Diff line change
Expand Up @@ -501,6 +501,39 @@ def test_setitem_slice(self, data, box_in_series):
def test_setitem_loc_iloc_slice(self, data):
super().test_setitem_loc_iloc_slice(data)

def test_setitem_with_expansion_dataframe_column(self, data, full_indexer, request):
# https://github.com/pandas-dev/pandas/issues/32395
df = pd.DataFrame({"data": pd.Series(data)})
result = pd.DataFrame(index=df.index)

key = full_indexer(df)
result.loc[key, "data"] = df["data"]._values

# For PandasArray we expect to get unboxed to numpy
expected = pd.DataFrame({"data": data.to_numpy()})
if isinstance(key, slice) and (
key == slice(None) or data.dtype.numpy_dtype == object
):
mark = pytest.mark.xfail(
reason="This case goes through a different code path"
)
# Other cases go through Block.setitem
request.node.add_marker(mark)

self.assert_frame_equal(result, expected)

def test_setitem_series(self, data, full_indexer):
# https://github.com/pandas-dev/pandas/issues/32395
ser = pd.Series(data, name="data")
result = pd.Series(index=ser.index, dtype=object, name="data")

key = full_indexer(ser)
result.loc[key] = ser

# For PandasArray we expect to get unboxed to numpy
expected = pd.Series(data.to_numpy(), name="data")
self.assert_series_equal(result, expected)


@skip_nested
class TestParsing(BaseNumPyTests, base.BaseParsingTests):
Expand Down