Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

DEPR: DataFrame dtype keyword match Series behavior #49313

Merged
merged 2 commits into from
Oct 26, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions doc/source/whatsnew/v2.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -252,6 +252,8 @@ Removal of prior version deprecations/changes
- Removed the ``display.column_space`` option in favor of ``df.to_string(col_space=...)`` (:issue:`47280`)
- Removed the deprecated method ``mad`` from pandas classes (:issue:`11787`)
- Removed the deprecated method ``tshift`` from pandas classes (:issue:`11631`)
- Changed behavior of :class:`DataFrame` constructor given floating-point ``data`` and an integer ``dtype``: when the data cannot be cast losslessly, the floating-point dtype is retained, matching :class:`Series` behavior (:issue:`41170`)
- Changed behavior of :class:`DataFrame` constructor when passed a ``dtype`` (other than int) that the data cannot be cast to; it now raises instead of silently ignoring the dtype (:issue:`41733`)
- Changed the behavior of :class:`Series` constructor, it will no longer infer a datetime64 or timedelta64 dtype from string entries (:issue:`41731`)
- Changed behavior of :class:`Index` constructor when passed a ``SparseArray`` or ``SparseDtype`` to retain that dtype instead of casting to ``numpy.ndarray`` (:issue:`43930`)

Expand Down
74 changes: 15 additions & 59 deletions pandas/core/construction.py
Original file line number Diff line number Diff line change
Expand Up @@ -500,7 +500,6 @@ def sanitize_array(
index: Index | None,
dtype: DtypeObj | None = None,
copy: bool = False,
raise_cast_failure: bool = True,
*,
allow_2d: bool = False,
) -> ArrayLike:
Expand All @@ -514,19 +513,12 @@ def sanitize_array(
index : Index or None, default None
dtype : np.dtype, ExtensionDtype, or None, default None
copy : bool, default False
raise_cast_failure : bool, default True
allow_2d : bool, default False
If False, raise if we have a 2D Arraylike.

Returns
-------
np.ndarray or ExtensionArray

Notes
-----
raise_cast_failure=False is only intended to be True when called from the
DataFrame constructor, as the dtype keyword there may be interpreted as only
applying to a subset of columns, see GH#24435.
"""
if isinstance(data, ma.MaskedArray):
data = sanitize_masked_array(data)
Expand Down Expand Up @@ -564,7 +556,7 @@ def sanitize_array(
# GH 47391 numpy > 1.24 will raise a RuntimeError for nan -> int
# casting aligning with IntCastingNaNError below
with np.errstate(invalid="ignore"):
subarr = _try_cast(data, dtype, copy, True)
subarr = _try_cast(data, dtype, copy)
except IntCastingNaNError:
warnings.warn(
"In a future version, passing float-dtype values containing NaN "
Expand All @@ -577,29 +569,18 @@ def sanitize_array(
)
subarr = np.array(data, copy=copy)
except ValueError:
if not raise_cast_failure:
# i.e. called via DataFrame constructor
warnings.warn(
"In a future version, passing float-dtype values and an "
"integer dtype to DataFrame will retain floating dtype "
"if they cannot be cast losslessly (matching Series behavior). "
"To retain the old behavior, use DataFrame(data).astype(dtype)",
FutureWarning,
stacklevel=find_stack_level(),
)
# GH#40110 until the deprecation is enforced, we _dont_
# ignore the dtype for DataFrame, and _do_ cast even though
# it is lossy.
dtype = cast(np.dtype, dtype)
return np.array(data, dtype=dtype, copy=copy)
# Pre-2.0, we would have different behavior for Series vs DataFrame.
# DataFrame would call np.array(data, dtype=dtype, copy=copy),
# which would cast to the integer dtype even if the cast is lossy.
# See GH#40110.

# We ignore the dtype arg and return floating values,
# e.g. test_constructor_floating_data_int_dtype
# TODO: where is the discussion that documents the reason for this?
subarr = np.array(data, copy=copy)
else:
# we will try to copy by-definition here
subarr = _try_cast(data, dtype, copy, raise_cast_failure)
subarr = _try_cast(data, dtype, copy)

elif isinstance(data, ABCExtensionArray):
# it is already ensured above this is not a PandasArray
Expand All @@ -624,7 +605,7 @@ def sanitize_array(

if dtype is not None or len(data) == 0:
try:
subarr = _try_cast(data, dtype, copy, raise_cast_failure)
subarr = _try_cast(data, dtype, copy)
except ValueError:
if is_integer_dtype(dtype):
casted = np.array(data, copy=False)
Expand All @@ -636,7 +617,6 @@ def sanitize_array(
index,
dtype,
copy=False,
raise_cast_failure=raise_cast_failure,
allow_2d=allow_2d,
)
else:
Expand Down Expand Up @@ -750,7 +730,6 @@ def _try_cast(
arr: list | np.ndarray,
dtype: DtypeObj | None,
copy: bool,
raise_cast_failure: bool,
) -> ArrayLike:
"""
Convert input to numpy ndarray and optionally cast to a given dtype.
Expand All @@ -762,9 +741,6 @@ def _try_cast(
dtype : np.dtype, ExtensionDtype or None
copy : bool
If False, don't copy the data if not needed.
raise_cast_failure : bool
If True, and if a dtype is specified, raise errors during casting.
Otherwise an object array is returned.

Returns
-------
Expand Down Expand Up @@ -823,35 +799,15 @@ def _try_cast(
elif dtype.kind in ["m", "M"]:
return maybe_cast_to_datetime(arr, dtype)

try:
# GH#15832: Check if we are requesting a numeric dtype and
# that we can convert the data to the requested dtype.
if is_integer_dtype(dtype):
# this will raise if we have e.g. floats
# GH#15832: Check if we are requesting a numeric dtype and
# that we can convert the data to the requested dtype.
elif is_integer_dtype(dtype):
# this will raise if we have e.g. floats

subarr = maybe_cast_to_integer_array(arr, dtype)
else:
subarr = np.array(arr, dtype=dtype, copy=copy)

subarr = maybe_cast_to_integer_array(arr, dtype)
else:
# 4 tests fail if we move this to a try/except/else; see
# test_constructor_compound_dtypes, test_constructor_cast_failure
# test_constructor_dict_cast2, test_loc_setitem_dtype
subarr = np.array(arr, dtype=dtype, copy=copy)

except (ValueError, TypeError):
if raise_cast_failure:
raise
else:
# we only get here with raise_cast_failure False, which means
# called via the DataFrame constructor
# GH#24435
warnings.warn(
f"Could not cast to {dtype}, falling back to object. This "
"behavior is deprecated. In a future version, when a dtype is "
"passed to 'DataFrame', either all columns will be cast to that "
"dtype, or a TypeError will be raised.",
FutureWarning,
stacklevel=find_stack_level(),
)
subarr = np.array(arr, dtype=object, copy=copy)
return subarr


Expand Down
7 changes: 1 addition & 6 deletions pandas/core/internals/construction.py
Original file line number Diff line number Diff line change
Expand Up @@ -331,14 +331,11 @@ def ndarray_to_mgr(

if dtype is not None and not is_dtype_equal(values.dtype, dtype):
# GH#40110 see similar check inside sanitize_array
rcf = not (is_integer_dtype(dtype) and values.dtype.kind == "f")

values = sanitize_array(
values,
None,
dtype=dtype,
copy=copy_on_sanitize,
raise_cast_failure=rcf,
allow_2d=True,
)

Expand Down Expand Up @@ -615,9 +612,7 @@ def _homogenize(data, index: Index, dtype: DtypeObj | None) -> list[ArrayLike]:
val = dict(val)
val = lib.fast_multiget(val, oindex._values, default=np.nan)

val = sanitize_array(
val, index, dtype=dtype, copy=False, raise_cast_failure=False
)
val = sanitize_array(val, index, dtype=dtype, copy=False)
com.require_length_match(val, index)

homogenized.append(val)
Expand Down
7 changes: 3 additions & 4 deletions pandas/tests/frame/test_block_internals.py
Original file line number Diff line number Diff line change
Expand Up @@ -259,11 +259,10 @@ def f(dtype):
with pytest.raises(NotImplementedError, match=msg):
f([("A", "datetime64[h]"), ("B", "str"), ("C", "int32")])

# these work (though results may be unexpected)
depr_msg = "either all columns will be cast to that dtype, or a TypeError will"
with tm.assert_produces_warning(FutureWarning, match=depr_msg):
# pre-2.0 these used to work (though results may be unexpected)
with pytest.raises(TypeError, match="argument must be"):
f("int64")
with tm.assert_produces_warning(FutureWarning, match=depr_msg):
with pytest.raises(TypeError, match="argument must be"):
f("float64")

# 10822
Expand Down
31 changes: 14 additions & 17 deletions pandas/tests/frame/test_constructors.py
Original file line number Diff line number Diff line change
Expand Up @@ -245,10 +245,11 @@ def test_constructor_mixed(self, float_string_frame):
assert float_string_frame["foo"].dtype == np.object_

def test_constructor_cast_failure(self):
msg = "either all columns will be cast to that dtype, or a TypeError will"
with tm.assert_produces_warning(FutureWarning, match=msg):
foo = DataFrame({"a": ["a", "b", "c"]}, dtype=np.float64)
assert foo["a"].dtype == object
# as of 2.0, we raise if we can't respect "dtype", previously we
# silently ignored
msg = "could not convert string to float"
with pytest.raises(ValueError, match=msg):
DataFrame({"a": ["a", "b", "c"]}, dtype=np.float64)

# GH 3010, constructing with odd arrays
df = DataFrame(np.ones((4, 2)))
Expand Down Expand Up @@ -753,13 +754,8 @@ def test_constructor_dict_cast2(self):
"A": dict(zip(range(20), tm.makeStringIndex(20))),
"B": dict(zip(range(15), np.random.randn(15))),
}
msg = "either all columns will be cast to that dtype, or a TypeError will"
with tm.assert_produces_warning(FutureWarning, match=msg):
frame = DataFrame(test_data, dtype=float)

assert len(frame) == 20
assert frame["A"].dtype == np.object_
assert frame["B"].dtype == np.float64
with pytest.raises(ValueError, match="could not convert string"):
DataFrame(test_data, dtype=float)

def test_constructor_dict_dont_upcast(self):
d = {"Col1": {"Row1": "A String", "Row2": np.nan}}
Expand Down Expand Up @@ -2788,13 +2784,14 @@ def test_floating_values_integer_dtype(self):

arr = np.random.randn(10, 5)

msg = "if they cannot be cast losslessly"
with tm.assert_produces_warning(FutureWarning, match=msg):
DataFrame(arr, dtype="i8")
# as of 2.0, we match Series behavior by retaining float dtype instead
# of doing a lossy conversion here. Below we _do_ do the conversion
# since it is lossless.
df = DataFrame(arr, dtype="i8")
assert (df.dtypes == "f8").all()

with tm.assert_produces_warning(None):
# if they can be cast losslessly, no warning
DataFrame(arr.round(), dtype="i8")
df = DataFrame(arr.round(), dtype="i8")
assert (df.dtypes == "i8").all()

# with NaNs, we go through a different path with a different warning
arr[0, 0] = np.nan
Expand Down
25 changes: 8 additions & 17 deletions pandas/tests/series/test_constructors.py
Original file line number Diff line number Diff line change
Expand Up @@ -782,25 +782,16 @@ def test_constructor_floating_data_int_dtype(self, frame_or_series):
# GH#40110
arr = np.random.randn(2)

if frame_or_series is Series:
# Long-standing behavior has been to ignore the dtype on these;
# not clear if this is what we want long-term
expected = frame_or_series(arr)

res = frame_or_series(arr, dtype="i8")
tm.assert_equal(res, expected)
# Long-standing behavior (for Series, new in 2.0 for DataFrame)
# has been to ignore the dtype on these;
# not clear if this is what we want long-term
expected = frame_or_series(arr)

res = frame_or_series(list(arr), dtype="i8")
tm.assert_equal(res, expected)
res = frame_or_series(arr, dtype="i8")
tm.assert_equal(res, expected)

else:
msg = "passing float-dtype values and an integer dtype"
with tm.assert_produces_warning(FutureWarning, match=msg):
# DataFrame will behave like Series
frame_or_series(arr, dtype="i8")
with tm.assert_produces_warning(FutureWarning, match=msg):
# DataFrame will behave like Series
frame_or_series(list(arr), dtype="i8")
res = frame_or_series(list(arr), dtype="i8")
tm.assert_equal(res, expected)

# When we have NaNs, we silently ignore the integer dtype
arr[0] = np.nan
Expand Down