From 1b721461e4c8db8fefd58ba8e44e2c1691010118 Mon Sep 17 00:00:00 2001 From: Brock Date: Tue, 25 Oct 2022 12:50:13 -0700 Subject: [PATCH] DEPR: DataFrame dtype keyword match Series behavior --- doc/source/whatsnew/v2.0.0.rst | 2 + pandas/core/construction.py | 74 +++++----------------- pandas/core/internals/construction.py | 7 +- pandas/tests/frame/test_block_internals.py | 7 +- pandas/tests/frame/test_constructors.py | 31 ++++----- pandas/tests/series/test_constructors.py | 25 +++----- 6 files changed, 43 insertions(+), 103 deletions(-) diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index 4c85b3d5dc745..fbb5a552111b2 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -232,6 +232,8 @@ Removal of prior version deprecations/changes - Removed the ``display.column_space`` option in favor of ``df.to_string(col_space=...)`` (:issue:`47280`) - Removed the deprecated method ``mad`` from pandas classes (:issue:`11787`) - Removed the deprecated method ``tshift`` from pandas classes (:issue:`11631`) +- Changed behavior of :class:`DataFrame` constructor given floating-point ``data`` and an integer ``dtype``; when the data cannot be cast losslessly, the floating-point dtype is now retained, matching :class:`Series` behavior (:issue:`41170`) +- Changed behavior of :class:`DataFrame` constructor when passed a ``dtype`` (other than int) that the data cannot be cast to; it now raises instead of silently ignoring the dtype (:issue:`41733`) - Changed behavior of :class:`Index` constructor when passed a ``SparseArray`` or ``SparseDtype`` to retain that dtype instead of casting to ``numpy.ndarray`` (:issue:`43930`) ..
--------------------------------------------------------------------------- diff --git a/pandas/core/construction.py b/pandas/core/construction.py index 447006572f22d..c7db58fe8c6a3 100644 --- a/pandas/core/construction.py +++ b/pandas/core/construction.py @@ -500,7 +500,6 @@ def sanitize_array( index: Index | None, dtype: DtypeObj | None = None, copy: bool = False, - raise_cast_failure: bool = True, *, allow_2d: bool = False, ) -> ArrayLike: @@ -514,19 +513,12 @@ def sanitize_array( index : Index or None, default None dtype : np.dtype, ExtensionDtype, or None, default None copy : bool, default False - raise_cast_failure : bool, default True allow_2d : bool, default False If False, raise if we have a 2D Arraylike. Returns ------- np.ndarray or ExtensionArray - - Notes - ----- - raise_cast_failure=False is only intended to be True when called from the - DataFrame constructor, as the dtype keyword there may be interpreted as only - applying to a subset of columns, see GH#24435. """ if isinstance(data, ma.MaskedArray): data = sanitize_masked_array(data) @@ -564,7 +556,7 @@ def sanitize_array( # GH 47391 numpy > 1.24 will raise a RuntimeError for nan -> int # casting aligning with IntCastingNaNError below with np.errstate(invalid="ignore"): - subarr = _try_cast(data, dtype, copy, True) + subarr = _try_cast(data, dtype, copy) except IntCastingNaNError: warnings.warn( "In a future version, passing float-dtype values containing NaN " @@ -577,21 +569,10 @@ def sanitize_array( ) subarr = np.array(data, copy=copy) except ValueError: - if not raise_cast_failure: - # i.e. called via DataFrame constructor - warnings.warn( - "In a future version, passing float-dtype values and an " - "integer dtype to DataFrame will retain floating dtype " - "if they cannot be cast losslessly (matching Series behavior). 
" - "To retain the old behavior, use DataFrame(data).astype(dtype)", - FutureWarning, - stacklevel=find_stack_level(), - ) - # GH#40110 until the deprecation is enforced, we _dont_ - # ignore the dtype for DataFrame, and _do_ cast even though - # it is lossy. - dtype = cast(np.dtype, dtype) - return np.array(data, dtype=dtype, copy=copy) + # Pre-2.0, we would have different behavior for Series vs DataFrame. + # DataFrame would call np.array(data, dtype=dtype, copy=copy), + # which would cast to the integer dtype even if the cast is lossy. + # See GH#40110. # We ignore the dtype arg and return floating values, # e.g. test_constructor_floating_data_int_dtype @@ -599,7 +580,7 @@ def sanitize_array( subarr = np.array(data, copy=copy) else: # we will try to copy by-definition here - subarr = _try_cast(data, dtype, copy, raise_cast_failure) + subarr = _try_cast(data, dtype, copy) elif isinstance(data, ABCExtensionArray): # it is already ensured above this is not a PandasArray @@ -624,7 +605,7 @@ def sanitize_array( if dtype is not None or len(data) == 0: try: - subarr = _try_cast(data, dtype, copy, raise_cast_failure) + subarr = _try_cast(data, dtype, copy) except ValueError: if is_integer_dtype(dtype): casted = np.array(data, copy=False) @@ -636,7 +617,6 @@ def sanitize_array( index, dtype, copy=False, - raise_cast_failure=raise_cast_failure, allow_2d=allow_2d, ) else: @@ -750,7 +730,6 @@ def _try_cast( arr: list | np.ndarray, dtype: DtypeObj | None, copy: bool, - raise_cast_failure: bool, ) -> ArrayLike: """ Convert input to numpy ndarray and optionally cast to a given dtype. @@ -762,9 +741,6 @@ def _try_cast( dtype : np.dtype, ExtensionDtype or None copy : bool If False, don't copy the data if not needed. - raise_cast_failure : bool - If True, and if a dtype is specified, raise errors during casting. - Otherwise an object array is returned. 
Returns ------- @@ -823,35 +799,15 @@ def _try_cast( elif dtype.kind in ["m", "M"]: return maybe_cast_to_datetime(arr, dtype) - try: - # GH#15832: Check if we are requesting a numeric dtype and - # that we can convert the data to the requested dtype. - if is_integer_dtype(dtype): - # this will raise if we have e.g. floats + # GH#15832: Check if we are requesting a numeric dtype and + # that we can convert the data to the requested dtype. + elif is_integer_dtype(dtype): + # this will raise if we have e.g. floats + + subarr = maybe_cast_to_integer_array(arr, dtype) + else: + subarr = np.array(arr, dtype=dtype, copy=copy) - subarr = maybe_cast_to_integer_array(arr, dtype) - else: - # 4 tests fail if we move this to a try/except/else; see - # test_constructor_compound_dtypes, test_constructor_cast_failure - # test_constructor_dict_cast2, test_loc_setitem_dtype - subarr = np.array(arr, dtype=dtype, copy=copy) - - except (ValueError, TypeError): - if raise_cast_failure: - raise - else: - # we only get here with raise_cast_failure False, which means - # called via the DataFrame constructor - # GH#24435 - warnings.warn( - f"Could not cast to {dtype}, falling back to object. This " - "behavior is deprecated. 
In a future version, when a dtype is " - "passed to 'DataFrame', either all columns will be cast to that " - "dtype, or a TypeError will be raised.", - FutureWarning, - stacklevel=find_stack_level(), - ) - subarr = np.array(arr, dtype=object, copy=copy) return subarr diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index 054663fcd0626..8f3c93259f0c6 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -331,14 +331,11 @@ def ndarray_to_mgr( if dtype is not None and not is_dtype_equal(values.dtype, dtype): # GH#40110 see similar check inside sanitize_array - rcf = not (is_integer_dtype(dtype) and values.dtype.kind == "f") - values = sanitize_array( values, None, dtype=dtype, copy=copy_on_sanitize, - raise_cast_failure=rcf, allow_2d=True, ) @@ -615,9 +612,7 @@ def _homogenize(data, index: Index, dtype: DtypeObj | None) -> list[ArrayLike]: val = dict(val) val = lib.fast_multiget(val, oindex._values, default=np.nan) - val = sanitize_array( - val, index, dtype=dtype, copy=False, raise_cast_failure=False - ) + val = sanitize_array(val, index, dtype=dtype, copy=False) com.require_length_match(val, index) homogenized.append(val) diff --git a/pandas/tests/frame/test_block_internals.py b/pandas/tests/frame/test_block_internals.py index 7d33e8b6b0fd1..ed9d7bced9253 100644 --- a/pandas/tests/frame/test_block_internals.py +++ b/pandas/tests/frame/test_block_internals.py @@ -259,11 +259,10 @@ def f(dtype): with pytest.raises(NotImplementedError, match=msg): f([("A", "datetime64[h]"), ("B", "str"), ("C", "int32")]) - # these work (though results may be unexpected) - depr_msg = "either all columns will be cast to that dtype, or a TypeError will" - with tm.assert_produces_warning(FutureWarning, match=depr_msg): + # pre-2.0 these used to work (though results may be unexpected) + with pytest.raises(TypeError, match="argument must be"): f("int64") - with tm.assert_produces_warning(FutureWarning, 
match=depr_msg): + with pytest.raises(TypeError, match="argument must be"): f("float64") # 10822 diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index f9cfb0b81a7bd..10b1f8406025a 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -245,10 +245,11 @@ def test_constructor_mixed(self, float_string_frame): assert float_string_frame["foo"].dtype == np.object_ def test_constructor_cast_failure(self): - msg = "either all columns will be cast to that dtype, or a TypeError will" - with tm.assert_produces_warning(FutureWarning, match=msg): - foo = DataFrame({"a": ["a", "b", "c"]}, dtype=np.float64) - assert foo["a"].dtype == object + # as of 2.0, we raise if we can't respect "dtype", previously we + # silently ignored + msg = "could not convert string to float" + with pytest.raises(ValueError, match=msg): + DataFrame({"a": ["a", "b", "c"]}, dtype=np.float64) # GH 3010, constructing with odd arrays df = DataFrame(np.ones((4, 2))) @@ -753,13 +754,8 @@ def test_constructor_dict_cast2(self): "A": dict(zip(range(20), tm.makeStringIndex(20))), "B": dict(zip(range(15), np.random.randn(15))), } - msg = "either all columns will be cast to that dtype, or a TypeError will" - with tm.assert_produces_warning(FutureWarning, match=msg): - frame = DataFrame(test_data, dtype=float) - - assert len(frame) == 20 - assert frame["A"].dtype == np.object_ - assert frame["B"].dtype == np.float64 + with pytest.raises(ValueError, match="could not convert string"): + DataFrame(test_data, dtype=float) def test_constructor_dict_dont_upcast(self): d = {"Col1": {"Row1": "A String", "Row2": np.nan}} @@ -2788,13 +2784,14 @@ def test_floating_values_integer_dtype(self): arr = np.random.randn(10, 5) - msg = "if they cannot be cast losslessly" - with tm.assert_produces_warning(FutureWarning, match=msg): - DataFrame(arr, dtype="i8") + # as of 2.0, we match Series behavior by retaining float dtype instead + # of 
doing a lossy conversion here. Below we _do_ do the conversion + # since it is lossless. + df = DataFrame(arr, dtype="i8") + assert (df.dtypes == "f8").all() - with tm.assert_produces_warning(None): - # if they can be cast losslessly, no warning - DataFrame(arr.round(), dtype="i8") + df = DataFrame(arr.round(), dtype="i8") + assert (df.dtypes == "i8").all() # with NaNs, we go through a different path with a different warning arr[0, 0] = np.nan diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py index 35ebd152f447c..2dd34367a1b21 100644 --- a/pandas/tests/series/test_constructors.py +++ b/pandas/tests/series/test_constructors.py @@ -782,25 +782,16 @@ def test_constructor_floating_data_int_dtype(self, frame_or_series): # GH#40110 arr = np.random.randn(2) - if frame_or_series is Series: - # Long-standing behavior has been to ignore the dtype on these; - # not clear if this is what we want long-term - expected = frame_or_series(arr) - - res = frame_or_series(arr, dtype="i8") - tm.assert_equal(res, expected) + # Long-standing behavior (for Series, new in 2.0 for DataFrame) + # has been to ignore the dtype on these; + # not clear if this is what we want long-term + expected = frame_or_series(arr) - res = frame_or_series(list(arr), dtype="i8") - tm.assert_equal(res, expected) + res = frame_or_series(arr, dtype="i8") + tm.assert_equal(res, expected) - else: - msg = "passing float-dtype values and an integer dtype" - with tm.assert_produces_warning(FutureWarning, match=msg): - # DataFrame will behave like Series - frame_or_series(arr, dtype="i8") - with tm.assert_produces_warning(FutureWarning, match=msg): - # DataFrame will behave like Series - frame_or_series(list(arr), dtype="i8") + res = frame_or_series(list(arr), dtype="i8") + tm.assert_equal(res, expected) # When we have NaNs, we silently ignore the integer dtype arr[0] = np.nan