From dc24410c0fbbfff2b191247dc7dc963cc92c0321 Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Wed, 25 Sep 2024 23:15:06 +0530 Subject: [PATCH 01/33] DOC: fix SA01 for pandas.api.types.is_int64_dtype (#59862) --- ci/code_checks.sh | 1 - pandas/core/dtypes/common.py | 10 ++++++++++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 21104c2e00450..3d31781b886ab 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -117,7 +117,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.api.types.is_float PR01,SA01" \ -i "pandas.api.types.is_float_dtype SA01" \ -i "pandas.api.types.is_hashable PR01,RT03,SA01" \ - -i "pandas.api.types.is_int64_dtype SA01" \ -i "pandas.api.types.is_integer PR01,SA01" \ -i "pandas.api.types.is_interval_dtype SA01" \ -i "pandas.api.types.is_iterator PR07,SA01" \ diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py index ff855f97a352b..0252927241ef4 100644 --- a/pandas/core/dtypes/common.py +++ b/pandas/core/dtypes/common.py @@ -886,6 +886,16 @@ def is_int64_dtype(arr_or_dtype) -> bool: boolean Whether or not the array or dtype is of the int64 dtype. + See Also + -------- + api.types.is_float_dtype : Check whether the provided array or dtype is of a + float dtype. + api.types.is_bool_dtype : Check whether the provided array or dtype is of a + boolean dtype. + api.types.is_object_dtype : Check whether an array-like or dtype is of the + object dtype. + numpy.int64 : Numpy's 64-bit integer type. + Notes ----- Depending on system architecture, the return value of `is_int64_dtype( From b91be12f8854d87e0f1c6cf9e2db7a5e68983be1 Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Wed, 25 Sep 2024 23:16:03 +0530 Subject: [PATCH 02/33] DOC: fix SA01, ES01 for pandas.api.types.is_float_dtype (#59861) --- ci/code_checks.sh | 1 - pandas/core/dtypes/common.py | 12 ++++++++++++ 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 3d31781b886ab..119c6e2b33684 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -115,7 +115,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.api.types.is_dict_like PR07,SA01" \ -i "pandas.api.types.is_file_like PR07,SA01" \ -i "pandas.api.types.is_float PR01,SA01" \ - -i "pandas.api.types.is_float_dtype SA01" \ -i "pandas.api.types.is_hashable PR01,RT03,SA01" \ -i "pandas.api.types.is_integer PR01,SA01" \ -i "pandas.api.types.is_interval_dtype SA01" \ diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py index 0252927241ef4..48d2106aff124 100644 --- a/pandas/core/dtypes/common.py +++ b/pandas/core/dtypes/common.py @@ -1285,6 +1285,9 @@ def is_float_dtype(arr_or_dtype) -> bool: """ Check whether the provided array or dtype is of a float dtype. + The function checks for floating-point data types, which represent real numbers + that may have fractional components. + Parameters ---------- arr_or_dtype : array-like or dtype @@ -1295,6 +1298,15 @@ def is_float_dtype(arr_or_dtype) -> bool: boolean Whether or not the array or dtype is of a float dtype. + See Also + -------- + api.types.is_numeric_dtype : Check whether the provided array or dtype is of + a numeric dtype. + api.types.is_integer_dtype : Check whether the provided array or dtype is of + an integer dtype. + api.types.is_object_dtype : Check whether an array-like or dtype is of the + object dtype. 
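A minimal sketch of the two predicates documented above (illustrative only, not part of the patch; note that `is_int64_dtype` may emit a DeprecationWarning on recent pandas versions):

```python
import numpy as np
import pandas as pd
from pandas.api.types import is_float_dtype, is_int64_dtype

# Only 64-bit integer dtypes match; this helper may warn as deprecated.
print(is_int64_dtype(np.dtype("int64")))   # True
print(is_int64_dtype(np.dtype("int32")))   # False

# is_float_dtype matches any floating-point width.
print(is_float_dtype(np.dtype("float32")))       # True
print(is_float_dtype(pd.Series([1.0, np.nan])))  # True
print(is_float_dtype(pd.Series([1, 2])))         # False
```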
+ Examples -------- >>> from pandas.api.types import is_float_dtype From b81ed16389385ad1272e94d2796db31ce8ccbafd Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Wed, 25 Sep 2024 23:19:34 +0530 Subject: [PATCH 03/33] DOC: fix SA01, ES01 for pandas.Series.sparse.sp_values (#59859) --- ci/code_checks.sh | 1 - pandas/core/arrays/sparse/array.py | 12 ++++++++++++ 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 119c6e2b33684..42955a6476734 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -100,7 +100,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.Series.sparse.fill_value SA01" \ -i "pandas.Series.sparse.from_coo PR07,SA01" \ -i "pandas.Series.sparse.npoints SA01" \ - -i "pandas.Series.sparse.sp_values SA01" \ -i "pandas.Timedelta.max PR02" \ -i "pandas.Timedelta.min PR02" \ -i "pandas.Timedelta.resolution PR02" \ diff --git a/pandas/core/arrays/sparse/array.py b/pandas/core/arrays/sparse/array.py index a09dc20af3b36..40012357f40cd 100644 --- a/pandas/core/arrays/sparse/array.py +++ b/pandas/core/arrays/sparse/array.py @@ -603,6 +603,18 @@ def sp_values(self) -> np.ndarray: """ An ndarray containing the non- ``fill_value`` values. + This property returns the actual data values stored in the sparse + representation, excluding the values that are equal to the ``fill_value``. + The result is an ndarray of the underlying values, preserving the sparse + structure by omitting the default ``fill_value`` entries. + + See Also + -------- + Series.sparse.to_dense : Convert a Series from sparse values to dense. + Series.sparse.fill_value : Elements in `data` that are `fill_value` are + not stored. + Series.sparse.density : The percent of non- ``fill_value`` points, as decimal. + Examples -------- >>> from pandas.arrays import SparseArray From 7cebd7822ba0598f53fdd6dd8141c66b949c9023 Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Wed, 25 Sep 2024 23:22:11 +0530 Subject: [PATCH 04/33] DOC: fix SA01 for pandas.Series.sparse.fill_value (#59858) --- ci/code_checks.sh | 1 - pandas/core/arrays/sparse/array.py | 6 ++++++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 42955a6476734..e0d6efa0278e4 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -97,7 +97,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.Series.dt.tz_localize PR01,PR02" \ -i "pandas.Series.dt.unit GL08" \ -i "pandas.Series.pad PR01,SA01" \ - -i "pandas.Series.sparse.fill_value SA01" \ -i "pandas.Series.sparse.from_coo PR07,SA01" \ -i "pandas.Series.sparse.npoints SA01" \ -i "pandas.Timedelta.max PR02" \ diff --git a/pandas/core/arrays/sparse/array.py b/pandas/core/arrays/sparse/array.py index 40012357f40cd..c8ec4068ca199 100644 --- a/pandas/core/arrays/sparse/array.py +++ b/pandas/core/arrays/sparse/array.py @@ -635,6 +635,12 @@ def fill_value(self): For memory savings, this should be the most common value in the array. + See Also + -------- + SparseDtype : Dtype for data stored in :class:`SparseArray`. + Series.value_counts : Return a Series containing counts of unique values. + Series.fillna : Fill NA/NaN in a Series with a specified value. 
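The three sparse accessors cross-referenced above can be exercised with the docstring's own example data (an illustrative sketch, not part of the patch):

```python
import pandas as pd

ser = pd.Series([0, 0, 2, 2, 2], dtype="Sparse[int]")

print(ser.sparse.sp_values)   # [2 2 2]: only values unequal to fill_value are stored
print(ser.sparse.fill_value)  # 0: the implicit, unstored value
print(ser.sparse.density)     # 0.6: fraction of points actually stored
```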
+ Examples -------- >>> ser = pd.Series([0, 0, 2, 2, 2], dtype="Sparse[int]") From 5b6997ca14187b31a87490b9e61e3af4cbdda6d7 Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Wed, 25 Sep 2024 23:24:42 +0530 Subject: [PATCH 05/33] DOC: fix SA01, ES01 for pandas.tseries.offsets.SemiMonthEnd (#59856) DOC: fix SA01 for pandas.tseries.offsets.SemiMonthEnd --- ci/code_checks.sh | 1 - pandas/_libs/tslibs/offsets.pyx | 12 ++++++++++++ 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index e0d6efa0278e4..7cc314007aabd 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -364,7 +364,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.tseries.offsets.SemiMonthBegin.n GL08" \ -i "pandas.tseries.offsets.SemiMonthBegin.normalize GL08" \ -i "pandas.tseries.offsets.SemiMonthBegin.rule_code GL08" \ - -i "pandas.tseries.offsets.SemiMonthEnd SA01" \ -i "pandas.tseries.offsets.SemiMonthEnd.day_of_month GL08" \ -i "pandas.tseries.offsets.SemiMonthEnd.is_on_offset GL08" \ -i "pandas.tseries.offsets.SemiMonthEnd.n GL08" \ diff --git a/pandas/_libs/tslibs/offsets.pyx b/pandas/_libs/tslibs/offsets.pyx index 4fa1af0ec882c..4db96fbaa3aad 100644 --- a/pandas/_libs/tslibs/offsets.pyx +++ b/pandas/_libs/tslibs/offsets.pyx @@ -3316,6 +3316,11 @@ cdef class SemiMonthEnd(SemiMonthOffset): """ Two DateOffset's per month repeating on the last day of the month & day_of_month. + This offset allows for flexibility in generating date ranges or adjusting dates + to the end of a month or a specific day in the month, such as the 15th or the last + day of the month. It is useful for financial or scheduling applications where + events occur bi-monthly. + Attributes ---------- n : int, default 1 @@ -3325,6 +3330,13 @@ cdef class SemiMonthEnd(SemiMonthOffset): day_of_month : int, {1, 3,...,27}, default 15 A specific integer for the day of the month. + See Also + -------- + tseries.offsets.SemiMonthBegin : Offset for semi-monthly frequencies, starting at + the beginning of the month. + tseries.offsets.MonthEnd : Offset to the last calendar day of the month. + tseries.offsets.MonthBegin : Offset to the first calendar day of the month. + Examples -------- >>> ts = pd.Timestamp(2022, 1, 14) From a9e30c5f62d080aea7629ca17cf1e9c0e8c3e080 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 25 Sep 2024 19:57:49 +0200 Subject: [PATCH 06/33] String dtype: map builtin str alias to StringDtype (#59685) * String dtype: map builtin str alias to StringDtype * fix tests * fix datetimelike astype and more tests * remove xfails * try fix typing * fix copy_view tests * fix remaining tests with infer_string enabled * ignore typing issue for now * move to common.py * simplify Categorical._str_get_dummies * small cleanup * fix ensure_string_array to not modify extension arrays inplace * fix ensure_string_array once more + fix is_extension_array_dtype for str * still xfail TestArrowArray::test_astype_str when not using infer_string * ensure maybe_convert_objects copies object dtype input array when inferring StringDtype * update test_1d_object_array_does_not_copy test * update constructor copy test + do not copy in maybe_convert_objects? 
* skip str.get_dummies test for now * use pandas_dtype() instead of registry.find * fix corner cases for calling pandas_dtype * add TODO comment in ensure_string_array --- pandas/_libs/lib.pyx | 9 +++- pandas/_testing/__init__.py | 2 +- pandas/core/arrays/categorical.py | 4 +- pandas/core/arrays/datetimelike.py | 10 ++++- pandas/core/dtypes/common.py | 18 +++++++- pandas/core/indexes/base.py | 6 ++- pandas/core/indexes/interval.py | 3 +- pandas/tests/arrays/floating/test_astype.py | 6 +-- pandas/tests/arrays/integer/test_dtypes.py | 6 +-- pandas/tests/arrays/sparse/test_astype.py | 4 +- pandas/tests/arrays/sparse/test_dtype.py | 2 +- pandas/tests/dtypes/test_common.py | 12 ++++++ pandas/tests/extension/base/casting.py | 4 +- pandas/tests/extension/json/array.py | 3 +- pandas/tests/extension/test_arrow.py | 29 +++---------- pandas/tests/frame/methods/test_astype.py | 17 ++++---- .../tests/frame/methods/test_select_dtypes.py | 5 ++- pandas/tests/frame/test_constructors.py | 41 +++++++++++++++---- .../indexes/datetimes/methods/test_astype.py | 15 ++++--- pandas/tests/indexes/object/test_astype.py | 4 +- .../indexes/period/methods/test_astype.py | 9 +++- .../indexes/timedeltas/methods/test_astype.py | 9 +++- pandas/tests/interchange/test_impl.py | 1 + pandas/tests/io/excel/test_readers.py | 6 +-- .../io/parser/dtypes/test_dtypes_basic.py | 17 ++++---- pandas/tests/io/parser/test_na_values.py | 2 - .../io/parser/test_python_parser_only.py | 6 +-- pandas/tests/series/methods/test_astype.py | 30 ++++++++------ pandas/tests/series/methods/test_map.py | 4 +- pandas/tests/series/test_constructors.py | 2 +- pandas/tests/strings/test_get_dummies.py | 3 ++ pandas/tests/test_algos.py | 7 +++- 32 files changed, 185 insertions(+), 111 deletions(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 3f2dfbfb3b404..8af48a861967a 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -754,7 +754,14 @@ cpdef ndarray[object] ensure_string_array( if hasattr(arr, "to_numpy"): - if hasattr(arr, "dtype") and arr.dtype.kind in "mM": + if ( + hasattr(arr, "dtype") + and arr.dtype.kind in "mM" + # TODO: we should add a custom ArrowExtensionArray.astype implementation + # that handles astype(str) specifically, avoiding ending up here and + # then we can remove the below check for `_pa_array` (for ArrowEA) + and not hasattr(arr, "_pa_array") + ): # dtype check to exclude DataFrame # GH#41409 TODO: not a great place for this out = arr.astype(str).astype(object) diff --git a/pandas/_testing/__init__.py b/pandas/_testing/__init__.py index 5fa1a984b8aea..0be01da1816a2 100644 --- a/pandas/_testing/__init__.py +++ b/pandas/_testing/__init__.py @@ -108,7 +108,7 @@ COMPLEX_DTYPES: list[Dtype] = [complex, "complex64", "complex128"] if using_string_dtype(): - STRING_DTYPES: list[Dtype] = [str, "U"] + STRING_DTYPES: list[Dtype] = ["U"] else: STRING_DTYPES: list[Dtype] = [str, "str", "U"] # type: ignore[no-redef] COMPLEX_FLOAT_DTYPES: list[Dtype] = [*COMPLEX_DTYPES, *FLOAT_NUMPY_DTYPES] diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 8e0225b31e17b..a69e197df851d 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -2685,7 +2685,9 @@ def _str_get_dummies(self, sep: str = "|", dtype: NpDtype | None = None): # sep may not be in categories. Just bail on this. 
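The user-facing effect this patch aims for can be sketched as follows, assuming a development build that includes the change and exposes the `future.infer_string` option:

```python
import pandas as pd

with pd.option_context("future.infer_string", True):
    # The builtin str alias now resolves to the NaN-variant StringDtype...
    print(pd.api.types.pandas_dtype(str))

    # ...so astype(str) yields a string-dtyped result instead of object.
    ser = pd.Series([0.1, 0.2]).astype(str)
    print(ser.dtype)
```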
from pandas.core.arrays import NumpyExtensionArray - return NumpyExtensionArray(self.astype(str))._str_get_dummies(sep, dtype) + return NumpyExtensionArray(self.to_numpy(str, na_value="NaN"))._str_get_dummies( + sep, dtype + ) # ------------------------------------------------------------------------ # GroupBy Methods diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index fbe1677b95b33..7be8daa09c758 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -471,10 +471,16 @@ def astype(self, dtype, copy: bool = True): return self._box_values(self.asi8.ravel()).reshape(self.shape) + elif is_string_dtype(dtype): + if isinstance(dtype, ExtensionDtype): + arr_object = self._format_native_types(na_rep=dtype.na_value) # type: ignore[arg-type] + cls = dtype.construct_array_type() + return cls._from_sequence(arr_object, dtype=dtype, copy=False) + else: + return self._format_native_types() + elif isinstance(dtype, ExtensionDtype): return super().astype(dtype, copy=copy) - elif is_string_dtype(dtype): - return self._format_native_types() elif dtype.kind in "iu": # we deliberately ignore int32 vs. int64 here. # See https://github.com/pandas-dev/pandas/issues/24381 for more. diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py index 48d2106aff124..1a38bb03b2c1c 100644 --- a/pandas/core/dtypes/common.py +++ b/pandas/core/dtypes/common.py @@ -12,6 +12,8 @@ import numpy as np +from pandas._config import using_string_dtype + from pandas._libs import ( Interval, Period, @@ -1470,7 +1472,15 @@ def is_extension_array_dtype(arr_or_dtype) -> bool: elif isinstance(dtype, np.dtype): return False else: - return registry.find(dtype) is not None + try: + with warnings.catch_warnings(): + # pandas_dtype(..) can raise UserWarning for class input + warnings.simplefilter("ignore", UserWarning) + dtype = pandas_dtype(dtype) + except (TypeError, ValueError): + # np.dtype(..) 
can raise ValueError + return False + return isinstance(dtype, ExtensionDtype) def is_ea_or_datetimelike_dtype(dtype: DtypeObj | None) -> bool: @@ -1773,6 +1783,12 @@ def pandas_dtype(dtype) -> DtypeObj: elif isinstance(dtype, (np.dtype, ExtensionDtype)): return dtype + # builtin aliases + if dtype is str and using_string_dtype(): + from pandas.core.arrays.string_ import StringDtype + + return StringDtype(na_value=np.nan) + # registered extension types result = registry.find(dtype) if result is not None: diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 2346c20004210..852049804a4f5 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -6262,7 +6262,11 @@ def _should_compare(self, other: Index) -> bool: return False dtype = _unpack_nested_dtype(other) - return self._is_comparable_dtype(dtype) or is_object_dtype(dtype) + return ( + self._is_comparable_dtype(dtype) + or is_object_dtype(dtype) + or is_string_dtype(dtype) + ) def _is_comparable_dtype(self, dtype: DtypeObj) -> bool: """ diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py index 359cdf880937b..8feac890883eb 100644 --- a/pandas/core/indexes/interval.py +++ b/pandas/core/indexes/interval.py @@ -51,6 +51,7 @@ is_number, is_object_dtype, is_scalar, + is_string_dtype, pandas_dtype, ) from pandas.core.dtypes.dtypes import ( @@ -712,7 +713,7 @@ def _get_indexer( # left/right get_indexer, compare elementwise, equality -> match indexer = self._get_indexer_unique_sides(target) - elif not is_object_dtype(target.dtype): + elif not (is_object_dtype(target.dtype) or is_string_dtype(target.dtype)): # homogeneous scalar index: use IntervalTree # we should always have self._should_partial_index(target) here target = self._maybe_convert_i8(target) diff --git a/pandas/tests/arrays/floating/test_astype.py b/pandas/tests/arrays/floating/test_astype.py index ccf644b34051d..752ebe194ffcf 100644 --- a/pandas/tests/arrays/floating/test_astype.py +++ b/pandas/tests/arrays/floating/test_astype.py @@ -68,11 +68,9 @@ def test_astype_str(using_infer_string): if using_infer_string: expected = pd.array(["0.1", "0.2", None], dtype=pd.StringDtype(na_value=np.nan)) - tm.assert_extension_array_equal(a.astype("str"), expected) - # TODO(infer_string) this should also be a string array like above - expected = np.array(["0.1", "0.2", ""], dtype="U32") - tm.assert_numpy_array_equal(a.astype(str), expected) + tm.assert_extension_array_equal(a.astype(str), expected) + tm.assert_extension_array_equal(a.astype("str"), expected) else: expected = np.array(["0.1", "0.2", ""], dtype="U32") diff --git a/pandas/tests/arrays/integer/test_dtypes.py b/pandas/tests/arrays/integer/test_dtypes.py index fadd7ac67b58d..7972ba7b9fb0f 100644 --- a/pandas/tests/arrays/integer/test_dtypes.py +++ b/pandas/tests/arrays/integer/test_dtypes.py @@ -281,11 +281,9 @@ def test_astype_str(using_infer_string): if using_infer_string: expected = pd.array(["1", "2", None], dtype=pd.StringDtype(na_value=np.nan)) - tm.assert_extension_array_equal(a.astype("str"), expected) - # TODO(infer_string) this should also be a string array like above - expected = np.array(["1", "2", ""], dtype=f"{tm.ENDIAN}U21") - tm.assert_numpy_array_equal(a.astype(str), expected) + tm.assert_extension_array_equal(a.astype(str), expected) + tm.assert_extension_array_equal(a.astype("str"), expected) else: expected = np.array(["1", "2", ""], dtype=f"{tm.ENDIAN}U21") diff --git a/pandas/tests/arrays/sparse/test_astype.py 
b/pandas/tests/arrays/sparse/test_astype.py index 83a507e679d46..e6e4a11a0f5ab 100644 --- a/pandas/tests/arrays/sparse/test_astype.py +++ b/pandas/tests/arrays/sparse/test_astype.py @@ -81,8 +81,8 @@ def test_astype_all(self, any_real_numpy_dtype): ), ( SparseArray([0, 1, 10]), - str, - SparseArray(["0", "1", "10"], dtype=SparseDtype(str, "0")), + np.str_, + SparseArray(["0", "1", "10"], dtype=SparseDtype(np.str_, "0")), ), (SparseArray(["10", "20"]), float, SparseArray([10.0, 20.0])), ( diff --git a/pandas/tests/arrays/sparse/test_dtype.py b/pandas/tests/arrays/sparse/test_dtype.py index 1819744d9a9ae..6143163735ab8 100644 --- a/pandas/tests/arrays/sparse/test_dtype.py +++ b/pandas/tests/arrays/sparse/test_dtype.py @@ -184,7 +184,7 @@ def test_construct_from_string_fill_value_raises(string): [ (SparseDtype(int, 0), float, SparseDtype(float, 0.0)), (SparseDtype(int, 1), float, SparseDtype(float, 1.0)), - (SparseDtype(int, 1), str, SparseDtype(object, "1")), + (SparseDtype(int, 1), np.str_, SparseDtype(object, "1")), (SparseDtype(float, 1.5), int, SparseDtype(int, 1)), ], ) diff --git a/pandas/tests/dtypes/test_common.py b/pandas/tests/dtypes/test_common.py index 2c2dff7a957fe..e338fb1331734 100644 --- a/pandas/tests/dtypes/test_common.py +++ b/pandas/tests/dtypes/test_common.py @@ -810,11 +810,23 @@ def test_pandas_dtype_string_dtypes(string_storage): "pyarrow" if HAS_PYARROW else "python", na_value=np.nan ) + with pd.option_context("future.infer_string", True): + # with the default string_storage setting + result = pandas_dtype(str) + assert result == pd.StringDtype( + "pyarrow" if HAS_PYARROW else "python", na_value=np.nan + ) + with pd.option_context("future.infer_string", True): with pd.option_context("string_storage", string_storage): result = pandas_dtype("str") assert result == pd.StringDtype(string_storage, na_value=np.nan) + with pd.option_context("future.infer_string", True): + with pd.option_context("string_storage", string_storage): + result = pandas_dtype(str) + assert result == pd.StringDtype(string_storage, na_value=np.nan) + with pd.option_context("future.infer_string", False): with pd.option_context("string_storage", string_storage): result = pandas_dtype("str") diff --git a/pandas/tests/extension/base/casting.py b/pandas/tests/extension/base/casting.py index e924e38ee5030..8e3f21e1a4f56 100644 --- a/pandas/tests/extension/base/casting.py +++ b/pandas/tests/extension/base/casting.py @@ -44,8 +44,8 @@ def test_tolist(self, data): assert result == expected def test_astype_str(self, data): - result = pd.Series(data[:5]).astype(str) - expected = pd.Series([str(x) for x in data[:5]], dtype=str) + result = pd.Series(data[:2]).astype(str) + expected = pd.Series([str(x) for x in data[:2]], dtype=str) tm.assert_series_equal(result, expected) @pytest.mark.parametrize( diff --git a/pandas/tests/extension/json/array.py b/pandas/tests/extension/json/array.py index 3a4391edc99ef..4fa48023fbc95 100644 --- a/pandas/tests/extension/json/array.py +++ b/pandas/tests/extension/json/array.py @@ -208,9 +208,8 @@ def astype(self, dtype, copy=True): return self.copy() return self elif isinstance(dtype, StringDtype): - value = self.astype(str) # numpy doesn't like nested dicts arr_cls = dtype.construct_array_type() - return arr_cls._from_sequence(value, dtype=dtype, copy=False) + return arr_cls._from_sequence(self, dtype=dtype, copy=False) elif not copy: return np.asarray([dict(x) for x in self], dtype=dtype) else: diff --git a/pandas/tests/extension/test_arrow.py 
b/pandas/tests/extension/test_arrow.py index f86d927ddda67..f56094dfd47ca 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -43,7 +43,6 @@ pa_version_under13p0, pa_version_under14p0, ) -import pandas.util._test_decorators as td from pandas.core.dtypes.dtypes import ( ArrowDtype, @@ -292,7 +291,7 @@ def test_map(self, data_missing, na_action): expected = data_missing.to_numpy() tm.assert_numpy_array_equal(result, expected) - def test_astype_str(self, data, request): + def test_astype_str(self, data, request, using_infer_string): pa_dtype = data.dtype.pyarrow_dtype if pa.types.is_binary(pa_dtype): request.applymarker( @@ -300,9 +299,10 @@ def test_astype_str(self, data, request): reason=f"For {pa_dtype} .astype(str) decodes.", ) ) - elif ( - pa.types.is_timestamp(pa_dtype) and pa_dtype.tz is None - ) or pa.types.is_duration(pa_dtype): + elif not using_infer_string and ( + (pa.types.is_timestamp(pa_dtype) and pa_dtype.tz is None) + or pa.types.is_duration(pa_dtype) + ): request.applymarker( pytest.mark.xfail( reason="pd.Timestamp/pd.Timedelta repr different from numpy repr", @@ -310,25 +310,6 @@ def test_astype_str(self, data, request): ) super().test_astype_str(data) - @pytest.mark.parametrize( - "nullable_string_dtype", - [ - "string[python]", - pytest.param("string[pyarrow]", marks=td.skip_if_no("pyarrow")), - ], - ) - def test_astype_string(self, data, nullable_string_dtype, request): - pa_dtype = data.dtype.pyarrow_dtype - if ( - pa.types.is_timestamp(pa_dtype) and pa_dtype.tz is None - ) or pa.types.is_duration(pa_dtype): - request.applymarker( - pytest.mark.xfail( - reason="pd.Timestamp/pd.Timedelta repr different from numpy repr", - ) - ) - super().test_astype_string(data, nullable_string_dtype) - def test_from_dtype(self, data, request): pa_dtype = data.dtype.pyarrow_dtype if pa.types.is_string(pa_dtype) or pa.types.is_decimal(pa_dtype): diff --git a/pandas/tests/frame/methods/test_astype.py b/pandas/tests/frame/methods/test_astype.py index 8647df0e8ad96..ab3743283ea13 100644 --- a/pandas/tests/frame/methods/test_astype.py +++ b/pandas/tests/frame/methods/test_astype.py @@ -168,21 +168,21 @@ def test_astype_str(self): "d": list(map(str, d._values)), "e": list(map(str, e._values)), }, - dtype="object", + dtype="str", ) tm.assert_frame_equal(result, expected) - def test_astype_str_float(self): + def test_astype_str_float(self, using_infer_string): # see GH#11302 result = DataFrame([np.nan]).astype(str) - expected = DataFrame(["nan"], dtype="object") + expected = DataFrame([np.nan if using_infer_string else "nan"], dtype="str") tm.assert_frame_equal(result, expected) result = DataFrame([1.12345678901234567890]).astype(str) val = "1.1234567890123457" - expected = DataFrame([val], dtype="object") + expected = DataFrame([val], dtype="str") tm.assert_frame_equal(result, expected) @pytest.mark.parametrize("dtype_class", [dict, Series]) @@ -284,7 +284,7 @@ def test_astype_duplicate_col_series_arg(self): result = df.astype(dtypes) expected = DataFrame( { - 0: Series(vals[:, 0].astype(str), dtype=object), + 0: Series(vals[:, 0].astype(str), dtype="str"), 1: vals[:, 1], 2: pd.array(vals[:, 2], dtype="Float64"), 3: vals[:, 3], @@ -647,9 +647,10 @@ def test_astype_dt64tz(self, timezone_frame): # dt64tz->dt64 deprecated timezone_frame.astype("datetime64[ns]") - def test_astype_dt64tz_to_str(self, timezone_frame): + def test_astype_dt64tz_to_str(self, timezone_frame, using_infer_string): # str formatting result = timezone_frame.astype(str) + na_value = 
np.nan if using_infer_string else "NaT" expected = DataFrame( [ [ @@ -657,7 +658,7 @@ def test_astype_dt64tz_to_str(self, timezone_frame): "2013-01-01 00:00:00-05:00", "2013-01-01 00:00:00+01:00", ], - ["2013-01-02", "NaT", "NaT"], + ["2013-01-02", na_value, na_value], [ "2013-01-03", "2013-01-03 00:00:00-05:00", @@ -665,7 +666,7 @@ def test_astype_dt64tz_to_str(self, timezone_frame): ], ], columns=timezone_frame.columns, - dtype="object", + dtype="str", ) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/frame/methods/test_select_dtypes.py b/pandas/tests/frame/methods/test_select_dtypes.py index 875dca321635f..0354e9df3d168 100644 --- a/pandas/tests/frame/methods/test_select_dtypes.py +++ b/pandas/tests/frame/methods/test_select_dtypes.py @@ -99,6 +99,9 @@ def test_select_dtypes_include_using_list_like(self, using_infer_string): ei = df[["a"]] tm.assert_frame_equal(ri, ei) + ri = df.select_dtypes(include=[str]) + tm.assert_frame_equal(ri, ei) + def test_select_dtypes_exclude_using_list_like(self): df = DataFrame( { @@ -358,7 +361,7 @@ def test_select_dtypes_datetime_with_tz(self): @pytest.mark.parametrize("dtype", [str, "str", np.bytes_, "S1", np.str_, "U1"]) @pytest.mark.parametrize("arg", ["include", "exclude"]) def test_select_dtypes_str_raises(self, dtype, arg, using_infer_string): - if using_infer_string and dtype == "str": + if using_infer_string and (dtype == "str" or dtype is str): # this is tested below pytest.skip("Selecting string columns works with future strings") df = DataFrame( diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index 3d46e03547c38..0a924aa393be5 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -24,7 +24,6 @@ from pandas._config import using_string_dtype from pandas._libs import lib -from pandas.compat import HAS_PYARROW from pandas.compat.numpy import np_version_gt2 from pandas.errors import IntCastingNaNError @@ -82,7 +81,7 @@ def test_constructor_from_ndarray_with_str_dtype(self): # with an array of strings each of which is e.g. 
"[0 1 2]" arr = np.arange(12).reshape(4, 3) df = DataFrame(arr, dtype=str) - expected = DataFrame(arr.astype(str), dtype=object) + expected = DataFrame(arr.astype(str), dtype="str") tm.assert_frame_equal(df, expected) def test_constructor_from_2d_datetimearray(self): @@ -300,18 +299,38 @@ def test_constructor_dtype_nocast_view_2d_array(self): df2 = DataFrame(df.values, dtype=df[0].dtype) assert df2._mgr.blocks[0].values.flags.c_contiguous - @pytest.mark.xfail(using_string_dtype() and HAS_PYARROW, reason="conversion copies") - def test_1d_object_array_does_not_copy(self): + def test_1d_object_array_does_not_copy(self, using_infer_string): # https://github.com/pandas-dev/pandas/issues/39272 arr = np.array(["a", "b"], dtype="object") df = DataFrame(arr, copy=False) + if using_infer_string: + if df[0].dtype.storage == "pyarrow": + # object dtype strings are converted to arrow memory, + # no numpy arrays to compare + pass + else: + assert np.shares_memory(df[0].to_numpy(), arr) + else: + assert np.shares_memory(df.values, arr) + + df = DataFrame(arr, dtype=object, copy=False) assert np.shares_memory(df.values, arr) - @pytest.mark.xfail(using_string_dtype(), reason="conversion copies") - def test_2d_object_array_does_not_copy(self): + def test_2d_object_array_does_not_copy(self, using_infer_string): # https://github.com/pandas-dev/pandas/issues/39272 arr = np.array([["a", "b"], ["c", "d"]], dtype="object") df = DataFrame(arr, copy=False) + if using_infer_string: + if df[0].dtype.storage == "pyarrow": + # object dtype strings are converted to arrow memory, + # no numpy arrays to compare + pass + else: + assert np.shares_memory(df[0].to_numpy(), arr) + else: + assert np.shares_memory(df.values, arr) + + df = DataFrame(arr, dtype=object, copy=False) assert np.shares_memory(df.values, arr) def test_constructor_dtype_list_data(self): @@ -1766,12 +1785,18 @@ def test_constructor_column_duplicates(self): tm.assert_frame_equal(idf, edf) - def test_constructor_empty_with_string_dtype(self): + def test_constructor_empty_with_string_dtype(self, using_infer_string): # GH 9428 expected = DataFrame(index=[0, 1], columns=[0, 1], dtype=object) + expected_str = DataFrame( + index=[0, 1], columns=[0, 1], dtype=pd.StringDtype(na_value=np.nan) + ) df = DataFrame(index=[0, 1], columns=[0, 1], dtype=str) - tm.assert_frame_equal(df, expected) + if using_infer_string: + tm.assert_frame_equal(df, expected_str) + else: + tm.assert_frame_equal(df, expected) df = DataFrame(index=[0, 1], columns=[0, 1], dtype=np.str_) tm.assert_frame_equal(df, expected) df = DataFrame(index=[0, 1], columns=[0, 1], dtype="U5") diff --git a/pandas/tests/indexes/datetimes/methods/test_astype.py b/pandas/tests/indexes/datetimes/methods/test_astype.py index 81dc3b3ecc45e..62be8903da206 100644 --- a/pandas/tests/indexes/datetimes/methods/test_astype.py +++ b/pandas/tests/indexes/datetimes/methods/test_astype.py @@ -101,13 +101,16 @@ def test_astype_tznaive_to_tzaware(self): # dt64->dt64tz deprecated idx._data.astype("datetime64[ns, US/Eastern]") - def test_astype_str_nat(self): + def test_astype_str_nat(self, using_infer_string): # GH 13149, GH 13209 # verify that we are returning NaT as a string (and not unicode) idx = DatetimeIndex(["2016-05-16", "NaT", NaT, np.nan]) result = idx.astype(str) - expected = Index(["2016-05-16", "NaT", "NaT", "NaT"], dtype=object) + if using_infer_string: + expected = Index(["2016-05-16", None, None, None], dtype="str") + else: + expected = Index(["2016-05-16", "NaT", "NaT", "NaT"], dtype=object) 
tm.assert_index_equal(result, expected) def test_astype_str(self): @@ -117,7 +120,7 @@ def test_astype_str(self): expected = Index( ["2012-01-01", "2012-01-02", "2012-01-03", "2012-01-04"], name="test_name", - dtype=object, + dtype="str", ) tm.assert_index_equal(result, expected) @@ -132,7 +135,7 @@ def test_astype_str_tz_and_name(self): "2012-01-03 00:00:00-05:00", ], name="test_name", - dtype=object, + dtype="str", ) tm.assert_index_equal(result, expected) @@ -143,7 +146,7 @@ def test_astype_str_freq_and_name(self): expected = Index( ["2011-01-01 00:00:00", "2011-01-01 01:00:00", "2011-01-01 02:00:00"], name="test_name", - dtype=object, + dtype="str", ) tm.assert_index_equal(result, expected) @@ -155,7 +158,7 @@ def test_astype_str_freq_and_tz(self): result = dti.astype(str) expected = Index( ["2012-03-06 00:00:00+00:00", "2012-03-06 01:00:00+00:00"], - dtype=object, + dtype="str", name="test_name", ) tm.assert_index_equal(result, expected) diff --git a/pandas/tests/indexes/object/test_astype.py b/pandas/tests/indexes/object/test_astype.py index 9c1ef302c5b51..ce05b5e9f2238 100644 --- a/pandas/tests/indexes/object/test_astype.py +++ b/pandas/tests/indexes/object/test_astype.py @@ -15,12 +15,12 @@ def test_astype_str_from_bytes(): # ensure_string_array which does f"{val}" idx = Index(["あ", b"a"], dtype="object") result = idx.astype(str) - expected = Index(["あ", "a"], dtype="object") + expected = Index(["あ", "a"], dtype="str") tm.assert_index_equal(result, expected) # while we're here, check that Series.astype behaves the same result = Series(idx).astype(str) - expected = Series(expected, dtype=object) + expected = Series(expected, dtype="str") tm.assert_series_equal(result, expected) diff --git a/pandas/tests/indexes/period/methods/test_astype.py b/pandas/tests/indexes/period/methods/test_astype.py index d545bfd2fae0f..af3c2667f51b4 100644 --- a/pandas/tests/indexes/period/methods/test_astype.py +++ b/pandas/tests/indexes/period/methods/test_astype.py @@ -22,7 +22,7 @@ def test_astype_raises(self, dtype): with pytest.raises(TypeError, match=msg): idx.astype(dtype) - def test_astype_conversion(self): + def test_astype_conversion(self, using_infer_string): # GH#13149, GH#13209 idx = PeriodIndex(["2016-05-16", "NaT", NaT, np.nan], freq="D", name="idx") @@ -41,7 +41,12 @@ def test_astype_conversion(self): tm.assert_index_equal(result, expected) result = idx.astype(str) - expected = Index([str(x) for x in idx], name="idx", dtype=object) + if using_infer_string: + expected = Index( + [str(x) if x is not NaT else None for x in idx], name="idx", dtype="str" + ) + else: + expected = Index([str(x) for x in idx], name="idx", dtype=object) tm.assert_index_equal(result, expected) idx = period_range("1990", "2009", freq="Y", name="idx") diff --git a/pandas/tests/indexes/timedeltas/methods/test_astype.py b/pandas/tests/indexes/timedeltas/methods/test_astype.py index 311f2b5c9aa59..5166cadae499e 100644 --- a/pandas/tests/indexes/timedeltas/methods/test_astype.py +++ b/pandas/tests/indexes/timedeltas/methods/test_astype.py @@ -44,7 +44,7 @@ def test_astype_object_with_nat(self): tm.assert_index_equal(result, expected) assert idx.tolist() == expected_list - def test_astype(self): + def test_astype(self, using_infer_string): # GH 13149, GH 13209 idx = TimedeltaIndex([1e14, "NaT", NaT, np.nan], name="idx") @@ -61,7 +61,12 @@ def test_astype(self): tm.assert_index_equal(result, expected) result = idx.astype(str) - expected = Index([str(x) for x in idx], name="idx", dtype=object) + if using_infer_string: + 
expected = Index( + [str(x) if x is not NaT else None for x in idx], name="idx", dtype="str" + ) + else: + expected = Index([str(x) for x in idx], name="idx", dtype=object) tm.assert_index_equal(result, expected) rng = timedelta_range("1 days", periods=10) diff --git a/pandas/tests/interchange/test_impl.py b/pandas/tests/interchange/test_impl.py index 38961345dc1f2..29ce9d0c03111 100644 --- a/pandas/tests/interchange/test_impl.py +++ b/pandas/tests/interchange/test_impl.py @@ -401,6 +401,7 @@ def test_interchange_from_corrected_buffer_dtypes(monkeypatch) -> None: pd.api.interchange.from_dataframe(df) +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_empty_string_column(): # https://github.com/pandas-dev/pandas/issues/56703 df = pd.DataFrame({"a": []}, dtype=str) diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py index b831ec3bb2c6a..3989e022dbbd2 100644 --- a/pandas/tests/io/excel/test_readers.py +++ b/pandas/tests/io/excel/test_readers.py @@ -587,7 +587,7 @@ def test_reader_dtype(self, read_ext): expected["a"] = expected["a"].astype("float64") expected["b"] = expected["b"].astype("float32") - expected["c"] = Series(["001", "002", "003", "004"], dtype=object) + expected["c"] = Series(["001", "002", "003", "004"], dtype="str") tm.assert_frame_equal(actual, expected) msg = "Unable to convert column d to type int64" @@ -611,8 +611,8 @@ def test_reader_dtype(self, read_ext): { "a": Series([1, 2, 3, 4], dtype="float64"), "b": Series([2.5, 3.5, 4.5, 5.5], dtype="float32"), - "c": Series(["001", "002", "003", "004"], dtype=object), - "d": Series(["1", "2", np.nan, "4"], dtype=object), + "c": Series(["001", "002", "003", "004"], dtype="str"), + "d": Series(["1", "2", np.nan, "4"], dtype="str"), }, ), ], diff --git a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py index b664423364f6b..e02562ac8d93d 100644 --- a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py +++ b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py @@ -31,7 +31,7 @@ @pytest.mark.parametrize("dtype", [str, object]) @pytest.mark.parametrize("check_orig", [True, False]) @pytest.mark.usefixtures("pyarrow_xfail") -def test_dtype_all_columns(all_parsers, dtype, check_orig): +def test_dtype_all_columns(all_parsers, dtype, check_orig, using_infer_string): # see gh-3795, gh-6607 parser = all_parsers @@ -49,8 +49,10 @@ def test_dtype_all_columns(all_parsers, dtype, check_orig): if check_orig: expected = df.copy() result = result.astype(float) - else: + elif using_infer_string and dtype is str: expected = df.astype(str) + else: + expected = df.astype(str).astype(object) tm.assert_frame_equal(result, expected) @@ -300,7 +302,6 @@ def test_true_values_cast_to_bool(all_parsers): tm.assert_frame_equal(result, expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @pytest.mark.usefixtures("pyarrow_xfail") @pytest.mark.parametrize("dtypes, exp_value", [({}, "1"), ({"a.1": "int64"}, 1)]) def test_dtype_mangle_dup_cols(all_parsers, dtypes, exp_value): @@ -316,7 +317,6 @@ def test_dtype_mangle_dup_cols(all_parsers, dtypes, exp_value): tm.assert_frame_equal(result, expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @pytest.mark.usefixtures("pyarrow_xfail") def test_dtype_mangle_dup_cols_single_dtype(all_parsers): # GH#42022 @@ -565,7 +565,7 @@ def test_string_inference(all_parsers): @pytest.mark.parametrize("dtype", ["O", object, "object", np.object_, str, np.str_]) -def 
test_string_inference_object_dtype(all_parsers, dtype): +def test_string_inference_object_dtype(all_parsers, dtype, using_infer_string): # GH#56047 data = """a,b x,a @@ -575,10 +575,11 @@ def test_string_inference_object_dtype(all_parsers, dtype): with pd.option_context("future.infer_string", True): result = parser.read_csv(StringIO(data), dtype=dtype) + expected_dtype = pd.StringDtype(na_value=np.nan) if dtype is str else object expected = DataFrame( { - "a": pd.Series(["x", "y", "z"], dtype=object), - "b": pd.Series(["a", "a", "a"], dtype=object), + "a": pd.Series(["x", "y", "z"], dtype=expected_dtype), + "b": pd.Series(["a", "a", "a"], dtype=expected_dtype), }, columns=pd.Index(["a", "b"], dtype=pd.StringDtype(na_value=np.nan)), ) @@ -589,7 +590,7 @@ def test_string_inference_object_dtype(all_parsers, dtype): expected = DataFrame( { - "a": pd.Series(["x", "y", "z"], dtype=object), + "a": pd.Series(["x", "y", "z"], dtype=expected_dtype), "b": pd.Series(["a", "a", "a"], dtype=pd.StringDtype(na_value=np.nan)), }, columns=pd.Index(["a", "b"], dtype=pd.StringDtype(na_value=np.nan)), diff --git a/pandas/tests/io/parser/test_na_values.py b/pandas/tests/io/parser/test_na_values.py index b612e60c959b1..89645b526f2ee 100644 --- a/pandas/tests/io/parser/test_na_values.py +++ b/pandas/tests/io/parser/test_na_values.py @@ -667,7 +667,6 @@ def test_inf_na_values_with_int_index(all_parsers): tm.assert_frame_equal(out, expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @xfail_pyarrow # mismatched shape @pytest.mark.parametrize("na_filter", [True, False]) def test_na_values_with_dtype_str_and_na_filter(all_parsers, na_filter): @@ -719,7 +718,6 @@ def test_cast_NA_to_bool_raises_error(all_parsers, data, na_values): # TODO: this test isn't about the na_values keyword, it is about the empty entries # being returned with NaN entries, whereas the pyarrow engine returns "nan" @xfail_pyarrow # mismatched shapes -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_str_nan_dropped(all_parsers): # see gh-21131 parser = all_parsers diff --git a/pandas/tests/io/parser/test_python_parser_only.py b/pandas/tests/io/parser/test_python_parser_only.py index 26480010fc687..a5bb151e84f47 100644 --- a/pandas/tests/io/parser/test_python_parser_only.py +++ b/pandas/tests/io/parser/test_python_parser_only.py @@ -18,8 +18,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas.errors import ( ParserError, ParserWarning, @@ -499,7 +497,6 @@ def test_header_int_do_not_infer_multiindex_names_on_different_line(python_parse tm.assert_frame_equal(result, expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @pytest.mark.parametrize( "dtype", [{"a": object}, {"a": str, "b": np.int64, "c": np.int64}] ) @@ -524,10 +521,11 @@ def test_no_thousand_convert_with_dot_for_non_numeric_cols(python_parser_only, d "c": [0, 4000, 131], } ) + if dtype["a"] == object: + expected["a"] = expected["a"].astype(object) tm.assert_frame_equal(result, expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") @pytest.mark.parametrize( "dtype,expected", [ diff --git a/pandas/tests/series/methods/test_astype.py b/pandas/tests/series/methods/test_astype.py index 579d41f964df0..4a7e204ee4161 100644 --- a/pandas/tests/series/methods/test_astype.py +++ b/pandas/tests/series/methods/test_astype.py @@ -76,7 +76,7 @@ def test_astype_dict_like(self, dtype_class): dt1 = dtype_class({"abc": str}) result = ser.astype(dt1) - 
expected = Series(["0", "2", "4", "6", "8"], name="abc", dtype=object) + expected = Series(["0", "2", "4", "6", "8"], name="abc", dtype="str") tm.assert_series_equal(result, expected) dt2 = dtype_class({"abc": "float64"}) @@ -173,10 +173,14 @@ def test_astype_empty_constructor_equality(self, dtype): def test_astype_str_map(self, dtype, data, using_infer_string): # see GH#4405 series = Series(data) + using_string_dtype = using_infer_string and dtype is str result = series.astype(dtype) - expected = series.map(str) - if using_infer_string: - expected = expected.astype(object) + if using_string_dtype: + expected = series.map(lambda val: str(val) if val is not np.nan else np.nan) + else: + expected = series.map(str) + if using_infer_string: + expected = expected.astype(object) tm.assert_series_equal(result, expected) def test_astype_float_to_period(self): @@ -213,7 +217,7 @@ def test_astype_dt64_to_str(self): # GH#10442 : testing astype(str) is correct for Series/DatetimeIndex dti = date_range("2012-01-01", periods=3) result = Series(dti).astype(str) - expected = Series(["2012-01-01", "2012-01-02", "2012-01-03"], dtype=object) + expected = Series(["2012-01-01", "2012-01-02", "2012-01-03"], dtype="str") tm.assert_series_equal(result, expected) def test_astype_dt64tz_to_str(self): @@ -226,7 +230,7 @@ def test_astype_dt64tz_to_str(self): "2012-01-02 00:00:00-05:00", "2012-01-03 00:00:00-05:00", ], - dtype=object, + dtype="str", ) tm.assert_series_equal(result, expected) @@ -286,13 +290,13 @@ def test_astype_str_cast_dt64(self): ts = Series([Timestamp("2010-01-04 00:00:00")]) res = ts.astype(str) - expected = Series(["2010-01-04"], dtype=object) + expected = Series(["2010-01-04"], dtype="str") tm.assert_series_equal(res, expected) ts = Series([Timestamp("2010-01-04 00:00:00", tz="US/Eastern")]) res = ts.astype(str) - expected = Series(["2010-01-04 00:00:00-05:00"], dtype=object) + expected = Series(["2010-01-04 00:00:00-05:00"], dtype="str") tm.assert_series_equal(res, expected) def test_astype_str_cast_td64(self): @@ -301,7 +305,7 @@ def test_astype_str_cast_td64(self): td = Series([Timedelta(1, unit="D")]) ser = td.astype(str) - expected = Series(["1 days"], dtype=object) + expected = Series(["1 days"], dtype="str") tm.assert_series_equal(ser, expected) def test_dt64_series_astype_object(self): @@ -347,7 +351,7 @@ def test_astype_from_float_to_str(self, any_float_dtype): # https://github.com/pandas-dev/pandas/issues/36451 ser = Series([0.1], dtype=any_float_dtype) result = ser.astype(str) - expected = Series(["0.1"], dtype=object) + expected = Series(["0.1"], dtype="str") tm.assert_series_equal(result, expected) @pytest.mark.parametrize( @@ -358,11 +362,13 @@ def test_astype_from_float_to_str(self, any_float_dtype): (NA, ""), ], ) - def test_astype_to_str_preserves_na(self, value, string_value): + def test_astype_to_str_preserves_na(self, value, string_value, using_infer_string): # https://github.com/pandas-dev/pandas/issues/36904 ser = Series(["a", "b", value], dtype=object) result = ser.astype(str) - expected = Series(["a", "b", string_value], dtype=object) + expected = Series( + ["a", "b", None if using_infer_string else string_value], dtype="str" + ) tm.assert_series_equal(result, expected) @pytest.mark.parametrize("dtype", ["float32", "float64", "int64", "int32"]) diff --git a/pandas/tests/series/methods/test_map.py b/pandas/tests/series/methods/test_map.py index fe84ffafa70b4..7fa8686fcc6c8 100644 --- a/pandas/tests/series/methods/test_map.py +++ b/pandas/tests/series/methods/test_map.py 
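The same contract holds for object-dtype Series containing NA values, per `test_astype_to_str_preserves_na` above (again an illustrative sketch assuming a patched build):

```python
import numpy as np
import pandas as pd

ser = pd.Series(["a", "b", np.nan], dtype=object)

with pd.option_context("future.infer_string", True):
    # The cast keeps NaN missing rather than producing the string "nan".
    print(ser.astype(str))
```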
@@ -549,13 +549,11 @@ def f(x): (list(range(3)), {0: 42}, [42] + [np.nan] * 3), ], ) -def test_map_missing_mixed(vals, mapping, exp, using_infer_string): +def test_map_missing_mixed(vals, mapping, exp): # GH20495 s = Series(vals + [np.nan]) result = s.map(mapping) exp = Series(exp) - if using_infer_string and mapping == {np.nan: "not NaN"}: - exp.iloc[-1] = np.nan tm.assert_series_equal(result, exp) diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py index 1771a4dfdb71f..69f42b5e42878 100644 --- a/pandas/tests/series/test_constructors.py +++ b/pandas/tests/series/test_constructors.py @@ -229,7 +229,7 @@ def test_constructor_empty(self, input_class, using_infer_string): # GH 19853 : with empty string, index and dtype str empty = Series("", dtype=str, index=range(3)) if using_infer_string: - empty2 = Series("", index=range(3), dtype=object) + empty2 = Series("", index=range(3), dtype="str") else: empty2 = Series("", index=range(3)) tm.assert_series_equal(empty, empty2) diff --git a/pandas/tests/strings/test_get_dummies.py b/pandas/tests/strings/test_get_dummies.py index 0656f505dc745..3b989e284ca25 100644 --- a/pandas/tests/strings/test_get_dummies.py +++ b/pandas/tests/strings/test_get_dummies.py @@ -1,6 +1,8 @@ import numpy as np import pytest +from pandas._config import using_string_dtype + import pandas.util._test_decorators as td from pandas import ( @@ -96,6 +98,7 @@ def test_get_dummies_with_pyarrow_dtype(any_string_dtype, dtype): # GH#47872 +@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_get_dummies_with_str_dtype(any_string_dtype): s = Series(["a|b", "a|c", np.nan], dtype=any_string_dtype) result = s.str.get_dummies("|", dtype=str) diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index 06fd81ed722d9..dac74a0e32a42 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -1877,13 +1877,16 @@ def test_strobj_mode(self): tm.assert_series_equal(ser.mode(), exp) @pytest.mark.parametrize("dt", [str, object]) - def test_strobj_multi_char(self, dt): + def test_strobj_multi_char(self, dt, using_infer_string): exp = ["bar"] data = ["foo"] * 2 + ["bar"] * 3 ser = Series(data, dtype=dt) exp = Series(exp, dtype=dt) - tm.assert_numpy_array_equal(algos.mode(ser.values), exp.values) + if using_infer_string and dt is str: + tm.assert_extension_array_equal(algos.mode(ser.values), exp.values) + else: + tm.assert_numpy_array_equal(algos.mode(ser.values), exp.values) tm.assert_series_equal(ser.mode(), exp) def test_datelike_mode(self): From 0962007726634e55f75150db82aadb754bea9752 Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Wed, 25 Sep 2024 23:29:16 +0530 Subject: [PATCH 07/33] DOC: fix SA01 for pandas.api.types.is_interval_dtype (#59863) --- ci/code_checks.sh | 1 - pandas/core/dtypes/common.py | 9 +++++++++ 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 7cc314007aabd..a436acd01013b 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -115,7 +115,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.api.types.is_float PR01,SA01" \ -i "pandas.api.types.is_hashable PR01,RT03,SA01" \ -i "pandas.api.types.is_integer PR01,SA01" \ - -i "pandas.api.types.is_interval_dtype SA01" \ -i "pandas.api.types.is_iterator PR07,SA01" \ -i "pandas.api.types.is_list_like SA01" \ -i "pandas.api.types.is_named_tuple PR07,SA01" \ diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py index 
1a38bb03b2c1c..1093b35afa8a0 100644 --- a/pandas/core/dtypes/common.py +++ b/pandas/core/dtypes/common.py @@ -481,6 +481,15 @@ def is_interval_dtype(arr_or_dtype) -> bool: boolean Whether or not the array-like or dtype is of the Interval dtype. + See Also + -------- + api.types.is_object_dtype : Check whether an array-like or dtype is of the + object dtype. + api.types.is_numeric_dtype : Check whether the provided array or dtype is + of a numeric dtype. + api.types.is_categorical_dtype : Check whether an array-like or dtype is of + the Categorical dtype. + Examples -------- >>> from pandas.core.dtypes.common import is_interval_dtype From ffb3c1523747738369bd27d5cdb924ee6884100d Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Wed, 25 Sep 2024 23:30:08 +0530 Subject: [PATCH 08/33] DOC: fix SA01 for pandas.api.types.is_list_like (#59864) --- ci/code_checks.sh | 1 - pandas/_libs/lib.pyx | 6 ++++++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index a436acd01013b..dd1b441b51772 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -116,7 +116,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.api.types.is_hashable PR01,RT03,SA01" \ -i "pandas.api.types.is_integer PR01,SA01" \ -i "pandas.api.types.is_iterator PR07,SA01" \ - -i "pandas.api.types.is_list_like SA01" \ -i "pandas.api.types.is_named_tuple PR07,SA01" \ -i "pandas.api.types.is_object_dtype SA01" \ -i "pandas.api.types.is_re PR07,SA01" \ diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 8af48a861967a..de7d9af731010 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -1220,6 +1220,12 @@ def is_list_like(obj: object, allow_sets: bool = True) -> bool: bool Whether `obj` has list-like properties. + See Also + -------- + Series : One-dimensional ndarray with axis labels (including time series). + Index : Immutable sequence used for indexing and alignment. + numpy.ndarray : Array object from NumPy, which is considered list-like. 
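A short demonstration of the behaviour that the new `is_list_like` cross-references describe (illustrative only):

```python
import numpy as np
from pandas.api.types import is_list_like

print(is_list_like([1, 2, 3]))                 # True
print(is_list_like(np.array([1, 2])))          # True
print(is_list_like("abc"))                     # False: strings are not list-like
print(is_list_like({1, 2}, allow_sets=False))  # False once sets are opted out
```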
+
     Examples
     --------
     >>> import datetime

From 4b22453651cb71684ce1f56aa67ff6fc451af053 Mon Sep 17 00:00:00 2001
From: musvaage <112724366+musvaage@users.noreply.github.com>
Date: Wed, 25 Sep 2024 20:16:32 +0200
Subject: [PATCH 09/33] typo (#59852)

---
 pandas/io/pytables.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py
index be7b8dc6640ba..618254fee9259 100644
--- a/pandas/io/pytables.py
+++ b/pandas/io/pytables.py
@@ -3580,7 +3580,7 @@ def is_transposed(self) -> bool:
     @property
     def data_orientation(self) -> tuple[int, ...]:
-        """return a tuple of my permutated axes, non_indexable at the front"""
+        """return a tuple of my permuted axes, non_indexable at the front"""
         return tuple(
             itertools.chain(
                 [int(a[0]) for a in self.non_index_axes],

From 7543426cdf2728635e92b59585203963035ae536 Mon Sep 17 00:00:00 2001
From: Vibavari Gurunathan
Date: Wed, 25 Sep 2024 11:17:57 -0700
Subject: [PATCH 10/33] BUG: Fix from_records() column reorder issue, if
 columns!=None use passed param (#59717) (#59809)

* BUG: Fix columns param reorder issue - if columns!=None, use passed param (#59717)
* Add tests for to_arrays()
* Fix import order with isort
* fix sort
* Update datatype to int32
* Fix test
* Revert commit
* Add test for DataFrame.from_records()
* Apply comments
* Delete test_to_arrays.py
---
 doc/source/whatsnew/v3.0.0.rst                |  1 +
 pandas/core/internals/construction.py         |  3 ++-
 .../frame/constructors/test_from_records.py   | 23 +++++++++++++++++++
 3 files changed, 26 insertions(+), 1 deletion(-)

diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst
index 3b5183c43bcd0..516a5d938fb18 100644
--- a/doc/source/whatsnew/v3.0.0.rst
+++ b/doc/source/whatsnew/v3.0.0.rst
@@ -619,6 +619,7 @@ I/O
 ^^^
 - Bug in :class:`DataFrame` and :class:`Series` ``repr`` of :py:class:`collections.abc.Mapping`` elements. (:issue:`57915`)
 - Bug in :meth:`.DataFrame.to_json` when ``"index"`` was a value in the :attr:`DataFrame.column` and :attr:`Index.name` was ``None``. Now, this will fail with a ``ValueError`` (:issue:`58925`)
+- Bug in :meth:`DataFrame.from_records` where ``columns`` parameter with numpy structured array was not reordering and filtering out the columns (:issue:`59717`)
 - Bug in :meth:`DataFrame.to_dict` raises unnecessary ``UserWarning`` when columns are not unique and ``orient='tight'``. (:issue:`58281`)
 - Bug in :meth:`DataFrame.to_excel` when writing empty :class:`DataFrame` with :class:`MultiIndex` on both axes (:issue:`57696`)
 - Bug in :meth:`DataFrame.to_stata` when writing :class:`DataFrame` and ``byteorder=`big```. (:issue:`58969`)

diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py
index 07465e7b87fcd..959e572b2b35b 100644
--- a/pandas/core/internals/construction.py
+++ b/pandas/core/internals/construction.py
@@ -750,7 +750,8 @@ def to_arrays(
     elif isinstance(data, np.ndarray) and data.dtype.names is not None:
         # e.g.
recarray - columns = Index(list(data.dtype.names)) + if columns is None: + columns = Index(data.dtype.names) arrays = [data[k] for k in columns] return arrays, columns diff --git a/pandas/tests/frame/constructors/test_from_records.py b/pandas/tests/frame/constructors/test_from_records.py index abc3aab1c1492..1d4a2c0075e3e 100644 --- a/pandas/tests/frame/constructors/test_from_records.py +++ b/pandas/tests/frame/constructors/test_from_records.py @@ -469,3 +469,26 @@ def test_from_records_empty2(self): alt = DataFrame(arr) tm.assert_frame_equal(alt, expected) + + def test_from_records_structured_array(self): + # GH 59717 + data = np.array( + [ + ("John", 25, "New York", 50000), + ("Jane", 30, "San Francisco", 75000), + ("Bob", 35, "Chicago", 65000), + ("Alice", 28, "Los Angeles", 60000), + ], + dtype=[("name", "U10"), ("age", "i4"), ("city", "U15"), ("salary", "i4")], + ) + + actual_result = DataFrame.from_records(data, columns=["name", "salary", "city"]) + + modified_data = { + "name": ["John", "Jane", "Bob", "Alice"], + "salary": np.array([50000, 75000, 65000, 60000], dtype="int32"), + "city": ["New York", "San Francisco", "Chicago", "Los Angeles"], + } + expected_result = DataFrame(modified_data) + + tm.assert_frame_equal(actual_result, expected_result) From e38409c304f8da88efd7cf074819a1cf7d12be31 Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Wed, 25 Sep 2024 23:49:00 +0530 Subject: [PATCH 11/33] DOC: fix SA01 for pandas.arrays.BooleanArray (#59866) --- ci/code_checks.sh | 1 - pandas/core/arrays/boolean.py | 7 +++++++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index dd1b441b51772..40582f3069e97 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -122,7 +122,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.api.types.is_re_compilable PR07,SA01" \ -i "pandas.api.types.pandas_dtype PR07,RT03,SA01" \ -i "pandas.arrays.ArrowExtensionArray PR07,SA01" \ - -i "pandas.arrays.BooleanArray SA01" \ -i "pandas.arrays.DatetimeArray SA01" \ -i "pandas.arrays.IntegerArray SA01" \ -i "pandas.arrays.IntervalArray.left SA01" \ diff --git a/pandas/core/arrays/boolean.py b/pandas/core/arrays/boolean.py index 74c0cd7719c13..53ebc35b68d14 100644 --- a/pandas/core/arrays/boolean.py +++ b/pandas/core/arrays/boolean.py @@ -286,6 +286,13 @@ class BooleanArray(BaseMaskedArray): ------- BooleanArray + See Also + -------- + array : Create an array from data with the appropriate dtype. + BooleanDtype : Extension dtype for boolean data. + Series : One-dimensional ndarray with axis labels (including time series). + DataFrame : Two-dimensional, size-mutable, potentially heterogeneous tabular data. 
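A combined sketch of the two preceding changes: the `from_records` column handling fixed in PATCH 10 and the `BooleanArray` documented here. The `from_records` result shown assumes the fix is applied; on earlier releases the column order came from the structured dtype instead:

```python
import numpy as np
import pandas as pd

# BooleanArray: boolean values plus a mask for missing entries.
arr = pd.array([True, False, None], dtype="boolean")
print(arr)

# With the fix, a passed `columns` reorders/filters the structured array's fields.
data = np.array([("John", 25), ("Jane", 30)], dtype=[("name", "U10"), ("age", "i4")])
df = pd.DataFrame.from_records(data, columns=["age", "name"])
print(df.columns.tolist())  # ['age', 'name']
```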
From e38409c304f8da88efd7cf074819a1cf7d12be31 Mon Sep 17 00:00:00 2001
From: Tuhin Sharma
Date: Wed, 25 Sep 2024 23:49:00 +0530
Subject: [PATCH 11/33] DOC: fix SA01 for pandas.arrays.BooleanArray (#59866)

---
 ci/code_checks.sh             | 1 -
 pandas/core/arrays/boolean.py | 7 +++++++
 2 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/ci/code_checks.sh b/ci/code_checks.sh
index dd1b441b51772..40582f3069e97 100755
--- a/ci/code_checks.sh
+++ b/ci/code_checks.sh
@@ -122,7 +122,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
         -i "pandas.api.types.is_re_compilable PR07,SA01" \
         -i "pandas.api.types.pandas_dtype PR07,RT03,SA01" \
         -i "pandas.arrays.ArrowExtensionArray PR07,SA01" \
-        -i "pandas.arrays.BooleanArray SA01" \
         -i "pandas.arrays.DatetimeArray SA01" \
         -i "pandas.arrays.IntegerArray SA01" \
         -i "pandas.arrays.IntervalArray.left SA01" \
diff --git a/pandas/core/arrays/boolean.py b/pandas/core/arrays/boolean.py
index 74c0cd7719c13..53ebc35b68d14 100644
--- a/pandas/core/arrays/boolean.py
+++ b/pandas/core/arrays/boolean.py
@@ -286,6 +286,13 @@ class BooleanArray(BaseMaskedArray):
     -------
     BooleanArray
 
+    See Also
+    --------
+    array : Create an array from data with the appropriate dtype.
+    BooleanDtype : Extension dtype for boolean data.
+    Series : One-dimensional ndarray with axis labels (including time series).
+    DataFrame : Two-dimensional, size-mutable, potentially heterogeneous tabular data.
+
     Examples
     --------
     Create an BooleanArray with :func:`pandas.array`:

From f049159d8245959bf313e05d1109ed33f778a077 Mon Sep 17 00:00:00 2001
From: Tuhin Sharma
Date: Wed, 25 Sep 2024 23:49:56 +0530
Subject: [PATCH 12/33] DOC: fix SA01, ES01 for pandas.api.types.is_object_dtype
 (#59865)

---
 ci/code_checks.sh            |  1 -
 pandas/core/dtypes/common.py | 14 ++++++++++++++
 2 files changed, 14 insertions(+), 1 deletion(-)

diff --git a/ci/code_checks.sh b/ci/code_checks.sh
index 40582f3069e97..4eb9d4055e1f8 100755
--- a/ci/code_checks.sh
+++ b/ci/code_checks.sh
@@ -117,7 +117,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
         -i "pandas.api.types.is_integer PR01,SA01" \
         -i "pandas.api.types.is_iterator PR07,SA01" \
         -i "pandas.api.types.is_named_tuple PR07,SA01" \
-        -i "pandas.api.types.is_object_dtype SA01" \
         -i "pandas.api.types.is_re PR07,SA01" \
         -i "pandas.api.types.is_re_compilable PR07,SA01" \
         -i "pandas.api.types.pandas_dtype PR07,RT03,SA01" \
diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py
index 1093b35afa8a0..98c770ec4a8b0 100644
--- a/pandas/core/dtypes/common.py
+++ b/pandas/core/dtypes/common.py
@@ -141,6 +141,11 @@ def is_object_dtype(arr_or_dtype) -> bool:
     """
     Check whether an array-like or dtype is of the object dtype.
 
+    This method examines the input to determine if it is of the
+    object data type. Object dtype is a generic data type that can
+    hold any Python objects, including strings, lists, and custom
+    objects.
+
     Parameters
    ----------
     arr_or_dtype : array-like or dtype
@@ -151,6 +156,15 @@ def is_object_dtype(arr_or_dtype) -> bool:
     boolean
         Whether or not the array-like or dtype is of the object dtype.
 
+    See Also
+    --------
+    api.types.is_numeric_dtype : Check whether the provided array or dtype is of a
+        numeric dtype.
+    api.types.is_string_dtype : Check whether the provided array or dtype is of
+        the string dtype.
+    api.types.is_bool_dtype : Check whether the provided array or dtype is of a
+        boolean dtype.
+
     Examples
     --------
     >>> from pandas.api.types import is_object_dtype
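For context, a minimal sketch of the array documented above (illustrative only, not part of commit e38409c):

>>> import pandas as pd
>>> pd.array([True, False, None], dtype="boolean")
<BooleanArray>
[True, False, <NA>]
Length: 3, dtype: boolean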
From e221fa48a5d5e61f9adc830ed33562548bea9dd4 Mon Sep 17 00:00:00 2001
From: Tuhin Sharma
Date: Wed, 25 Sep 2024 23:51:58 +0530
Subject: [PATCH 13/33] DOC: fix RT03 for pandas.date_range (#59868)

---
 ci/code_checks.sh                | 1 -
 pandas/core/indexes/datetimes.py | 1 +
 2 files changed, 1 insertion(+), 1 deletion(-)

diff --git a/ci/code_checks.sh b/ci/code_checks.sh
index 4eb9d4055e1f8..72e12effb1104 100755
--- a/ci/code_checks.sh
+++ b/ci/code_checks.sh
@@ -170,7 +170,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
         -i "pandas.core.resample.Resampler.sum SA01" \
         -i "pandas.core.resample.Resampler.transform PR01,RT03,SA01" \
         -i "pandas.core.resample.Resampler.var SA01" \
-        -i "pandas.date_range RT03" \
         -i "pandas.errors.AttributeConflictWarning SA01" \
         -i "pandas.errors.CSSWarning SA01" \
         -i "pandas.errors.CategoricalConversionWarning SA01" \
diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py
index 3b3cda8f7cd33..536f22d38468d 100644
--- a/pandas/core/indexes/datetimes.py
+++ b/pandas/core/indexes/datetimes.py
@@ -875,6 +875,7 @@ def date_range(
     Returns
     -------
     DatetimeIndex
+        A DatetimeIndex object of the generated dates.
 
     See Also
     --------

From cf79ac87545744d7c7af7e49b443b2ed0b3ed047 Mon Sep 17 00:00:00 2001
From: Tuhin Sharma
Date: Wed, 25 Sep 2024 23:52:33 +0530
Subject: [PATCH 14/33] DOC: fix RT03, ES01 for pandas.core.resample.Resampler.ffill
 (#59871)

---
 ci/code_checks.sh       | 1 -
 pandas/core/resample.py | 8 +++++++-
 2 files changed, 7 insertions(+), 2 deletions(-)

diff --git a/ci/code_checks.sh b/ci/code_checks.sh
index 72e12effb1104..49702dce0e258 100755
--- a/ci/code_checks.sh
+++ b/ci/code_checks.sh
@@ -155,7 +155,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
         -i "pandas.core.groupby.SeriesGroupBy.plot PR02" \
         -i "pandas.core.groupby.SeriesGroupBy.sem SA01" \
         -i "pandas.core.resample.Resampler.__iter__ RT03,SA01" \
-        -i "pandas.core.resample.Resampler.ffill RT03" \
         -i "pandas.core.resample.Resampler.get_group RT03,SA01" \
         -i "pandas.core.resample.Resampler.groups SA01" \
         -i "pandas.core.resample.Resampler.indices SA01" \
diff --git a/pandas/core/resample.py b/pandas/core/resample.py
index b621fcf9a6415..711396096a5e3 100644
--- a/pandas/core/resample.py
+++ b/pandas/core/resample.py
@@ -529,6 +529,11 @@ def ffill(self, limit: int | None = None):
         """
         Forward fill the values.
 
+        This method fills missing values by propagating the last valid
+        observation forward, up to the next valid observation. It is commonly
+        used in time series analysis when resampling data to a higher frequency
+        (upsampling) and filling gaps in the resampled output.
+
         Parameters
         ----------
         limit : int, optional
@@ -536,7 +541,8 @@ def ffill(self, limit: int | None = None):
 
         Returns
         -------
-        An upsampled Series.
+        Series
+            The resampled data with missing values filled forward.
 
         See Also
         --------
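A minimal sketch of the upsampling behavior described in the new docstring text (illustrative only, not part of commit cf79ac8):

>>> import pandas as pd
>>> ser = pd.Series([1, 2], index=pd.date_range("2023-01-01", periods=2, freq="2D"))
>>> ser.resample("D").ffill()
2023-01-01    1
2023-01-02    1
2023-01-03    2
Freq: D, dtype: int64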
From 1ddf028c9469a9d6264171c4c79ef1691fe2c680 Mon Sep 17 00:00:00 2001
From: Tuhin Sharma
Date: Thu, 26 Sep 2024 00:08:18 +0530
Subject: [PATCH 15/33] DOC: fix SA01, ES01 for pandas.arrays.IntervalArray.mid
 (#59867)

* DOC: fix SA01, ES01 for pandas.arrays.IntervalArray.mid

* DOC: add double backticks for sphinx compatibility

Co-authored-by: mroeschke

---------

Co-authored-by: mroeschke
---
 ci/code_checks.sh              |  1 -
 pandas/core/arrays/interval.py | 10 ++++++++++
 2 files changed, 10 insertions(+), 1 deletion(-)

diff --git a/ci/code_checks.sh b/ci/code_checks.sh
index 49702dce0e258..3dfd5a3931ecd 100755
--- a/ci/code_checks.sh
+++ b/ci/code_checks.sh
@@ -125,7 +125,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
         -i "pandas.arrays.IntegerArray SA01" \
         -i "pandas.arrays.IntervalArray.left SA01" \
         -i "pandas.arrays.IntervalArray.length SA01" \
-        -i "pandas.arrays.IntervalArray.mid SA01" \
         -i "pandas.arrays.IntervalArray.right SA01" \
         -i "pandas.arrays.NumpyExtensionArray SA01" \
         -i "pandas.arrays.SparseArray PR07,SA01" \
diff --git a/pandas/core/arrays/interval.py b/pandas/core/arrays/interval.py
index 52d64162358c8..2ac9c77bef322 100644
--- a/pandas/core/arrays/interval.py
+++ b/pandas/core/arrays/interval.py
@@ -1291,6 +1291,16 @@ def mid(self) -> Index:
         """
         Return the midpoint of each Interval in the IntervalArray as an Index.
 
+        The midpoint of an interval is calculated as the average of its
+        ``left`` and ``right`` bounds. This property returns a ``pandas.Index`` object
+        containing the midpoint for each interval.
+
+        See Also
+        --------
+        Interval.left : Return left bound for the interval.
+        Interval.right : Return right bound for the interval.
+        Interval.length : Return the length of each interval.
+
         Examples
         --------

From 22055e4d3d42c297b1c86306d77f7a27fad8dcf8 Mon Sep 17 00:00:00 2001
From: Tuhin Sharma
Date: Thu, 26 Sep 2024 00:08:59 +0530
Subject: [PATCH 16/33] DOC: fix SA01, ES01 for pandas.RangeIndex.step (#59857)

* DOC: fix SA01, ES01 for pandas.RangeIndex.step

* DOC: add double backticks for sphinx compatibility

Co-authored-by: mroeschke

---------

Co-authored-by: mroeschke
---
 ci/code_checks.sh            | 1 -
 pandas/core/indexes/range.py | 9 +++++++++
 2 files changed, 9 insertions(+), 1 deletion(-)

diff --git a/ci/code_checks.sh b/ci/code_checks.sh
index 3dfd5a3931ecd..01486f0e3f926 100755
--- a/ci/code_checks.sh
+++ b/ci/code_checks.sh
@@ -73,7 +73,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
         -i "pandas.Period.freq GL08" \
         -i "pandas.Period.ordinal GL08" \
         -i "pandas.RangeIndex.from_range PR01,SA01" \
-        -i "pandas.RangeIndex.step SA01" \
         -i "pandas.Series.cat.add_categories PR01,PR02" \
         -i "pandas.Series.cat.as_ordered PR01" \
         -i "pandas.Series.cat.as_unordered PR01" \
diff --git a/pandas/core/indexes/range.py b/pandas/core/indexes/range.py
index 75d0dfbeb6f01..dc96d1c11db74 100644
--- a/pandas/core/indexes/range.py
+++ b/pandas/core/indexes/range.py
@@ -351,6 +351,15 @@ def step(self) -> int:
         """
         The value of the `step` parameter (``1`` if this was not supplied).
 
+        The ``step`` parameter determines the increment (or decrement in the case
+        of negative values) between consecutive elements in the ``RangeIndex``.
+
+        See Also
+        --------
+        RangeIndex : Immutable index implementing a range-based index.
+        RangeIndex.stop : Returns the stop value of the RangeIndex.
+        RangeIndex.start : Returns the start value of the RangeIndex.
+
         Examples
         --------
         >>> idx = pd.RangeIndex(5)
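A minimal sketch of the ``mid`` property documented above (illustrative only, not part of commit 1ddf028):

>>> import pandas as pd
>>> pd.arrays.IntervalArray.from_breaks([0, 2, 4]).mid
Index([1.0, 3.0], dtype='float64')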
From efbc29666d820cf62854556cdeadf044b489de4c Mon Sep 17 00:00:00 2001
From: Tuhin Sharma
Date: Thu, 26 Sep 2024 00:09:46 +0530
Subject: [PATCH 17/33] DOC: fix SA01, ES01 for pandas.Timedelta.to_timedelta64
 (#59860)

* DOC: fix SA01, ES01 for pandas.Timedelta.to_timedelta64

* DOC: add double backticks for sphinx compatibility

Co-authored-by: mroeschke

---------

Co-authored-by: mroeschke
---
 ci/code_checks.sh                  |  1 -
 pandas/_libs/tslibs/timedeltas.pyx | 12 ++++++++++++
 2 files changed, 12 insertions(+), 1 deletion(-)

diff --git a/ci/code_checks.sh b/ci/code_checks.sh
index 01486f0e3f926..20e75f0f6f616 100755
--- a/ci/code_checks.sh
+++ b/ci/code_checks.sh
@@ -101,7 +101,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
         -i "pandas.Timedelta.max PR02" \
         -i "pandas.Timedelta.min PR02" \
         -i "pandas.Timedelta.resolution PR02" \
-        -i "pandas.Timedelta.to_timedelta64 SA01" \
         -i "pandas.TimedeltaIndex.to_pytimedelta RT03,SA01" \
         -i "pandas.Timestamp.max PR02" \
         -i "pandas.Timestamp.min PR02" \
diff --git a/pandas/_libs/tslibs/timedeltas.pyx b/pandas/_libs/tslibs/timedeltas.pyx
index 0ff5c5fb81df8..84ca48c96459f 100644
--- a/pandas/_libs/tslibs/timedeltas.pyx
+++ b/pandas/_libs/tslibs/timedeltas.pyx
@@ -1411,6 +1411,18 @@ cdef class _Timedelta(timedelta):
         """
         Return a numpy.timedelta64 object with 'ns' precision.
 
+        Since NumPy uses ``timedelta64`` objects for its time operations, converting
+        a pandas ``Timedelta`` into a NumPy ``timedelta64`` provides seamless
+        integration between the two libraries, especially when working in environments
+        that heavily rely on NumPy for array-based calculations.
+
+        See Also
+        --------
+        to_timedelta : Convert argument to timedelta.
+        numpy.timedelta64 : A NumPy object for time duration.
+        Timedelta : Represents a duration, the difference between two dates
+            or times.
+
         Examples
         --------
         >>> td = pd.Timedelta('3D')

From c5cfe5d32c7fef4d42e1b22e188a438b5607b804 Mon Sep 17 00:00:00 2001
From: Tuhin Sharma
Date: Thu, 26 Sep 2024 00:12:43 +0530
Subject: [PATCH 18/33] DOC: fix SA01, ES01 for pandas.errors.EmptyDataError
 (#59872)

---
 ci/code_checks.sh         |  1 -
 pandas/errors/__init__.py | 11 +++++++++++
 2 files changed, 11 insertions(+), 1 deletion(-)

diff --git a/ci/code_checks.sh b/ci/code_checks.sh
index 20e75f0f6f616..f662b4781e84b 100755
--- a/ci/code_checks.sh
+++ b/ci/code_checks.sh
@@ -173,7 +173,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
         -i "pandas.errors.ClosedFileError SA01" \
         -i "pandas.errors.DataError SA01" \
         -i "pandas.errors.DuplicateLabelError SA01" \
-        -i "pandas.errors.EmptyDataError SA01" \
         -i "pandas.errors.IntCastingNaNError SA01" \
         -i "pandas.errors.InvalidIndexError SA01" \
         -i "pandas.errors.InvalidVersion SA01" \
diff --git a/pandas/errors/__init__.py b/pandas/errors/__init__.py
index 7851bc90c5782..b9ceae341afd3 100644
--- a/pandas/errors/__init__.py
+++ b/pandas/errors/__init__.py
@@ -205,6 +205,17 @@ class EmptyDataError(ValueError):
     """
     Exception raised in ``pd.read_csv`` when empty data or header is encountered.
 
+    This error is typically encountered when attempting to read an empty file or
+    an invalid file where no data or headers are present.
+
+    See Also
+    --------
+    read_csv : Read a comma-separated values (CSV) file into DataFrame.
+    errors.ParserError : Exception that is raised by an error encountered in parsing
+        file contents.
+    errors.DtypeWarning : Warning raised when reading different dtypes in a column
+        from a file.
+
     Examples
     --------
     >>> from io import StringIO
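A minimal sketch of the conversion documented above (illustrative only, not part of commit efbc296):

>>> import pandas as pd
>>> pd.Timedelta(days=1).to_timedelta64()
numpy.timedelta64(86400000000000,'ns')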
From 7e5282f5f125406cff7fdf80b452e114adfa4c26 Mon Sep 17 00:00:00 2001
From: Jonathan Marriott <34217286+JonathanMarriott@users.noreply.github.com>
Date: Wed, 25 Sep 2024 20:14:49 +0100
Subject: [PATCH 19/33] DOC: Fix inconsistent and incomplete documentation of
 `pandas.eval` (#59855)

* Improve content and organisation of eval documentation

* Link to pd.eval in pd.DataFrame.query

* Correct name for `//` is floor division

* Include arctan2

Co-authored-by: Xiao Yuan

---------

Co-authored-by: Xiao Yuan
---
 pandas/core/computation/eval.py | 37 +++++++++++++++++++++++++--------
 pandas/core/frame.py            | 33 +++++++++++++++--------------
 2 files changed, 45 insertions(+), 25 deletions(-)

diff --git a/pandas/core/computation/eval.py b/pandas/core/computation/eval.py
index 485c7f87d6f33..4ccfbd71d9ce8 100644
--- a/pandas/core/computation/eval.py
+++ b/pandas/core/computation/eval.py
@@ -188,15 +188,6 @@ def eval(
     """
     Evaluate a Python expression as a string using various backends.
 
-    The following arithmetic operations are supported: ``+``, ``-``, ``*``,
-    ``/``, ``**``, ``%``, ``//`` (python engine only) along with the following
-    boolean operations: ``|`` (or), ``&`` (and), and ``~`` (not).
-    Additionally, the ``'pandas'`` parser allows the use of :keyword:`and`,
-    :keyword:`or`, and :keyword:`not` with the same semantics as the
-    corresponding bitwise operators. :class:`~pandas.Series` and
-    :class:`~pandas.DataFrame` objects are supported and behave as they would
-    with plain ol' Python evaluation.
-
     .. warning::
 
         ``eval`` can run arbitrary code which can make you vulnerable to code
         injection if you pass user input to be evaluated.
@@ -210,6 +201,34 @@ def eval(
        `__,
        only Python
        `expressions
        `__.
+
+        By default, with the numexpr engine, the following operations are supported:
+
+        - Arithmetic operations: ``+``, ``-``, ``*``, ``/``, ``**``, ``%``
+        - Boolean operations: ``|`` (or), ``&`` (and), and ``~`` (not)
+        - Comparison operators: ``<``, ``<=``, ``==``, ``!=``, ``>=``, ``>``
+
+        Furthermore, the following mathematical functions are supported:
+
+        - Trigonometric: ``sin``, ``cos``, ``tan``, ``arcsin``, ``arccos``, \
+          ``arctan``, ``arctan2``, ``sinh``, ``cosh``, ``tanh``, ``arcsinh``, \
+          ``arccosh`` and ``arctanh``
+        - Logarithms: ``log`` natural, ``log10`` base 10, ``log1p`` log(1+x)
+        - Absolute Value ``abs``
+        - Square root ``sqrt``
+        - Exponential ``exp`` and Exponential minus one ``expm1``
+
+        See the numexpr engine `documentation
+        `__
+        for further function support details.
+
+        Using the ``'python'`` engine allows the use of native Python operators
+        such as floor division ``//``, in addition to built-in and user-defined
+        Python functions.
+
+        Additionally, the ``'pandas'`` parser allows the use of :keyword:`and`,
+        :keyword:`or`, and :keyword:`not` with the same semantics as the
+        corresponding bitwise operators.
     parser : {'pandas', 'python'}, default 'pandas'
         The parser to use to construct the syntax tree from the
        expression. The default of ``'pandas'`` parses code slightly
        different than standard
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index c80e9dfd23ba2..4c56948a48eb2 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -4479,20 +4479,11 @@ def query(self, expr: str, *, inplace: bool = False, **kwargs) -> DataFrame | No
         expr : str
             The query string to evaluate.
 
-            You can refer to variables
-            in the environment by prefixing them with an '@' character like
-            ``@a + b``.
-
-            You can refer to column names that are not valid Python variable names
-            by surrounding them in backticks. Thus, column names containing spaces
-            or punctuation (besides underscores) or starting with digits must be
-            surrounded by backticks. (For example, a column named "Area (cm^2)" would
-            be referenced as ```Area (cm^2)```). Column names which are Python keywords
-            (like "if", "for", "import", etc) cannot be used.
-
-            For example, if one of your columns is called ``a a`` and you want
-            to sum it with ``b``, your query should be ```a a` + b``.
+
+            See the documentation for :func:`eval` for details of
+            supported operations and functions in the query string.
+
+            See the documentation for :meth:`DataFrame.eval` for details on
+            referring to column names and variables in the query string.
         inplace : bool
             Whether to modify the DataFrame rather than creating a new one.
         **kwargs
@@ -4651,8 +4642,18 @@ def eval(self, expr: str, *, inplace: bool = False, **kwargs) -> Any | None:
             in the environment by prefixing them with an '@' character like
             ``@a + b``.
 
-            You can refer to column names that are not valid Python variable
-            names by surrounding them with backticks `````.
+            You can refer to column names that are not valid Python variable names
+            by surrounding them in backticks. Thus, column names containing spaces
+            or punctuation (besides underscores) or starting with digits must be
+            surrounded by backticks. (For example, a column named "Area (cm^2)" would
+            be referenced as ```Area (cm^2)```). Column names which are Python keywords
+            (like "if", "for", "import", etc) cannot be used.
+
+            For example, if one of your columns is called ``a a`` and you want
+            to sum it with ``b``, your query should be ```a a` + b``.
+
+            See the documentation for :func:`eval` for full details of
+            supported operations and functions in the expression string.
         inplace : bool, default False
             If the expression contains an assignment, whether to perform the
             operation inplace and mutate the existing DataFrame. Otherwise,
         **kwargs
             See the documentation for :func:`eval` for complete details
             on the keyword arguments accepted by
-            :meth:`~pandas.DataFrame.query`.
+            :meth:`~pandas.DataFrame.eval`.
 
         Returns
         -------
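A minimal sketch of the operator support documented above (illustrative only, not part of commit 7e5282f):

>>> import pandas as pd
>>> df = pd.DataFrame({"a": [1, 2], "b": [3, 4]})
>>> df.eval("a + b * 2")
0     7
1    10
dtype: int64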
reason="TODO(infer_string)") def test_numba_vs_python_indexing(): frame = DataFrame( {"a": [1, 2, 3], "b": [4, 5, 6], "c": [7.0, 8.0, 9.0]}, From b87bf854519466182b43f9f7d5b6c9d91be87ad0 Mon Sep 17 00:00:00 2001 From: Naresh Kumar Date: Wed, 25 Sep 2024 12:18:03 -0700 Subject: [PATCH 21/33] ENH: Add kwargs to Series.map (#59843) Co-authored-by: Naresh Kumar --- doc/source/whatsnew/v3.0.0.rst | 1 + pandas/core/series.py | 9 +++++++++ pandas/tests/series/methods/test_map.py | 7 +++++++ 3 files changed, 17 insertions(+) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 516a5d938fb18..41ba80989a0ce 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -54,6 +54,7 @@ Other enhancements - :meth:`Series.cummin` and :meth:`Series.cummax` now supports :class:`CategoricalDtype` (:issue:`52335`) - :meth:`Series.plot` now correctly handle the ``ylabel`` parameter for pie charts, allowing for explicit control over the y-axis label (:issue:`58239`) - :meth:`DataFrame.plot.scatter` argument ``c`` now accepts a column of strings, where rows with the same string are colored identically (:issue:`16827` and :issue:`16485`) +- :meth:`Series.map` can now accept kwargs to pass on to func (:issue:`59814`) - :meth:`pandas.concat` will raise a ``ValueError`` when ``ignore_index=True`` and ``keys`` is not ``None`` (:issue:`59274`) - :meth:`str.get_dummies` now accepts a ``dtype`` parameter to specify the dtype of the resulting DataFrame (:issue:`47872`) - Multiplying two :class:`DateOffset` objects will now raise a ``TypeError`` instead of a ``RecursionError`` (:issue:`59442`) diff --git a/pandas/core/series.py b/pandas/core/series.py index 0c26ce27c680c..bbcb6615aeefd 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -11,6 +11,7 @@ Mapping, Sequence, ) +import functools import operator import sys from textwrap import dedent @@ -4312,6 +4313,7 @@ def map( self, arg: Callable | Mapping | Series, na_action: Literal["ignore"] | None = None, + **kwargs, ) -> Series: """ Map values of Series according to an input mapping or function. @@ -4327,6 +4329,11 @@ def map( na_action : {None, 'ignore'}, default None If 'ignore', propagate NaN values, without passing them to the mapping correspondence. + **kwargs + Additional keyword arguments to pass as keywords arguments to + `arg`. + + .. 
From b87bf854519466182b43f9f7d5b6c9d91be87ad0 Mon Sep 17 00:00:00 2001
From: Naresh Kumar
Date: Wed, 25 Sep 2024 12:18:03 -0700
Subject: [PATCH 21/33] ENH: Add kwargs to Series.map (#59843)

Co-authored-by: Naresh Kumar
---
 doc/source/whatsnew/v3.0.0.rst          | 1 +
 pandas/core/series.py                   | 9 +++++++++
 pandas/tests/series/methods/test_map.py | 7 +++++++
 3 files changed, 17 insertions(+)

diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst
index 516a5d938fb18..41ba80989a0ce 100644
--- a/doc/source/whatsnew/v3.0.0.rst
+++ b/doc/source/whatsnew/v3.0.0.rst
@@ -54,6 +54,7 @@ Other enhancements
 - :meth:`Series.cummin` and :meth:`Series.cummax` now supports :class:`CategoricalDtype` (:issue:`52335`)
 - :meth:`Series.plot` now correctly handle the ``ylabel`` parameter for pie charts, allowing for explicit control over the y-axis label (:issue:`58239`)
 - :meth:`DataFrame.plot.scatter` argument ``c`` now accepts a column of strings, where rows with the same string are colored identically (:issue:`16827` and :issue:`16485`)
+- :meth:`Series.map` can now accept kwargs to pass on to func (:issue:`59814`)
 - :meth:`pandas.concat` will raise a ``ValueError`` when ``ignore_index=True`` and ``keys`` is not ``None`` (:issue:`59274`)
 - :meth:`str.get_dummies` now accepts a ``dtype`` parameter to specify the dtype of the resulting DataFrame (:issue:`47872`)
 - Multiplying two :class:`DateOffset` objects will now raise a ``TypeError`` instead of a ``RecursionError`` (:issue:`59442`)
diff --git a/pandas/core/series.py b/pandas/core/series.py
index 0c26ce27c680c..bbcb6615aeefd 100644
--- a/pandas/core/series.py
+++ b/pandas/core/series.py
@@ -11,6 +11,7 @@
     Mapping,
     Sequence,
 )
+import functools
 import operator
 import sys
 from textwrap import dedent
@@ -4312,6 +4313,7 @@ def map(
         self,
         arg: Callable | Mapping | Series,
         na_action: Literal["ignore"] | None = None,
+        **kwargs,
     ) -> Series:
         """
         Map values of Series according to an input mapping or function.
@@ -4327,6 +4329,11 @@ def map(
         na_action : {None, 'ignore'}, default None
             If 'ignore', propagate NaN values, without passing them to the
             mapping correspondence.
+        **kwargs
+            Additional keyword arguments to pass as keyword arguments to
+            `arg`.
+
+            .. versionadded:: 3.0.0
 
         Returns
         -------
@@ -4388,6 +4395,8 @@ def map(
         3  I am a rabbit
         dtype: object
         """
+        if callable(arg):
+            arg = functools.partial(arg, **kwargs)
         new_values = self._map_values(arg, na_action=na_action)
         return self._constructor(new_values, index=self.index, copy=False).__finalize__(
             self, method="map"
diff --git a/pandas/tests/series/methods/test_map.py b/pandas/tests/series/methods/test_map.py
index 7fa8686fcc6c8..84b60a2afe6eb 100644
--- a/pandas/tests/series/methods/test_map.py
+++ b/pandas/tests/series/methods/test_map.py
@@ -597,3 +597,10 @@ def test_map_type():
     result = s.map(type)
     expected = Series([int, str, type], index=["a", "b", "c"])
     tm.assert_series_equal(result, expected)
+
+
+def test_map_kwargs():
+    # GH 59814
+    result = Series([2, 4, 5]).map(lambda x, y: x + y, y=2)
+    expected = Series([4, 6, 7])
+    tm.assert_series_equal(result, expected)
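A minimal sketch of the new keyword pass-through (illustrative only, not part of commit b87bf85; requires a pandas build containing this change):

>>> import pandas as pd
>>> pd.Series([1, 2, 3]).map(lambda x, scale: x * scale, scale=10)
0    10
1    20
2    30
dtype: int64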
From a9f76d753dfe3db9206e5556c90ffac0e0ebf46d Mon Sep 17 00:00:00 2001
From: jbrockmendel
Date: Wed, 25 Sep 2024 12:19:47 -0700
Subject: [PATCH 22/33] REF: pass dtype explicitly to _from_sequence inside
 pd.array (#59773)

REF: pass dtype explicitly to _from_sequence
---
 pandas/core/construction.py            | 6 ++++--
 pandas/tests/extension/base/methods.py | 2 +-
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/pandas/core/construction.py b/pandas/core/construction.py
index bb3aa3867ab08..1e1292f8ef089 100644
--- a/pandas/core/construction.py
+++ b/pandas/core/construction.py
@@ -358,7 +358,8 @@ def array(
             return cls._from_sequence(data, dtype=dtype, copy=copy)
 
     elif data.dtype.kind in "iu":
-        return IntegerArray._from_sequence(data, copy=copy)
+        dtype = IntegerArray._dtype_cls._get_dtype_mapping()[data.dtype]
+        return IntegerArray._from_sequence(data, dtype=dtype, copy=copy)
     elif data.dtype.kind == "f":
         # GH#44715 Exclude np.float16 bc FloatingArray does not support it;
         # we will fall back to NumpyExtensionArray.
@@ -366,7 +367,8 @@ def array(
             return NumpyExtensionArray._from_sequence(
                 data, dtype=data.dtype, copy=copy
             )
-        return FloatingArray._from_sequence(data, copy=copy)
+        dtype = FloatingArray._dtype_cls._get_dtype_mapping()[data.dtype]
+        return FloatingArray._from_sequence(data, dtype=dtype, copy=copy)
 
     elif data.dtype.kind == "b":
         return BooleanArray._from_sequence(data, dtype="boolean", copy=copy)
diff --git a/pandas/tests/extension/base/methods.py b/pandas/tests/extension/base/methods.py
index dd2ed0bd62a02..fd9fec0cb490c 100644
--- a/pandas/tests/extension/base/methods.py
+++ b/pandas/tests/extension/base/methods.py
@@ -549,7 +549,7 @@ def _test_searchsorted_bool_dtypes(self, data_for_sorting, as_series):
         dtype = data_for_sorting.dtype
         data_for_sorting = pd.array([True, False], dtype=dtype)
         b, a = data_for_sorting
-        arr = type(data_for_sorting)._from_sequence([a, b])
+        arr = type(data_for_sorting)._from_sequence([a, b], dtype=dtype)
 
         if as_series:
             arr = pd.Series(arr)
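A minimal sketch of the public behavior this refactor preserves (illustrative only, not part of commit a9f76d7): numpy integer input still infers the corresponding masked dtype.

>>> import numpy as np
>>> import pandas as pd
>>> pd.array(np.array([1, 2], dtype="int64"))
<IntegerArray>
[1, 2]
Length: 2, dtype: Int64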
From a92b919a1bb676252b45e574d102b2af29daac12 Mon Sep 17 00:00:00 2001
From: jbrockmendel
Date: Wed, 25 Sep 2024 12:21:12 -0700
Subject: [PATCH 23/33] REF: pass dtype explicitly to _from_sequence (#59774)

---
 pandas/core/arrays/arrow/array.py              |  8 ++++++-
 pandas/core/arrays/datetimelike.py             |  6 ++---
 pandas/core/arrays/datetimes.py                |  6 +----
 pandas/core/arrays/period.py                   |  2 +-
 .../arrays/datetimes/test_constructors.py      | 22 ++++++++++++-------
 pandas/tests/arrays/test_array.py              |  8 +++++--
 pandas/tests/arrays/test_datetimelike.py       | 10 +++++----
 pandas/tests/arrays/test_datetimes.py          | 12 +++++++---
 pandas/tests/arrays/test_timedeltas.py         |  8 +++----
 pandas/tests/base/test_conversion.py           |  3 ++-
 pandas/tests/dtypes/test_generic.py            |  4 ++--
 .../series/accessors/test_dt_accessor.py       |  3 ++-
 12 files changed, 57 insertions(+), 35 deletions(-)

diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py
index 39cae5b8e2683..00d46ab9296d0 100644
--- a/pandas/core/arrays/arrow/array.py
+++ b/pandas/core/arrays/arrow/array.py
@@ -2300,7 +2300,13 @@ def _groupby_op(
         )
         if isinstance(result, np.ndarray):
             return result
-        return type(self)._from_sequence(result, copy=False)
+        elif isinstance(result, BaseMaskedArray):
+            pa_result = result.__arrow_array__()
+            return type(self)(pa_result)
+        else:
+            # DatetimeArray, TimedeltaArray
+            pa_result = pa.array(result, from_pandas=True)
+            return type(self)(pa_result)
 
     def _apply_elementwise(self, func: Callable) -> list[list[Any]]:
         """Apply a callable to each element while maintaining the chunking structure."""
diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py
index 7be8daa09c758..a25a698856747 100644
--- a/pandas/core/arrays/datetimelike.py
+++ b/pandas/core/arrays/datetimelike.py
@@ -1393,7 +1393,7 @@ def __add__(self, other):
         if isinstance(result, np.ndarray) and lib.is_np_dtype(result.dtype, "m"):
             from pandas.core.arrays import TimedeltaArray
 
-            return TimedeltaArray._from_sequence(result)
+            return TimedeltaArray._from_sequence(result, dtype=result.dtype)
         return result
 
     def __radd__(self, other):
@@ -1453,7 +1453,7 @@ def __sub__(self, other):
         if isinstance(result, np.ndarray) and lib.is_np_dtype(result.dtype, "m"):
             from pandas.core.arrays import TimedeltaArray
 
-            return TimedeltaArray._from_sequence(result)
+            return TimedeltaArray._from_sequence(result, dtype=result.dtype)
         return result
 
     def __rsub__(self, other):
@@ -1472,7 +1472,7 @@ def __rsub__(self, other):
             # Avoid down-casting DatetimeIndex
             from pandas.core.arrays import DatetimeArray
 
-            other = DatetimeArray._from_sequence(other)
+            other = DatetimeArray._from_sequence(other, dtype=other.dtype)
             return other - self
         elif self.dtype.kind == "M" and hasattr(other, "dtype") and not other_is_dt64:
             # GH#19959 datetime - datetime is well-defined as timedelta,
diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py
index 201c449185057..43f4428118aa7 100644
--- a/pandas/core/arrays/datetimes.py
+++ b/pandas/core/arrays/datetimes.py
@@ -818,11 +818,7 @@ def _add_offset(self, offset: BaseOffset) -> Self:
                 stacklevel=find_stack_level(),
             )
             res_values = self.astype("O") + offset
-            # TODO(GH#55564): as_unit will be unnecessary
-            result = type(self)._from_sequence(res_values).as_unit(self.unit)
-            if not len(self):
-                # GH#30336 _from_sequence won't be able to infer self.tz
-                return result.tz_localize(self.tz)
+            result = type(self)._from_sequence(res_values, dtype=self.dtype)
 
         else:
             result = type(self)._simple_new(res_values, dtype=res_values.dtype)
diff --git a/pandas/core/arrays/period.py b/pandas/core/arrays/period.py
index aa8dacbd6aad5..7d0ad74f851f0 100644
--- a/pandas/core/arrays/period.py
+++ b/pandas/core/arrays/period.py
@@ -812,7 +812,7 @@ def to_timestamp(self, freq=None, how: str = "start") -> DatetimeArray:
 
         new_parr = self.asfreq(freq, how=how)
         new_data = libperiod.periodarr_to_dt64arr(new_parr.asi8, base)
-        dta = DatetimeArray._from_sequence(new_data)
+        dta = DatetimeArray._from_sequence(new_data, dtype=np.dtype("M8[ns]"))
 
         if self.freq.name == "B":
             # See if we can retain BDay instead of Day in cases where
diff --git a/pandas/tests/arrays/datetimes/test_constructors.py b/pandas/tests/arrays/datetimes/test_constructors.py
index d7264c002c67f..74cc3e991bb76 100644
--- a/pandas/tests/arrays/datetimes/test_constructors.py
+++ b/pandas/tests/arrays/datetimes/test_constructors.py
@@ -28,10 +28,12 @@ def test_mixing_naive_tzaware_raises(self, meth):
         # GH#24569
         arr = np.array([pd.Timestamp("2000"), pd.Timestamp("2000", tz="CET")])
 
-        msg = (
-            "Cannot mix tz-aware with tz-naive values|"
-            "Tz-aware datetime.datetime cannot be converted "
-            "to datetime64 unless utc=True"
+        msg = "|".join(
+            [
+                "Cannot mix tz-aware with tz-naive values",
+                "Tz-aware datetime.datetime cannot be converted "
+                "to datetime64 unless utc=True",
+            ]
         )
 
         for obj in [arr, arr[::-1]]:
@@ -63,10 +65,10 @@ def test_bool_dtype_raises(self):
     def test_copy(self):
         data = np.array([1, 2, 3], dtype="M8[ns]")
 
-        arr = DatetimeArray._from_sequence(data, copy=False)
+        arr = DatetimeArray._from_sequence(data, dtype=data.dtype, copy=False)
         assert arr._ndarray is data
 
-        arr = DatetimeArray._from_sequence(data, copy=True)
+        arr = DatetimeArray._from_sequence(data, dtype=data.dtype, copy=True)
         assert arr._ndarray is not data
 
     def test_numpy_datetime_unit(self, unit):
@@ -163,7 +165,9 @@ def test_from_arrow_from_empty(unit, tz):
     dtype = DatetimeTZDtype(unit=unit, tz=tz)
 
     result = dtype.__from_arrow__(arr)
-    expected = DatetimeArray._from_sequence(np.array(data, dtype=f"datetime64[{unit}]"))
+    expected = DatetimeArray._from_sequence(
+        np.array(data, dtype=f"datetime64[{unit}]"), dtype=np.dtype(f"M8[{unit}]")
+    )
     expected = expected.tz_localize(tz=tz)
     tm.assert_extension_array_equal(result, expected)
 
@@ -179,7 +183,9 @@ def test_from_arrow_from_integers():
     dtype = DatetimeTZDtype(unit="ns", tz="UTC")
 
     result = dtype.__from_arrow__(arr)
-    expected = DatetimeArray._from_sequence(np.array(data, dtype="datetime64[ns]"))
+    expected = DatetimeArray._from_sequence(
+        np.array(data, dtype="datetime64[ns]"), dtype=np.dtype("M8[ns]")
+    )
    expected = expected.tz_localize("UTC")
    tm.assert_extension_array_equal(result, expected)
diff --git a/pandas/tests/arrays/test_array.py b/pandas/tests/arrays/test_array.py
index 4070a2844846f..3c0ef1e4d928b 100644
--- a/pandas/tests/arrays/test_array.py
+++ b/pandas/tests/arrays/test_array.py
@@ -370,11 +370,15 @@ def test_array_copy():
         ),
         (
             np.array([1, 2], dtype="m8[ns]"),
-            TimedeltaArray._from_sequence(np.array([1, 2], dtype="m8[ns]")),
+            TimedeltaArray._from_sequence(
+                np.array([1, 2], dtype="m8[ns]"), dtype=np.dtype("m8[ns]")
+            ),
         ),
         (
             np.array([1, 2], dtype="m8[us]"),
-            TimedeltaArray._from_sequence(np.array([1, 2], dtype="m8[us]")),
+            TimedeltaArray._from_sequence(
+                np.array([1, 2], dtype="m8[us]"), dtype=np.dtype("m8[us]")
+            ),
         ),
         # integer
         ([1, 2], IntegerArray._from_sequence([1, 2], dtype="Int64")),
diff --git a/pandas/tests/arrays/test_datetimelike.py b/pandas/tests/arrays/test_datetimelike.py
index 6dd1ef9d59ab4..0c8eefab95464 100644
--- a/pandas/tests/arrays/test_datetimelike.py
+++ b/pandas/tests/arrays/test_datetimelike.py
@@ -257,7 +257,8 @@ def test_fillna_method_doesnt_change_orig(self, method):
         if self.array_cls is PeriodArray:
             arr = self.array_cls(data, dtype="period[D]")
         else:
-            arr = self.array_cls._from_sequence(data)
+            dtype = "M8[ns]" if self.array_cls is DatetimeArray else "m8[ns]"
+            arr = self.array_cls._from_sequence(data, dtype=np.dtype(dtype))
         arr[4] = NaT
 
         fill_value = arr[3] if method == "pad" else arr[5]
@@ -273,7 +274,8 @@ def test_searchsorted(self):
         if self.array_cls is PeriodArray:
             arr = self.array_cls(data, dtype="period[D]")
         else:
-            arr = self.array_cls._from_sequence(data)
+            dtype = "M8[ns]" if self.array_cls is DatetimeArray else "m8[ns]"
+            arr = self.array_cls._from_sequence(data, dtype=np.dtype(dtype))
 
         # scalar
         result = arr.searchsorted(arr[1])
@@ -739,10 +741,10 @@ def test_array_i8_dtype(self, arr1d):
     def test_from_array_keeps_base(self):
         # Ensure that DatetimeArray._ndarray.base isn't lost.
         arr = np.array(["2000-01-01", "2000-01-02"], dtype="M8[ns]")
-        dta = DatetimeArray._from_sequence(arr)
+        dta = DatetimeArray._from_sequence(arr, dtype=arr.dtype)
         assert dta._ndarray is arr
 
-        dta = DatetimeArray._from_sequence(arr[:0])
+        dta = DatetimeArray._from_sequence(arr[:0], dtype=arr.dtype)
         assert dta._ndarray.base is arr
 
     def test_from_dti(self, arr1d):
diff --git a/pandas/tests/arrays/test_datetimes.py b/pandas/tests/arrays/test_datetimes.py
index 8e348805de978..e3f49d04a0ff2 100644
--- a/pandas/tests/arrays/test_datetimes.py
+++ b/pandas/tests/arrays/test_datetimes.py
@@ -499,7 +499,7 @@ def test_value_counts_preserves_tz(self):
     @pytest.mark.parametrize("method", ["pad", "backfill"])
     def test_fillna_preserves_tz(self, method):
         dti = pd.date_range("2000-01-01", periods=5, freq="D", tz="US/Central")
-        arr = DatetimeArray._from_sequence(dti, copy=True)
+        arr = DatetimeArray._from_sequence(dti, dtype=dti.dtype, copy=True)
         arr[2] = pd.NaT
 
         fill_val = dti[1] if method == "pad" else dti[3]
@@ -665,7 +665,9 @@ def test_shift_fill_value(self):
         dti = pd.date_range("2016-01-01", periods=3)
 
         dta = dti._data
-        expected = DatetimeArray._from_sequence(np.roll(dta._ndarray, 1))
+        expected = DatetimeArray._from_sequence(
+            np.roll(dta._ndarray, 1), dtype=dti.dtype
+        )
 
         fv = dta[-1]
         for fill_value in [fv, fv.to_pydatetime(), fv.to_datetime64()]:
@@ -731,7 +733,11 @@ def test_iter_zoneinfo_fold(self, tz):
         )
         utc_vals *= 1_000_000_000
 
-        dta = DatetimeArray._from_sequence(utc_vals).tz_localize("UTC").tz_convert(tz)
+        dta = (
+            DatetimeArray._from_sequence(utc_vals, dtype=np.dtype("M8[ns]"))
+            .tz_localize("UTC")
+            .tz_convert(tz)
+        )
 
         left = dta[2]
         right = list(dta)[2]
diff --git a/pandas/tests/arrays/test_timedeltas.py b/pandas/tests/arrays/test_timedeltas.py
index bcc52f197ee51..fb7c7afdc6ff9 100644
--- a/pandas/tests/arrays/test_timedeltas.py
+++ b/pandas/tests/arrays/test_timedeltas.py
@@ -263,10 +263,10 @@ def test_searchsorted_invalid_types(self, other, index):
 class TestUnaryOps:
     def test_abs(self):
         vals = np.array([-3600 * 10**9, "NaT", 7200 * 10**9], dtype="m8[ns]")
-        arr = TimedeltaArray._from_sequence(vals)
+        arr = TimedeltaArray._from_sequence(vals, dtype=vals.dtype)
 
         evals = np.array([3600 * 10**9, "NaT", 7200 * 10**9], dtype="m8[ns]")
-        expected = TimedeltaArray._from_sequence(evals)
+        expected = TimedeltaArray._from_sequence(evals, dtype=evals.dtype)
 
         result = abs(arr)
         tm.assert_timedelta_array_equal(result, expected)
@@ -276,7 +276,7 @@ def test_abs(self):
 
     def test_pos(self):
         vals = np.array([-3600 * 10**9, "NaT", 7200 * 10**9], dtype="m8[ns]")
-        arr = TimedeltaArray._from_sequence(vals)
+        arr = TimedeltaArray._from_sequence(vals, dtype=vals.dtype)
 
         result = +arr
         tm.assert_timedelta_array_equal(result, arr)
@@ -288,7 +288,7 @@ def test_pos(self):
 
     def test_neg(self):
         vals = np.array([-3600 * 10**9, "NaT", 7200 * 10**9], dtype="m8[ns]")
-        arr = TimedeltaArray._from_sequence(vals)
+        arr = TimedeltaArray._from_sequence(vals, dtype=vals.dtype)
 
         evals = np.array([3600 * 10**9, "NaT", -7200 * 10**9], dtype="m8[ns]")
         expected = TimedeltaArray._from_sequence(evals)
diff --git a/pandas/tests/base/test_conversion.py b/pandas/tests/base/test_conversion.py
index 13a3ff048c79e..d8af7abe83084 100644
--- a/pandas/tests/base/test_conversion.py
+++ b/pandas/tests/base/test_conversion.py
@@ -333,7 +333,8 @@ def test_array_multiindex_raises():
         # Timedelta
         (
             TimedeltaArray._from_sequence(
-                np.array([0, 3600000000000], dtype="i8").view("m8[ns]")
+                np.array([0, 3600000000000], dtype="i8").view("m8[ns]"),
+                dtype=np.dtype("m8[ns]"),
             ),
             np.array([0, 3600000000000], dtype="m8[ns]"),
         ),
diff --git a/pandas/tests/dtypes/test_generic.py b/pandas/tests/dtypes/test_generic.py
index 261f86bfb0326..2b90886a8d070 100644
--- a/pandas/tests/dtypes/test_generic.py
+++ b/pandas/tests/dtypes/test_generic.py
@@ -20,8 +20,8 @@ class TestABCClasses:
     df = pd.DataFrame({"names": ["a", "b", "c"]}, index=multi_index)
     sparse_array = pd.arrays.SparseArray(np.random.default_rng(2).standard_normal(10))
 
-    datetime_array = pd.core.arrays.DatetimeArray._from_sequence(datetime_index)
-    timedelta_array = pd.core.arrays.TimedeltaArray._from_sequence(timedelta_index)
+    datetime_array = datetime_index.array
+    timedelta_array = timedelta_index.array
 
     abc_pairs = [
         ("ABCMultiIndex", multi_index),
diff --git a/pandas/tests/series/accessors/test_dt_accessor.py b/pandas/tests/series/accessors/test_dt_accessor.py
index 9b9a8ea3600ae..885adb3543b46 100644
--- a/pandas/tests/series/accessors/test_dt_accessor.py
+++ b/pandas/tests/series/accessors/test_dt_accessor.py
@@ -790,7 +790,8 @@ def test_end_time_timevalues(self, input_vals):
         # GH#17157
         # Check that the time part of the Period is adjusted by end_time
         # when using the dt accessor on a Series
-        input_vals = PeriodArray._from_sequence(np.asarray(input_vals))
+        dtype = pd.PeriodDtype(input_vals[0].freq)
+        input_vals = PeriodArray._from_sequence(np.asarray(input_vals), dtype=dtype)
         ser = Series(input_vals)
 
         result = ser.dt.end_time
From b96491a11b7938c9146a26bfac339a6ebe0ca4a2 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Wed, 25 Sep 2024 09:22:49 -1000
Subject: [PATCH 24/33] DOC: Emphasize team managed pandas in installation docs
 (#59822)

* DOC: Emphasize team managed pandas in installation docs

* grammar

---
 doc/source/development/maintaining.rst         |   2 +-
 doc/source/getting_started/index.rst           |   3 +-
 doc/source/getting_started/install.rst         | 151 ++++++-------------
 web/pandas/getting_started.md                  |  29 +---
 .../static/img/install/anaconda_prompt.png     | Bin 1373 -> 0 bytes
 .../static/img/install/jupyterlab_home.png     | Bin 1962 -> 0 bytes
 .../img/install/pandas_import_and_version.png  | Bin 2252 -> 0 bytes
 7 files changed, 52 insertions(+), 133 deletions(-)
 delete mode 100644 web/pandas/static/img/install/anaconda_prompt.png
 delete mode 100644 web/pandas/static/img/install/jupyterlab_home.png
 delete mode 100644 web/pandas/static/img/install/pandas_import_and_version.png

diff --git a/doc/source/development/maintaining.rst b/doc/source/development/maintaining.rst
index 50d380cab1d50..1e4a851d0e72d 100644
--- a/doc/source/development/maintaining.rst
+++ b/doc/source/development/maintaining.rst
@@ -344,7 +344,7 @@ in the next places:
 - Git repo with a `new tag `_
 - Source distribution in a `GitHub release `_
 - Pip packages in the `PyPI `_
-- Conda/Mamba packages in `conda-forge `_
+- Conda packages in `conda-forge `_
 
 The process for releasing a new version of pandas is detailed in the next section.
diff --git a/doc/source/getting_started/index.rst b/doc/source/getting_started/index.rst
index 36ed553d9d88e..a17699a71fbd3 100644
--- a/doc/source/getting_started/index.rst
+++ b/doc/source/getting_started/index.rst
@@ -17,8 +17,7 @@ Installation
     :columns: 12 12 6 6
     :padding: 3
 
-    pandas is part of the `Anaconda `__
-    distribution and can be installed with Anaconda or Miniconda:
+    pandas can be installed via conda from `conda-forge `__.
 
     ++++++++++++++++++++++
diff --git a/doc/source/getting_started/install.rst b/doc/source/getting_started/install.rst
index 8e6cb9e9a132d..b3982c4ad091f 100644
--- a/doc/source/getting_started/install.rst
+++ b/doc/source/getting_started/install.rst
@@ -6,15 +6,16 @@
 Installation
 ============
 
-The easiest way to install pandas is to install it
-as part of the `Anaconda `__ distribution, a
-cross platform distribution for data analysis and scientific computing.
-The `Conda `__ package manager is the
-recommended installation method for most users.
+The pandas development team officially distributes pandas for installation
+through the following methods:
 
-Instructions for installing :ref:`from source `,
-:ref:`PyPI `, or a
-:ref:`development version ` are also provided.
+* Available on `conda-forge `__ for installation with the conda package manager.
+* Available on `PyPI `__ for installation with pip.
+* Available on `Github `__ for installation from source.
+
+.. note::
+   pandas may be installable from other sources besides the ones listed above,
+   but they are **not** managed by the pandas development team.
 
 .. _install.version:
 
@@ -26,68 +27,54 @@ See :ref:`Python support policy `.
 Installing pandas
 -----------------
 
-.. _install.anaconda:
+.. _install.conda:
 
-Installing with Anaconda
-~~~~~~~~~~~~~~~~~~~~~~~~
+Installing with Conda
+~~~~~~~~~~~~~~~~~~~~~
 
-For users that are new to Python, the easiest way to install Python, pandas, and the
-packages that make up the `PyData `__ stack
-(`SciPy `__, `NumPy `__,
-`Matplotlib `__, `and more `__)
-is with `Anaconda `__, a cross-platform
-(Linux, macOS, Windows) Python distribution for data analytics and
-scientific computing. Installation instructions for Anaconda
-`can be found here `__.
+For users working with the `Conda `__ package manager,
+pandas can be installed from the ``conda-forge`` channel.
 
-.. _install.miniconda:
+.. code-block:: shell
 
-Installing with Miniconda
-~~~~~~~~~~~~~~~~~~~~~~~~~
+   conda install -c conda-forge pandas
 
-For users experienced with Python, the recommended way to install pandas with
-`Miniconda `__.
-Miniconda allows you to create a minimal, self-contained Python installation compared to Anaconda and use the
-`Conda `__ package manager to install additional packages
-and create a virtual environment for your installation. Installation instructions for Miniconda
-`can be found here `__.
+To install the Conda package manager on your system, the
+`Miniforge distribution `__
+is recommended.
 
-The next step is to create a new conda environment. A conda environment is like a
-virtualenv that allows you to specify a specific version of Python and set of libraries.
-Run the following commands from a terminal window.
+Additionally, it is recommended to install and run pandas from a virtual environment.
 
 .. code-block:: shell
 
    conda create -c conda-forge -n name_of_my_env python pandas
-
-This will create a minimal environment with only Python and pandas installed.
-To put your self inside this environment run.
-
-.. code-block:: shell
-
+   # On Linux or MacOS
    source activate name_of_my_env
   # On Windows
   activate name_of_my_env
 
-.. _install.pypi:
+.. tip::
+    For users that are new to Python, the easiest way to install Python, pandas, and the
+    packages that make up the `PyData `__ stack such as
+    `SciPy `__, `NumPy `__ and
+    `Matplotlib `__
+    is with `Anaconda `__, a cross-platform
+    (Linux, macOS, Windows) Python distribution for data analytics and
+    scientific computing.
 
-Installing from PyPI
-~~~~~~~~~~~~~~~~~~~~
+    However, pandas from Anaconda is **not** officially managed by the pandas development team.
 
-pandas can be installed via pip from
-`PyPI `__.
+.. _install.pip:
 
-.. code-block:: shell
-
-   pip install pandas
+Installing with pip
+~~~~~~~~~~~~~~~~~~~
 
-.. note::
-    You must have ``pip>=19.3`` to install from PyPI.
+For users working with the `pip `__ package manager,
+pandas can be installed from `PyPI `__.
 
-.. note::
+.. code-block:: shell
 
-    It is recommended to install and run pandas from a virtual environment, for example,
-    using the Python standard library's `venv `__
+   pip install pandas
 
 pandas can also be installed with sets of optional dependencies to enable certain functionality. For example,
 to install pandas with the optional dependencies to read Excel files.
@@ -98,25 +85,8 @@ to install pandas with the optional dependencies to read Excel files.
 
 The full list of extras that can be installed can be found in the :ref:`dependency section.`
 
-Handling ImportErrors
-~~~~~~~~~~~~~~~~~~~~~
-
-If you encounter an ``ImportError``, it usually means that Python couldn't find pandas in the list of available
-libraries. Python internally has a list of directories it searches through, to find packages. You can
-obtain these directories with.
-
-.. code-block:: python
-
-    import sys
-    sys.path
-
-One way you could be encountering this error is if you have multiple Python installations on your system
-and you don't have pandas installed in the Python installation you're currently using.
-In Linux/Mac you can run ``which python`` on your terminal and it will tell you which Python installation you're
-using. If it's something like "/usr/bin/python", you're using the Python from the system, which is not recommended.
-
-It is highly recommended to use ``conda``, for quick installation and for package and dependency updates.
-You can find simple installation instructions for pandas :ref:`in this document `.
+Additionally, it is recommended to install and run pandas from a virtual environment, for example,
+using the Python standard library's `venv `__
 
 .. _install.source:
 
@@ -144,49 +114,24 @@ index from the PyPI registry of anaconda.org. You can install it by running.
 
     pip install --pre --extra-index https://pypi.anaconda.org/scientific-python-nightly-wheels/simple pandas
 
-Note that you might be required to uninstall an existing version of pandas to install the development version.
+.. note::
+   You might be required to uninstall an existing version of pandas to install the development version.
 
-.. code-block:: shell
+   .. code-block:: shell
 
-    pip uninstall pandas -y
+      pip uninstall pandas -y
 
 Running the test suite
 ----------------------
 
-pandas is equipped with an exhaustive set of unit tests. The packages required to run the tests
-can be installed with ``pip install "pandas[test]"``. To run the tests from a
-Python terminal.
-
-.. code-block:: python
-
-    >>> import pandas as pd
-    >>> pd.test()
-    running: pytest -m "not slow and not network and not db" /home/user/anaconda3/lib/python3.10/site-packages/pandas
-
-    ============================= test session starts ==============================
-    platform linux -- Python 3.9.7, pytest-6.2.5, py-1.11.0, pluggy-1.0.0
-    rootdir: /home/user
-    plugins: dash-1.19.0, anyio-3.5.0, hypothesis-6.29.3
-    collected 154975 items / 4 skipped / 154971 selected
-    ........................................................................ [  0%]
-    ........................................................................ [ 99%]
-    ....................................... [100%]
-
-    ==================================== ERRORS ====================================
-
-    =================================== FAILURES ===================================
-
-    =============================== warnings summary ===============================
-
-    =========================== short test summary info ============================
-
-    = 1 failed, 146194 passed, 7402 skipped, 1367 xfailed, 5 xpassed, 197 warnings, 10 errors in 1090.16s (0:18:10) =
+If pandas has been installed :ref:`from source `, running ``pytest pandas`` will run all of pandas unit tests.
+The unit tests can also be run from the pandas module itself with the :func:`test` function. The packages required to run the tests
+can be installed with ``pip install "pandas[test]"``.
 
 .. note::
 
-    This is just an example of what information is shown. Test failures are not necessarily indicative
-    of a broken pandas installation.
+    Test failures are not necessarily indicative of a broken pandas installation.
 
 .. _install.dependencies:
 
@@ -219,7 +164,7 @@ For example, :func:`pandas.read_hdf` requires the ``pytables`` package, while
 optional dependency is not installed, pandas will raise an ``ImportError`` when
 the method requiring that dependency is called.
 
-If using pip, optional pandas dependencies can be installed or managed in a file (e.g. requirements.txt or pyproject.toml)
+With pip, optional pandas dependencies can be installed or managed in a file (e.g. requirements.txt or pyproject.toml)
 as optional extras (e.g. ``pandas[performance, aws]``). All optional dependencies
 can be installed with ``pandas[all]``, and specific sets of dependencies are listed in the sections below.
diff --git a/web/pandas/getting_started.md b/web/pandas/getting_started.md
index 0c4219e1ae12e..801081a9ef391 100644
--- a/web/pandas/getting_started.md
+++ b/web/pandas/getting_started.md
@@ -2,33 +2,8 @@
 
 ## Installation instructions
 
-The next steps provides the easiest and recommended way to set up your
-environment to use pandas. Other installation options can be found in
-the [advanced installation page]({{ base_url}}docs/getting_started/install.html).
-
-1. Download [Anaconda](https://www.anaconda.com/download/) for your operating system and
-   the latest Python version, run the installer, and follow the steps. Please note:
-
-   - It is not needed (and discouraged) to install Anaconda as root or administrator.
-   - When asked if you wish to initialize Anaconda3, answer yes.
-   - Restart the terminal after completing the installation.
-
-   Detailed instructions on how to install Anaconda can be found in the
-   [Anaconda documentation](https://docs.anaconda.com/anaconda/install/).
-
-2. In the Anaconda prompt (or terminal in Linux or macOS), start JupyterLab:
-
-3. In JupyterLab, create a new (Python 3) notebook:
-
-4. In the first cell of the notebook, you can import pandas and check the version with:
-
-5. Now you are ready to use pandas, and you can write your code in the next cells.
+To install pandas, please reference the [installation page]({{ base_url}}docs/getting_started/install.html)
+from the pandas documentation.
 
 ## Tutorials
diff --git a/web/pandas/static/img/install/anaconda_prompt.png b/web/pandas/static/img/install/anaconda_prompt.png
deleted file mode 100644
index 7b547e4ebb02a6102ecf615ddddf576dc74ccd15..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 1373
[base85 image data omitted]

diff --git a/web/pandas/static/img/install/jupyterlab_home.png b/web/pandas/static/img/install/jupyterlab_home.png
deleted file mode 100644
index c62d33a5e0fc605be6d66c4a7be9f31d9baee8bc..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 1962
[base85 image data omitted]

diff --git a/web/pandas/static/img/install/pandas_import_and_version.png b/web/pandas/static/img/install/pandas_import_and_version.png
deleted file mode 100644
index 64c1303ac495ccf72a7c649401cce26c47c15ace..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 2252
[base85 image data omitted]

From b9488218ae27b70d1669a932ab16e8ce5a257cf0 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Wed, 25 Sep 2024 14:47:10 -1000
Subject: [PATCH 25/33] CI/TST: Check for tzset in set_timezone (#59893)

* CI/TST: Check for tzset in set_timezone

* adjust test message

---
 pandas/_testing/contexts.py         | 17 +++++++++--------
 pandas/tests/tslibs/test_parsing.py | 11 +++++++----
 2 files changed, 16 insertions(+), 12 deletions(-)

diff --git a/pandas/_testing/contexts.py b/pandas/_testing/contexts.py
index 91b5d2a981bef..4ca67d6fc082d 100644
--- a/pandas/_testing/contexts.py
+++ b/pandas/_testing/contexts.py
@@ -73,14 +73,15 @@ def set_timezone(tz: str) -> Generator[None, None, None]:
     import time
 
     def setTZ(tz) -> None:
-        if tz is None:
-            try:
-                del os.environ["TZ"]
-            except KeyError:
-                pass
-        else:
-            os.environ["TZ"] = tz
-        time.tzset()
+        if hasattr(time, "tzset"):
+            if tz is None:
+                try:
+                    del os.environ["TZ"]
+                except KeyError:
+                    pass
+            else:
+                os.environ["TZ"] = tz
+            time.tzset()
 
     orig_tz = os.environ.get("TZ")
     setTZ(tz)
diff --git a/pandas/tests/tslibs/test_parsing.py b/pandas/tests/tslibs/test_parsing.py
index 9b64beaf09273..07425af8ed37a 100644
--- a/pandas/tests/tslibs/test_parsing.py
+++ b/pandas/tests/tslibs/test_parsing.py
@@ -37,10 +37,13 @@
 )
 def test_parsing_tzlocal_deprecated():
     # GH#50791
-    msg = (
-        r"Parsing 'EST' as tzlocal \(dependent on system timezone\) "
-        r"is no longer supported\. "
-        "Pass the 'tz' keyword or call tz_localize after construction instead"
+    msg = "|".join(
+        [
+            r"Parsing 'EST' as tzlocal \(dependent on system timezone\) "
+            r"is no longer supported\. "
+            "Pass the 'tz' keyword or call tz_localize after construction instead",
+            ".*included an un-recognized timezone",
+        ]
     )
 
     dtstr = "Jan 15 2004 03:00 EST"
code-block:: shell git checkout main git fetch upstream git merge upstream/main - mamba activate pandas-dev - mamba env update -f environment.yml --prune + conda activate pandas-dev + conda env update -f environment.yml --prune If using :ref:`pip ` , do: diff --git a/doc/source/development/contributing_codebase.rst b/doc/source/development/contributing_codebase.rst index 9d5a992e911b6..670ffe6996302 100644 --- a/doc/source/development/contributing_codebase.rst +++ b/doc/source/development/contributing_codebase.rst @@ -244,7 +244,7 @@ in your python environment. .. warning:: - * Please be aware that the above commands will use the current python environment. If your python packages are older/newer than those installed by the pandas CI, the above commands might fail. This is often the case when the ``mypy`` or ``numpy`` versions do not match. Please see :ref:`how to setup the python environment ` or select a `recently succeeded workflow `_, select the "Docstring validation, typing, and other manual pre-commit hooks" job, then click on "Set up Conda" and "Environment info" to see which versions the pandas CI installs. + * Please be aware that the above commands will use the current python environment. If your python packages are older/newer than those installed by the pandas CI, the above commands might fail. This is often the case when the ``mypy`` or ``numpy`` versions do not match. Please see :ref:`how to setup the python environment ` or select a `recently succeeded workflow `_, select the "Docstring validation, typing, and other manual pre-commit hooks" job, then click on "Set up Conda" and "Environment info" to see which versions the pandas CI installs. .. _contributing.ci: diff --git a/doc/source/development/contributing_environment.rst b/doc/source/development/contributing_environment.rst index 643021db7b823..1426d3a84a748 100644 --- a/doc/source/development/contributing_environment.rst +++ b/doc/source/development/contributing_environment.rst @@ -43,7 +43,7 @@ and consult the ``Linux`` instructions below. **macOS** -To use the :ref:`mamba `-based compilers, you will need to install the +To use the :ref:`conda `-based compilers, you will need to install the Developer Tools using ``xcode-select --install``. If you prefer to use a different compiler, general information can be found here: @@ -51,9 +51,9 @@ https://devguide.python.org/setup/#macos **Linux** -For Linux-based :ref:`mamba ` installations, you won't have to install any -additional components outside of the mamba environment. The instructions -below are only needed if your setup isn't based on mamba environments. +For Linux-based :ref:`conda ` installations, you won't have to install any +additional components outside of the conda environment. The instructions +below are only needed if your setup isn't based on conda environments. Some Linux distributions will come with a pre-installed C compiler. To find out which compilers (and versions) are installed on your system:: @@ -82,19 +82,18 @@ Before we begin, please: * Make sure that you have :any:`cloned the repository ` * ``cd`` to the pandas source directory you just created with the clone command -.. _contributing.mamba: +.. 
_contributing.conda: -Option 1: using mamba (recommended) +Option 1: using conda (recommended) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -* Install miniforge to get `mamba `_ -* Make sure your mamba is up to date (``mamba update mamba``) -* Create and activate the ``pandas-dev`` mamba environment using the following commands: +* Install miniforge to get `conda `_ +* Create and activate the ``pandas-dev`` conda environment using the following commands: -.. code-block:: none +.. code-block:: bash - mamba env create --file environment.yml - mamba activate pandas-dev + conda env create --file environment.yml + conda activate pandas-dev .. _contributing.pip: From 5ced458f6318f0319877ab655b8cb6b86092ea62 Mon Sep 17 00:00:00 2001 From: Richard Shadrach <45562402+rhshadrach@users.noreply.github.com> Date: Sat, 28 Sep 2024 07:51:30 -0400 Subject: [PATCH 27/33] CI: Pin micromamba to 1.x (#59912) --- .github/actions/setup-conda/action.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/actions/setup-conda/action.yml b/.github/actions/setup-conda/action.yml index 3eb68bdd2a15c..4fe901998cbcc 100644 --- a/.github/actions/setup-conda/action.yml +++ b/.github/actions/setup-conda/action.yml @@ -9,6 +9,8 @@ runs: - name: Install ${{ inputs.environment-file }} uses: mamba-org/setup-micromamba@v1 with: + # Pinning to avoid 2.0 failures + micromamba-version: '1.5.10-0' environment-file: ${{ inputs.environment-file }} environment-name: test condarc-file: ci/.condarc From 96de1f13103cd21417101de9d555f203cf93867a Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Sun, 29 Sep 2024 00:07:34 +0530 Subject: [PATCH 28/33] DOC: fix SA01, ES01 for pandas.Series.sparse.npoints (#59896) * DOC: fix SA01, ES01 for pandas.Series.sparse.npoints * Update pandas/core/arrays/sparse/array.py --- ci/code_checks.sh | 1 - pandas/core/arrays/sparse/array.py | 12 ++++++++++++ 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index f662b4781e84b..149c5c0326733 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -97,7 +97,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.Series.dt.unit GL08" \ -i "pandas.Series.pad PR01,SA01" \ -i "pandas.Series.sparse.from_coo PR07,SA01" \ - -i "pandas.Series.sparse.npoints SA01" \ -i "pandas.Timedelta.max PR02" \ -i "pandas.Timedelta.min PR02" \ -i "pandas.Timedelta.resolution PR02" \ diff --git a/pandas/core/arrays/sparse/array.py b/pandas/core/arrays/sparse/array.py index c8ec4068ca199..0c76280e7fdb4 100644 --- a/pandas/core/arrays/sparse/array.py +++ b/pandas/core/arrays/sparse/array.py @@ -708,6 +708,18 @@ def npoints(self) -> int: """ The number of non- ``fill_value`` points. + This property returns the number of elements in the sparse series that are + not equal to the ``fill_value``. Sparse data structures store only the + non-``fill_value`` elements, reducing memory usage when the majority of + values are the same. + + See Also + -------- + Series.sparse.to_dense : Convert a Series from sparse values to dense. + Series.sparse.fill_value : Elements in ``data`` that are ``fill_value`` are + not stored. + Series.sparse.density : The percent of non- ``fill_value`` points, as decimal. 
+ Examples -------- >>> from pandas.arrays import SparseArray From cf12e6722cfaba646e7f0a1e5e8db88be8d076cd Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Sun, 29 Sep 2024 00:08:55 +0530 Subject: [PATCH 29/33] DOC: fix RT03, ES01 for pandas.core.groupby.DataFrameGroupBy.agg and pandas.core.groupby.DataFrameGroupBy.aggregate (#59869) * DOC: add double backticks for sphinx compatibility Co-authored-by: mroeschke * DOC: remove _agg_template_frame Co-authored-by: mroeschke Co-authored-by: rhshadrach * DOC: fix RT03, ES01 for pandas.core.groupby.DataFrameGroupBy.aggregate --------- Co-authored-by: mroeschke Co-authored-by: rhshadrach --- ci/code_checks.sh | 2 - pandas/core/groupby/generic.py | 176 +++++++++++++++++++++++++- pandas/core/groupby/groupby.py | 78 ------------ scripts/validate_unwanted_patterns.py | 1 - 4 files changed, 174 insertions(+), 83 deletions(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 149c5c0326733..669c793737161 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -127,8 +127,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.arrays.SparseArray PR07,SA01" \ -i "pandas.arrays.TimedeltaArray PR07,SA01" \ -i "pandas.core.groupby.DataFrameGroupBy.__iter__ RT03,SA01" \ - -i "pandas.core.groupby.DataFrameGroupBy.agg RT03" \ - -i "pandas.core.groupby.DataFrameGroupBy.aggregate RT03" \ -i "pandas.core.groupby.DataFrameGroupBy.boxplot PR07,RT03,SA01" \ -i "pandas.core.groupby.DataFrameGroupBy.get_group RT03,SA01" \ -i "pandas.core.groupby.DataFrameGroupBy.groups SA01" \ diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index bec9d344d42e2..0c211afb5073c 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -67,7 +67,6 @@ from pandas.core.groupby.groupby import ( GroupBy, GroupByPlot, - _agg_template_frame, _agg_template_series, _transform_template, ) @@ -1515,8 +1514,181 @@ class DataFrameGroupBy(GroupBy[DataFrame]): """ ) - @doc(_agg_template_frame, examples=_agg_examples_doc, klass="DataFrame") def aggregate(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs): + """ + Aggregate using one or more operations. + + The ``aggregate`` function allows the application of one or more aggregation + operations on groups of data within a DataFrameGroupBy object. It supports + various aggregation methods, including user-defined functions and predefined + functions such as 'sum', 'mean', etc. + + Parameters + ---------- + func : function, str, list, dict or None + Function to use for aggregating the data. If a function, must either + work when passed a DataFrame or when passed to DataFrame.apply. + + Accepted combinations are: + + - function + - string function name + - list of functions and/or function names, e.g. ``[np.sum, 'mean']`` + - dict of index labels -> functions, function names or list of such. + - None, in which case ``**kwargs`` are used with Named Aggregation. Here the + output has one column for each element in ``**kwargs``. The name of the + column is keyword, whereas the value determines the aggregation used to + compute the values in the column. + + Can also accept a Numba JIT function with + ``engine='numba'`` specified. Only passing a single function is supported + with this engine. + + If the ``'numba'`` engine is chosen, the function must be + a user defined function with ``values`` and ``index`` as the + first and second arguments respectively in the function signature. 
+ Each group's index will be passed to the user defined function + and optionally available for use. + + *args + Positional arguments to pass to func. + engine : str, default None + * ``'cython'`` : Runs the function through C-extensions from cython. + * ``'numba'`` : Runs the function through JIT compiled code from numba. + * ``None`` : Defaults to ``'cython'`` or globally setting + ``compute.use_numba`` + + engine_kwargs : dict, default None + * For ``'cython'`` engine, there are no accepted ``engine_kwargs`` + * For ``'numba'`` engine, the engine can accept ``nopython``, ``nogil`` + and ``parallel`` dictionary keys. The values must either be ``True`` or + ``False``. The default ``engine_kwargs`` for the ``'numba'`` engine is + ``{'nopython': True, 'nogil': False, 'parallel': False}`` and will be + applied to the function + + **kwargs + * If ``func`` is None, ``**kwargs`` are used to define the output names and + aggregations via Named Aggregation. See ``func`` entry. + * Otherwise, keyword arguments to be passed into func. + + Returns + ------- + DataFrame + Aggregated DataFrame based on the grouping and the applied aggregation + functions. + + See Also + -------- + DataFrame.groupby.apply : Apply function func group-wise + and combine the results together. + DataFrame.groupby.transform : Transforms the Series on each group + based on the given function. + DataFrame.aggregate : Aggregate using one or more operations. + + Notes + ----- + When using ``engine='numba'``, there will be no "fall back" behavior internally. + The group data and group index will be passed as numpy arrays to the JITed + user defined function, and no alternative execution attempts will be tried. + + Functions that mutate the passed object can produce unexpected + behavior or errors and are not supported. See :ref:`gotchas.udf-mutation` + for more details. + + .. versionchanged:: 1.3.0 + + The resulting dtype will reflect the return value of the passed ``func``, + see the examples below. + + Examples + -------- + >>> data = { + ... "A": [1, 1, 2, 2], + ... "B": [1, 2, 3, 4], + ... "C": [0.362838, 0.227877, 1.267767, -0.562860], + ... } + >>> df = pd.DataFrame(data) + >>> df + A B C + 0 1 1 0.362838 + 1 1 2 0.227877 + 2 2 3 1.267767 + 3 2 4 -0.562860 + + The aggregation is for each column. + + >>> df.groupby("A").agg("min") + B C + A + 1 1 0.227877 + 2 3 -0.562860 + + Multiple aggregations + + >>> df.groupby("A").agg(["min", "max"]) + B C + min max min max + A + 1 1 2 0.227877 0.362838 + 2 3 4 -0.562860 1.267767 + + Select a column for aggregation + + >>> df.groupby("A").B.agg(["min", "max"]) + min max + A + 1 1 2 + 2 3 4 + + User-defined function for aggregation + + >>> df.groupby("A").agg(lambda x: sum(x) + 2) + B C + A + 1 5 2.590715 + 2 9 2.704907 + + Different aggregations per column + + >>> df.groupby("A").agg({"B": ["min", "max"], "C": "sum"}) + B C + min max sum + A + 1 1 2 0.590715 + 2 3 4 0.704907 + + To control the output names with different aggregations per column, + pandas supports "named aggregation" + + >>> df.groupby("A").agg( + ... b_min=pd.NamedAgg(column="B", aggfunc="min"), + ... c_sum=pd.NamedAgg(column="C", aggfunc="sum"), + ... ) + b_min c_sum + A + 1 1 0.590715 + 2 3 0.704907 + + - The keywords are the *output* column names + - The values are tuples whose first element is the column to select + and the second element is the aggregation to apply to that column. 
+ Pandas provides the ``pandas.NamedAgg`` namedtuple with the fields + ``['column', 'aggfunc']`` to make it clearer what the arguments are. + As usual, the aggregation can be a callable or a string alias. + + See :ref:`groupby.aggregate.named` for more. + + .. versionchanged:: 1.3.0 + + The resulting dtype will reflect the return value of the aggregating + function. + + >>> df.groupby("A")[["B"]].agg(lambda x: x.astype(float).min()) + B + A + 1 1.0 + 2 3.0 + """ relabeling, func, columns, order = reconstruct_func(func, **kwargs) func = maybe_mangle_lambdas(func) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 38dad446b4c39..9e36837bc679f 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -445,84 +445,6 @@ class providing the base-class of operations. see the examples below. {examples}""" -_agg_template_frame = """ -Aggregate using one or more operations. - -Parameters ----------- -func : function, str, list, dict or None - Function to use for aggregating the data. If a function, must either - work when passed a {klass} or when passed to {klass}.apply. - - Accepted combinations are: - - - function - - string function name - - list of functions and/or function names, e.g. ``[np.sum, 'mean']`` - - dict of index labels -> functions, function names or list of such. - - None, in which case ``**kwargs`` are used with Named Aggregation. Here the - output has one column for each element in ``**kwargs``. The name of the - column is keyword, whereas the value determines the aggregation used to compute - the values in the column. - - Can also accept a Numba JIT function with - ``engine='numba'`` specified. Only passing a single function is supported - with this engine. - - If the ``'numba'`` engine is chosen, the function must be - a user defined function with ``values`` and ``index`` as the - first and second arguments respectively in the function signature. - Each group's index will be passed to the user defined function - and optionally available for use. - -*args - Positional arguments to pass to func. -engine : str, default None - * ``'cython'`` : Runs the function through C-extensions from cython. - * ``'numba'`` : Runs the function through JIT compiled code from numba. - * ``None`` : Defaults to ``'cython'`` or globally setting ``compute.use_numba`` - -engine_kwargs : dict, default None - * For ``'cython'`` engine, there are no accepted ``engine_kwargs`` - * For ``'numba'`` engine, the engine can accept ``nopython``, ``nogil`` - and ``parallel`` dictionary keys. The values must either be ``True`` or - ``False``. The default ``engine_kwargs`` for the ``'numba'`` engine is - ``{{'nopython': True, 'nogil': False, 'parallel': False}}`` and will be - applied to the function - -**kwargs - * If ``func`` is None, ``**kwargs`` are used to define the output names and - aggregations via Named Aggregation. See ``func`` entry. - * Otherwise, keyword arguments to be passed into func. - -Returns -------- -{klass} - -See Also --------- -{klass}.groupby.apply : Apply function func group-wise - and combine the results together. -{klass}.groupby.transform : Transforms the Series on each group - based on the given function. -{klass}.aggregate : Aggregate using one or more operations. - -Notes ------ -When using ``engine='numba'``, there will be no "fall back" behavior internally. -The group data and group index will be passed as numpy arrays to the JITed -user defined function, and no alternative execution attempts will be tried. 
- -Functions that mutate the passed object can produce unexpected -behavior or errors and are not supported. See :ref:`gotchas.udf-mutation` -for more details. - -.. versionchanged:: 1.3.0 - - The resulting dtype will reflect the return value of the passed ``func``, - see the examples below. -{examples}""" - @final class GroupByPlot(PandasObject): diff --git a/scripts/validate_unwanted_patterns.py b/scripts/validate_unwanted_patterns.py index 35f6ffb4980df..5962709056ae8 100755 --- a/scripts/validate_unwanted_patterns.py +++ b/scripts/validate_unwanted_patterns.py @@ -30,7 +30,6 @@ "_new_Index", "_new_PeriodIndex", "_agg_template_series", - "_agg_template_frame", "_pipe_template", "_apply_groupings_depr", "__main__", From d538a1cd1ad5d1e506c2dc36144e4cac5534858a Mon Sep 17 00:00:00 2001 From: Tuhin Sharma Date: Sun, 29 Sep 2024 01:08:32 +0530 Subject: [PATCH 30/33] DOC: fix RT03, ES01 for pandas.core.groupby.SeriesGroupBy.agg and pandas.core.groupby.SeriesGroupBy.aggregate (#59898) * DOC: fix RT03, ES01 for pandas.core.groupby.SeriesGroupBy.agg * DOC: remove _agg_template_series Co-authored-by: mroeschke Co-authored-by: rhshadrach * DOC: remove _agg_template_series Co-authored-by: mroeschke Co-authored-by: rhshadrach * DOC: remove _agg_template_seris --------- Co-authored-by: mroeschke Co-authored-by: rhshadrach --- ci/code_checks.sh | 2 - pandas/core/groupby/generic.py | 136 +++++++++++++++++++++++++- pandas/core/groupby/groupby.py | 81 --------------- scripts/validate_unwanted_patterns.py | 1 - 4 files changed, 134 insertions(+), 86 deletions(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 669c793737161..b65dcedbd8a10 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -137,8 +137,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.core.groupby.DataFrameGroupBy.plot PR02" \ -i "pandas.core.groupby.DataFrameGroupBy.sem SA01" \ -i "pandas.core.groupby.SeriesGroupBy.__iter__ RT03,SA01" \ - -i "pandas.core.groupby.SeriesGroupBy.agg RT03" \ - -i "pandas.core.groupby.SeriesGroupBy.aggregate RT03" \ -i "pandas.core.groupby.SeriesGroupBy.get_group RT03,SA01" \ -i "pandas.core.groupby.SeriesGroupBy.groups SA01" \ -i "pandas.core.groupby.SeriesGroupBy.indices SA01" \ diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 0c211afb5073c..110c0ea88a0a1 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -67,7 +67,6 @@ from pandas.core.groupby.groupby import ( GroupBy, GroupByPlot, - _agg_template_series, _transform_template, ) from pandas.core.indexes.api import ( @@ -323,8 +322,141 @@ def apply(self, func, *args, **kwargs) -> Series: """ return super().apply(func, *args, **kwargs) - @doc(_agg_template_series, examples=_agg_examples_doc, klass="Series") def aggregate(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs): + """ + Aggregate using one or more operations. + + The ``aggregate`` method enables flexible and efficient aggregation of grouped + data using a variety of functions, including built-in, user-defined, and + optimized JIT-compiled functions. + + Parameters + ---------- + func : function, str, list, dict or None + Function to use for aggregating the data. If a function, must either + work when passed a Series or when passed to Series.apply. + + Accepted combinations are: + + - function + - string function name + - list of functions and/or function names, e.g. ``[np.sum, 'mean']`` + - None, in which case ``**kwargs`` are used with Named Aggregation. 
Here + the output has one column for each element in ``**kwargs``. The name of + the column is keyword, whereas the value determines the aggregation + used to compute the values in the column. + + Can also accept a Numba JIT function with + ``engine='numba'`` specified. Only passing a single function is supported + with this engine. + + If the ``'numba'`` engine is chosen, the function must be + a user defined function with ``values`` and ``index`` as the + first and second arguments respectively in the function signature. + Each group's index will be passed to the user defined function + and optionally available for use. + + .. deprecated:: 2.1.0 + + Passing a dictionary is deprecated and will raise in a future version + of pandas. Pass a list of aggregations instead. + *args + Positional arguments to pass to func. + engine : str, default None + * ``'cython'`` : Runs the function through C-extensions from cython. + * ``'numba'`` : Runs the function through JIT compiled code from numba. + * ``None`` : Defaults to ``'cython'`` or globally setting + ``compute.use_numba`` + + engine_kwargs : dict, default None + * For ``'cython'`` engine, there are no accepted ``engine_kwargs`` + * For ``'numba'`` engine, the engine can accept ``nopython``, ``nogil`` + and ``parallel`` dictionary keys. The values must either be ``True`` or + ``False``. The default ``engine_kwargs`` for the ``'numba'`` engine is + ``{'nopython': True, 'nogil': False, 'parallel': False}`` and will be + applied to the function + + **kwargs + * If ``func`` is None, ``**kwargs`` are used to define the output names and + aggregations via Named Aggregation. See ``func`` entry. + * Otherwise, keyword arguments to be passed into func. + + Returns + ------- + Series + Aggregated Series based on the grouping and the applied aggregation + functions. + + See Also + -------- + SeriesGroupBy.apply : Apply function func group-wise + and combine the results together. + SeriesGroupBy.transform : Transforms the Series on each group + based on the given function. + Series.aggregate : Aggregate using one or more operations. + + Notes + ----- + When using ``engine='numba'``, there will be no "fall back" behavior internally. + The group data and group index will be passed as numpy arrays to the JITed + user defined function, and no alternative execution attempts will be tried. + + Functions that mutate the passed object can produce unexpected + behavior or errors and are not supported. See :ref:`gotchas.udf-mutation` + for more details. + + .. versionchanged:: 1.3.0 + + The resulting dtype will reflect the return value of the passed ``func``, + see the examples below. + + Examples + -------- + >>> s = pd.Series([1, 2, 3, 4]) + + >>> s + 0 1 + 1 2 + 2 3 + 3 4 + dtype: int64 + + >>> s.groupby([1, 1, 2, 2]).min() + 1 1 + 2 3 + dtype: int64 + + >>> s.groupby([1, 1, 2, 2]).agg("min") + 1 1 + 2 3 + dtype: int64 + + >>> s.groupby([1, 1, 2, 2]).agg(["min", "max"]) + min max + 1 1 2 + 2 3 4 + + The output column names can be controlled by passing + the desired column names and aggregations as keyword arguments. + + >>> s.groupby([1, 1, 2, 2]).agg( + ... minimum="min", + ... maximum="max", + ... ) + minimum maximum + 1 1 2 + 2 3 4 + + .. versionchanged:: 1.3.0 + + The resulting dtype will reflect the return value of the aggregating + function. 
+ + >>> s.groupby([1, 1, 2, 2]).agg(lambda x: x.astype(float).min()) + 1 1.0 + 2 3.0 + dtype: float64 + """ relabeling = func is None columns = None if relabeling: diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 9e36837bc679f..e2410788ea95e 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -364,87 +364,6 @@ class providing the base-class of operations. -------- %(example)s""" -_agg_template_series = """ -Aggregate using one or more operations. - -Parameters ----------- -func : function, str, list, dict or None - Function to use for aggregating the data. If a function, must either - work when passed a {klass} or when passed to {klass}.apply. - - Accepted combinations are: - - - function - - string function name - - list of functions and/or function names, e.g. ``[np.sum, 'mean']`` - - None, in which case ``**kwargs`` are used with Named Aggregation. Here the - output has one column for each element in ``**kwargs``. The name of the - column is keyword, whereas the value determines the aggregation used to compute - the values in the column. - - Can also accept a Numba JIT function with - ``engine='numba'`` specified. Only passing a single function is supported - with this engine. - - If the ``'numba'`` engine is chosen, the function must be - a user defined function with ``values`` and ``index`` as the - first and second arguments respectively in the function signature. - Each group's index will be passed to the user defined function - and optionally available for use. - - .. deprecated:: 2.1.0 - - Passing a dictionary is deprecated and will raise in a future version - of pandas. Pass a list of aggregations instead. -*args - Positional arguments to pass to func. -engine : str, default None - * ``'cython'`` : Runs the function through C-extensions from cython. - * ``'numba'`` : Runs the function through JIT compiled code from numba. - * ``None`` : Defaults to ``'cython'`` or globally setting ``compute.use_numba`` - -engine_kwargs : dict, default None - * For ``'cython'`` engine, there are no accepted ``engine_kwargs`` - * For ``'numba'`` engine, the engine can accept ``nopython``, ``nogil`` - and ``parallel`` dictionary keys. The values must either be ``True`` or - ``False``. The default ``engine_kwargs`` for the ``'numba'`` engine is - ``{{'nopython': True, 'nogil': False, 'parallel': False}}`` and will be - applied to the function - -**kwargs - * If ``func`` is None, ``**kwargs`` are used to define the output names and - aggregations via Named Aggregation. See ``func`` entry. - * Otherwise, keyword arguments to be passed into func. - -Returns -------- -{klass} - -See Also --------- -{klass}GroupBy.apply : Apply function func group-wise - and combine the results together. -{klass}GroupBy.transform : Transforms the Series on each group - based on the given function. -{klass}.aggregate : Aggregate using one or more operations. - -Notes ------ -When using ``engine='numba'``, there will be no "fall back" behavior internally. -The group data and group index will be passed as numpy arrays to the JITed -user defined function, and no alternative execution attempts will be tried. - -Functions that mutate the passed object can produce unexpected -behavior or errors and are not supported. See :ref:`gotchas.udf-mutation` -for more details. - -.. versionchanged:: 1.3.0 - - The resulting dtype will reflect the return value of the passed ``func``, - see the examples below. 
-{examples}""" - @final class GroupByPlot(PandasObject): diff --git a/scripts/validate_unwanted_patterns.py b/scripts/validate_unwanted_patterns.py index 5962709056ae8..076acc359f933 100755 --- a/scripts/validate_unwanted_patterns.py +++ b/scripts/validate_unwanted_patterns.py @@ -29,7 +29,6 @@ "_shared_docs", "_new_Index", "_new_PeriodIndex", - "_agg_template_series", "_pipe_template", "_apply_groupings_depr", "__main__", From 34f546f8e73386659457fec0b3fa1ef5b0c6d569 Mon Sep 17 00:00:00 2001 From: Deepak Saldanha Date: Sun, 29 Sep 2024 23:05:45 +0530 Subject: [PATCH 31/33] DOC: fix docstrings for multiple api.types methods (#59920) fix docstrings for api.types --- ci/code_checks.sh | 5 --- pandas/core/dtypes/inference.py | 63 ++++++++++++++++++++++++++++++--- 2 files changed, 59 insertions(+), 9 deletions(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index b65dcedbd8a10..2b3e83d64ab21 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -107,14 +107,9 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.Timestamp.resolution PR02" \ -i "pandas.Timestamp.tzinfo GL08" \ -i "pandas.Timestamp.year GL08" \ - -i "pandas.api.types.is_dict_like PR07,SA01" \ - -i "pandas.api.types.is_file_like PR07,SA01" \ -i "pandas.api.types.is_float PR01,SA01" \ - -i "pandas.api.types.is_hashable PR01,RT03,SA01" \ -i "pandas.api.types.is_integer PR01,SA01" \ -i "pandas.api.types.is_iterator PR07,SA01" \ - -i "pandas.api.types.is_named_tuple PR07,SA01" \ - -i "pandas.api.types.is_re PR07,SA01" \ -i "pandas.api.types.is_re_compilable PR07,SA01" \ -i "pandas.api.types.pandas_dtype PR07,RT03,SA01" \ -i "pandas.arrays.ArrowExtensionArray PR07,SA01" \ diff --git a/pandas/core/dtypes/inference.py b/pandas/core/dtypes/inference.py index f042911b53d2b..6adb34ff0f777 100644 --- a/pandas/core/dtypes/inference.py +++ b/pandas/core/dtypes/inference.py @@ -113,13 +113,24 @@ def is_file_like(obj: object) -> bool: Parameters ---------- - obj : The object to check + obj : object + The object to check for file-like properties. + This can be any Python object, and the function will + check if it has attributes typically associated with + file-like objects (e.g., `read`, `write`, `__iter__`). Returns ------- bool Whether `obj` has file-like properties. + See Also + -------- + api.types.is_dict_like : Check if the object is dict-like. + api.types.is_hashable : Return True if hash(obj) will succeed, False otherwise. + api.types.is_named_tuple : Check if the object is a named tuple. + api.types.is_iterator : Check if the object is an iterator. + Examples -------- >>> import io @@ -142,13 +153,24 @@ def is_re(obj: object) -> TypeGuard[Pattern]: Parameters ---------- - obj : The object to check + obj : object + The object to check for being a regex pattern. Typically, + this would be an object that you expect to be a compiled + pattern from the `re` module. Returns ------- bool Whether `obj` is a regex pattern. + See Also + -------- + api.types.is_float : Return True if given object is float. + api.types.is_iterator : Check if the object is an iterator. + api.types.is_integer : Return True if given object is integer. + api.types.is_re_compilable : Check if the object can be compiled + into a regex pattern instance. + Examples -------- >>> from pandas.api.types import is_re @@ -275,13 +297,22 @@ def is_dict_like(obj: object) -> bool: Parameters ---------- - obj : The object to check + obj : object + The object to check. 
This can be any Python object, + and the function will determine whether it + behaves like a dictionary. Returns ------- bool Whether `obj` has dict-like properties. + See Also + -------- + api.types.is_list_like : Check if the object is list-like. + api.types.is_file_like : Check if the object is a file-like. + api.types.is_named_tuple : Check if the object is a named tuple. + Examples -------- >>> from pandas.api.types import is_dict_like @@ -308,13 +339,22 @@ def is_named_tuple(obj: object) -> bool: Parameters ---------- - obj : The object to check + obj : object + The object that will be checked to determine + whether it is a named tuple. Returns ------- bool Whether `obj` is a named tuple. + See Also + -------- + api.types.is_dict_like: Check if the object is dict-like. + api.types.is_hashable: Return True if hash(obj) + will succeed, False otherwise. + api.types.is_categorical_dtype : Check if the dtype is categorical. + Examples -------- >>> from collections import namedtuple @@ -340,9 +380,24 @@ def is_hashable(obj: object) -> TypeGuard[Hashable]: Distinguish between these and other types by trying the call to hash() and seeing if they raise TypeError. + Parameters + ---------- + obj : object + The object to check for hashability. Any Python object can be passed here. + Returns ------- bool + True if object can be hashed (i.e., does not raise TypeError when + passed to hash()), and False otherwise (e.g., if object is mutable + like a list or dictionary). + + See Also + -------- + api.types.is_float : Return True if given object is float. + api.types.is_iterator : Check if the object is an iterator. + api.types.is_list_like : Check if the object is list-like. + api.types.is_dict_like : Check if the object is dict-like. Examples -------- From 5b35c77041a74b53ebd7c330ca5930fa22929726 Mon Sep 17 00:00:00 2001 From: gameofby Date: Mon, 30 Sep 2024 01:36:34 +0800 Subject: [PATCH 32/33] DOC: the table name should be `air_quality_parameters` rather than `air_quality_parameters_name` (#59918) --- .../getting_started/intro_tutorials/08_combine_dataframes.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/getting_started/intro_tutorials/08_combine_dataframes.rst b/doc/source/getting_started/intro_tutorials/08_combine_dataframes.rst index 05729809491b5..024300bb8a9b0 100644 --- a/doc/source/getting_started/intro_tutorials/08_combine_dataframes.rst +++ b/doc/source/getting_started/intro_tutorials/08_combine_dataframes.rst @@ -271,7 +271,7 @@ Add the parameters' full description and name, provided by the parameters metada Compared to the previous example, there is no common column name. However, the ``parameter`` column in the ``air_quality`` table and the -``id`` column in the ``air_quality_parameters_name`` both provide the +``id`` column in the ``air_quality_parameters`` table both provide the measured variable in a common format. The ``left_on`` and ``right_on`` arguments are used here (instead of just ``on``) to make the link between the two tables. 
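
The ``left_on``/``right_on`` pattern that this doc fix describes joins two tables
whose key columns carry different names. A minimal sketch of the idea, using
invented frames rather than the tutorial's actual air-quality data:

>>> import pandas as pd
>>> air_quality = pd.DataFrame({"parameter": ["no2", "pm25"], "value": [20.0, 5.0]})
>>> air_quality_parameters = pd.DataFrame(
...     {"id": ["no2", "pm25"], "name": ["Nitrogen dioxide", "Particulate matter"]}
... )
>>> merged = air_quality.merge(
...     air_quality_parameters, how="left", left_on="parameter", right_on="id"
... )
>>> merged["name"].tolist()
['Nitrogen dioxide', 'Particulate matter']

Both key columns survive in the result (here ``parameter`` and ``id``), whereas
``on`` would require the two tables to share a column name.
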
From 90c26ce7ce04d97fdabb394e604ecee0a558c019 Mon Sep 17 00:00:00 2001 From: Deepak Saldanha Date: Mon, 30 Sep 2024 00:25:17 +0530 Subject: [PATCH 33/33] DOC: Separate out examples - pandas.str.is methods (#59850) --- pandas/core/strings/accessor.py | 193 ++++++++++++++++++++++++++------ 1 file changed, 156 insertions(+), 37 deletions(-) diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py index 6d10365a1b968..10117aa6bf503 100644 --- a/pandas/core/strings/accessor.py +++ b/pandas/core/strings/accessor.py @@ -3443,10 +3443,10 @@ def casefold(self): Series or Index of bool Series or Index of boolean values with the same length as the original Series/Index. - + """ + _shared_docs["isalpha"] = """ See Also -------- - Series.str.isalpha : Check whether all characters are alphabetic. Series.str.isnumeric : Check whether all characters are numeric. Series.str.isalnum : Check whether all characters are alphanumeric. Series.str.isdigit : Check whether all characters are digits. @@ -3458,24 +3458,56 @@ def casefold(self): Examples -------- - **Checks for Alphabetic and Numeric Characters** >>> s1 = pd.Series(['one', 'one1', '1', '']) - >>> s1.str.isalpha() 0 True 1 False 2 False 3 False dtype: bool + """ + _shared_docs["isnumeric"] = """ + See Also + -------- + Series.str.isalpha : Check whether all characters are alphabetic. + Series.str.isalnum : Check whether all characters are alphanumeric. + Series.str.isdigit : Check whether all characters are digits. + Series.str.isdecimal : Check whether all characters are decimal. + Series.str.isspace : Check whether all characters are whitespace. + Series.str.islower : Check whether all characters are lowercase. + Series.str.isupper : Check whether all characters are uppercase. + Series.str.istitle : Check whether all characters are titlecase. + + Examples + -------- + The ``s.str.isnumeric`` method is the same as ``s3.str.isdigit`` but + also includes other characters that can represent quantities such as + unicode fractions. + >>> s1 = pd.Series(['one', 'one1', '1', '']) >>> s1.str.isnumeric() 0 False 1 False 2 True 3 False dtype: bool + """ + _shared_docs["isalnum"] = """ + See Also + -------- + Series.str.isalpha : Check whether all characters are alphabetic. + Series.str.isnumeric : Check whether all characters are numeric. + Series.str.isdigit : Check whether all characters are digits. + Series.str.isdecimal : Check whether all characters are decimal. + Series.str.isspace : Check whether all characters are whitespace. + Series.str.islower : Check whether all characters are lowercase. + Series.str.isupper : Check whether all characters are uppercase. + Series.str.istitle : Check whether all characters are titlecase. + Examples + -------- + >>> s1 = pd.Series(['one', 'one1', '1', '']) >>> s1.str.isalnum() 0 True 1 True @@ -3492,47 +3524,72 @@ def casefold(self): 1 False 2 False dtype: bool + """ + _shared_docs["isdecimal"] = """ + See Also + -------- + Series.str.isalpha : Check whether all characters are alphabetic. + Series.str.isnumeric : Check whether all characters are numeric. + Series.str.isalnum : Check whether all characters are alphanumeric. + Series.str.isdigit : Check whether all characters are digits. + Series.str.isspace : Check whether all characters are whitespace. + Series.str.islower : Check whether all characters are lowercase. + Series.str.isupper : Check whether all characters are uppercase. + Series.str.istitle : Check whether all characters are titlecase. 
- **More Detailed Checks for Numeric Characters** - - There are several different but overlapping sets of numeric characters that - can be checked for. + Examples + -------- + The ``s3.str.isdecimal`` method checks for characters used to form + numbers in base 10. >>> s3 = pd.Series(['23', '³', '⅕', '']) - - The ``s3.str.isdecimal`` method checks for characters used to form numbers - in base 10. - >>> s3.str.isdecimal() 0 True 1 False 2 False 3 False dtype: bool + """ + _shared_docs["isdigit"] = """ + See Also + -------- + Series.str.isalpha : Check whether all characters are alphabetic. + Series.str.isnumeric : Check whether all characters are numeric. + Series.str.isalnum : Check whether all characters are alphanumeric. + Series.str.isdecimal : Check whether all characters are decimal. + Series.str.isspace : Check whether all characters are whitespace. + Series.str.islower : Check whether all characters are lowercase. + Series.str.isupper : Check whether all characters are uppercase. + Series.str.istitle : Check whether all characters are titlecase. - The ``s.str.isdigit`` method is the same as ``s3.str.isdecimal`` but also - includes special digits, like superscripted and subscripted digits in - unicode. + Examples + -------- + Similar to ``str.isdecimal`` but also includes special digits, like + superscripted and subscripted digits in unicode. + >>> s3 = pd.Series(['23', '³', '⅕', '']) >>> s3.str.isdigit() 0 True 1 True 2 False 3 False dtype: bool + """ - The ``s.str.isnumeric`` method is the same as ``s3.str.isdigit`` but also - includes other characters that can represent quantities such as unicode - fractions. - - >>> s3.str.isnumeric() - 0 True - 1 True - 2 True - 3 False - dtype: bool + _shared_docs["isspace"] = """ + See Also + -------- + Series.str.isalpha : Check whether all characters are alphabetic. + Series.str.isnumeric : Check whether all characters are numeric. + Series.str.isalnum : Check whether all characters are alphanumeric. + Series.str.isdigit : Check whether all characters are digits. + Series.str.isdecimal : Check whether all characters are decimal. + Series.str.islower : Check whether all characters are lowercase. + Series.str.isupper : Check whether all characters are uppercase. + Series.str.istitle : Check whether all characters are titlecase. - **Checks for Whitespace** + Examples + -------- >>> s4 = pd.Series([' ', '\\t\\r\\n ', '']) >>> s4.str.isspace() @@ -3540,30 +3597,74 @@ def casefold(self): 1 True 2 False dtype: bool + """ + _shared_docs["islower"] = """ + See Also + -------- + Series.str.isalpha : Check whether all characters are alphabetic. + Series.str.isnumeric : Check whether all characters are numeric. + Series.str.isalnum : Check whether all characters are alphanumeric. + Series.str.isdigit : Check whether all characters are digits. + Series.str.isdecimal : Check whether all characters are decimal. + Series.str.isspace : Check whether all characters are whitespace. + Series.str.isupper : Check whether all characters are uppercase. + Series.str.istitle : Check whether all characters are titlecase. - **Checks for Character Case** + Examples + -------- >>> s5 = pd.Series(['leopard', 'Golden Eagle', 'SNAKE', '']) - >>> s5.str.islower() 0 True 1 False 2 False 3 False dtype: bool + """ + + _shared_docs["isupper"] = """ + See Also + -------- + Series.str.isalpha : Check whether all characters are alphabetic. + Series.str.isnumeric : Check whether all characters are numeric. + Series.str.isalnum : Check whether all characters are alphanumeric. 
+ Series.str.isdigit : Check whether all characters are digits. + Series.str.isdecimal : Check whether all characters are decimal. + Series.str.isspace : Check whether all characters are whitespace. + Series.str.islower : Check whether all characters are lowercase. + Series.str.istitle : Check whether all characters are titlecase. + Examples + -------- + + >>> s5 = pd.Series(['leopard', 'Golden Eagle', 'SNAKE', '']) >>> s5.str.isupper() 0 False 1 False 2 True 3 False dtype: bool + """ + _shared_docs["istitle"] = """ + See Also + -------- + Series.str.isalpha : Check whether all characters are alphabetic. + Series.str.isnumeric : Check whether all characters are numeric. + Series.str.isalnum : Check whether all characters are alphanumeric. + Series.str.isdigit : Check whether all characters are digits. + Series.str.isdecimal : Check whether all characters are decimal. + Series.str.isspace : Check whether all characters are whitespace. + Series.str.islower : Check whether all characters are lowercase. + Series.str.isupper : Check whether all characters are uppercase. + Examples + ------------ The ``s5.str.istitle`` method checks for whether all words are in title case (whether only the first letter of each word is capitalized). Words are assumed to be as any sequence of non-numeric characters separated by whitespace characters. + >>> s5 = pd.Series(['leopard', 'Golden Eagle', 'SNAKE', '']) >>> s5.str.istitle() 0 False 1 True @@ -3583,31 +3684,49 @@ def casefold(self): # force _noarg_wrapper return type with dtype=np.dtype(bool) (GH 29624) isalnum = _map_and_wrap( - "isalnum", docstring=_shared_docs["ismethods"] % _doc_args["isalnum"] + "isalnum", + docstring=_shared_docs["ismethods"] % _doc_args["isalnum"] + + _shared_docs["isalnum"], ) isalpha = _map_and_wrap( - "isalpha", docstring=_shared_docs["ismethods"] % _doc_args["isalpha"] + "isalpha", + docstring=_shared_docs["ismethods"] % _doc_args["isalpha"] + + _shared_docs["isalpha"], ) isdigit = _map_and_wrap( - "isdigit", docstring=_shared_docs["ismethods"] % _doc_args["isdigit"] + "isdigit", + docstring=_shared_docs["ismethods"] % _doc_args["isdigit"] + + _shared_docs["isdigit"], ) isspace = _map_and_wrap( - "isspace", docstring=_shared_docs["ismethods"] % _doc_args["isspace"] + "isspace", + docstring=_shared_docs["ismethods"] % _doc_args["isspace"] + + _shared_docs["isspace"], ) islower = _map_and_wrap( - "islower", docstring=_shared_docs["ismethods"] % _doc_args["islower"] + "islower", + docstring=_shared_docs["ismethods"] % _doc_args["islower"] + + _shared_docs["islower"], ) isupper = _map_and_wrap( - "isupper", docstring=_shared_docs["ismethods"] % _doc_args["isupper"] + "isupper", + docstring=_shared_docs["ismethods"] % _doc_args["isupper"] + + _shared_docs["isupper"], ) istitle = _map_and_wrap( - "istitle", docstring=_shared_docs["ismethods"] % _doc_args["istitle"] + "istitle", + docstring=_shared_docs["ismethods"] % _doc_args["istitle"] + + _shared_docs["istitle"], ) isnumeric = _map_and_wrap( - "isnumeric", docstring=_shared_docs["ismethods"] % _doc_args["isnumeric"] + "isnumeric", + docstring=_shared_docs["ismethods"] % _doc_args["isnumeric"] + + _shared_docs["isnumeric"], ) isdecimal = _map_and_wrap( - "isdecimal", docstring=_shared_docs["ismethods"] % _doc_args["isdecimal"] + "isdecimal", + docstring=_shared_docs["ismethods"] % _doc_args["isdecimal"] + + _shared_docs["isdecimal"], )
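
The subset relationship these reorganized docstrings illustrate — every decimal
character is a digit, every digit is numeric, but not conversely — can be checked
directly; the expected values below match the examples carried in the patch:

>>> import pandas as pd
>>> s = pd.Series(["23", "³", "⅕"])
>>> s.str.isdecimal().tolist()
[True, False, False]
>>> s.str.isdigit().tolist()
[True, True, False]
>>> s.str.isnumeric().tolist()
[True, True, True]

Splitting the per-method ``See Also``/``Examples`` tails out of the shared
``ismethods`` template and concatenating them inside ``_map_and_wrap`` keeps the
``%``-substituted summary common to all nine methods while letting each rendered
docstring carry only its own cross-references and examples.
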